Natural language analysis of Mexican president’s morning lectures with SQLite and spaCy

Martin Vega
MCD-UNISON
Published in
8 min read · Dec 1, 2022
Most mentioned words in the Mexican morning conferences

In the following text we will explain how to make a wordcloud of the most used words in the morning conferences of President Lopez Obrador.

Wordclouds are a useful tool for visualizing and analyzing text data. They allow you to quickly and easily identify the most frequently used words in a body of text, providing a visual representation of the words and their relative importance. This can be useful for identifying patterns and trends in the text, as well as for generating ideas for further analysis.

Additionally, wordclouds can be a useful tool for communicating the results of text analysis to others. They provide a visually appealing and easy-to-understand representation of the data, which can be useful for presentations or reports.

Overall, wordclouds are a useful tool for text analysis and visualization, and can provide valuable insights into a body of text.

First we create the function to save the data in SQLite.

# Create the database file used throughout the article.
def createDB():
    """Create (or simply open and close) the conferences.db SQLite file."""
    connection = sql.connect("conferences.db")
    connection.commit()
    connection.close()


# Only create the database file when it is not already present.
if not os.path.exists('./conferences.db'):
    createDB()
# Delete a table from the conferences database.
def dropTable(tabla):
    """Drop table *tabla* from conferences.db if it exists.

    A table name is an identifier and cannot be bound as a SQL parameter,
    so instead of interpolating it raw (the original used str.format,
    which is an SQL-injection vector) the name is validated and quoted.

    Raises:
        ValueError: if *tabla* is not a plain alphanumeric/underscore name.
    """
    # Reject anything that is not a simple identifier.
    if not tabla or not tabla.replace('_', '').isalnum():
        raise ValueError(f"invalid table name: {tabla!r}")

    # Connecting to sqlite
    conn = sql.connect('conferences.db')
    cursor = conn.cursor()

    # Dropping the table if it already exists (identifier double-quoted).
    cursor.execute('DROP TABLE IF EXISTS "{}"'.format(tabla))
    print("Table dropped... ")

    # Commit the change and close the connection.
    conn.commit()
    conn.close()
# Create Tables
def createTableParticipants():
    """Create the participants table.

    Uses IF NOT EXISTS so re-running the script does not raise
    "table participants already exists" (the original crashed on rerun).
    """
    conn = sql.connect("conferences.db")
    cursor = conn.cursor()
    cursor.execute("""CREATE TABLE IF NOT EXISTS participants(
    IDPerson INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
    name TEXT NOT NULL,
    unique (name)
    )""")
    conn.commit()
    conn.close()

def createTableDialogs():
    """Create the dialogs table (one row per spoken dialog).

    Uses IF NOT EXISTS so re-running the script does not raise
    "table dialogs already exists". The IDPerson column references the
    participants table; SQLite does not validate the target at CREATE
    time, so creation order does not matter.
    """
    conn = sql.connect("conferences.db")
    cursor = conn.cursor()
    cursor.execute("""CREATE TABLE IF NOT EXISTS dialogs (
    IDDialog INTEGER PRIMARY KEY AUTOINCREMENT,
    Dialog TEXT NOT NULL,
    IDPerson INTEGER NOT NULL,
    Date datetime,
    FOREIGN KEY (IDPerson)
    REFERENCES participants(IDPerson)
    )""")
    conn.commit()
    conn.close()
# Create the two tables. Note: dialogs is created before participants;
# SQLite does not validate foreign-key targets at CREATE time, so the
# out-of-order creation still works.
createTableDialogs()
createTableParticipants()
# Insert Data
def insertRowPersonas(name):
    """Insert one participant row into the participants table.

    Uses a parameterized query instead of an f-string: the original
    interpolated *name* directly into the SQL, which broke on any name
    containing a quote and was an SQL-injection vector.
    """
    conn = sql.connect("conferences.db")
    cursor = conn.cursor()
    cursor.execute("INSERT INTO participants(name) VALUES(?)", (name,))
    conn.commit()
    conn.close()

def insertRowDialog(dialog, IDParticipant, date):
    """Insert one dialog row (text, speaker id, date) into dialogs.

    Uses a parameterized query instead of an f-string: dialog text
    routinely contains quotes, which broke the original interpolated
    SQL and made it an injection vector.
    """
    conn = sql.connect("conferences.db")
    cursor = conn.cursor()
    cursor.execute(
        "INSERT INTO dialogs(Dialog,IDPerson,Date) VALUES(?,?,?)",
        (dialog, IDParticipant, date),
    )
    conn.commit()
    conn.close()
# This function creates a view called conference_view that joins the
# participants and dialogs tables on IDPerson.
def create_conference_view():
    """Create the conference_view view (no-op if it already exists).

    IF NOT EXISTS makes the function idempotent: the original plain
    CREATE VIEW raised an OperationalError when called a second time,
    which the script below actually does.
    """
    conn = sql.connect("conferences.db")
    cursor = conn.cursor()

    cursor.execute("""CREATE VIEW IF NOT EXISTS conference_view AS
    SELECT participants.IDPerson, participants.name, dialogs.Dialog, dialogs.Date
    FROM participants
    INNER JOIN dialogs
    ON participants.IDPerson = dialogs.IDPerson""")

    conn.commit()
    conn.close()
# Create both conference tables in one call.
# NOTE: SQLite has no CREATE PROCEDURE statement at all -- the original
# attempt to define a stored procedure raised sqlite3.OperationalError.
# A plain Python function running a script of statements is the
# SQLite-idiomatic equivalent.
def createConferencesTables():
    """Create the participants and dialogs tables (no-op if they exist)."""
    conn = sql.connect("conferences.db")
    # executescript runs several statements in one call.
    conn.executescript("""
    CREATE TABLE IF NOT EXISTS participants(
    IDPerson INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
    name TEXT NOT NULL,
    unique (name)
    );
    CREATE TABLE IF NOT EXISTS dialogs (
    IDDialog INTEGER PRIMARY KEY AUTOINCREMENT,
    Dialog TEXT NOT NULL,
    IDPerson INTEGER NOT NULL,
    Date datetime,
    FOREIGN KEY (IDPerson)
    REFERENCES participants(IDPerson)
    );
    """)
    conn.commit()
    conn.close()

(Note that SQLite does not support stored procedures at all — the CREATE PROCEDURE statement above is not valid SQLite and will raise an error. To run a series of statements, wrap them in a plain Python function and execute the statements directly instead.)

Second we download the data, these are stored in this link:

That repository contains the transcripts of the stenographic versions of President Andres Manuel López Obrador’s morning conferences since the beginning of the conferences in December 2018 organized in CSVs.

# For downloading Data
import zipfile
import os
import urllib.request

# Url of President Andres Manuel Lopez Obrador's Morning Conferences ( Mañaneras AMLO )
# Link to zip: https://github.com/NOSTRODATA/conferencias_matutinas_amlo
url = 'https://github.com/NOSTRODATA/conferencias_matutinas_amlo/archive/refs/heads/master.zip'
zip_name = 'conferencias_matutinas_amlo-master.zip'
extract_dir = './conferencias_matutinas_amlo-master'

# Guard on the EXTRACTED folder, not the zip: the zip is deleted below,
# so checking for the zip (as the original did) re-downloaded the whole
# archive on every run. urllib.request is stdlib, so the third-party
# `wget` dependency is no longer needed.
if not os.path.exists(extract_dir):
    urllib.request.urlretrieve(url, zip_name)
    # Unzip the archive next to the script.
    with zipfile.ZipFile(zip_name, 'r') as zip_ref:
        zip_ref.extractall("./")
    # Delete the zip file to keep only the extracted folders.
    os.remove(zip_name)

We use python to obtain a dataframe of all the words said by each person speaking at the conferences.

from pathlib import Path

# Collect every per-conference CSV under the extracted repository.
csv_paths = Path("./conferencias_matutinas_amlo-master/").glob('**/mananera*.csv')

# Load each CSV and stack them all into a single dataframe.
frames = [pd.read_csv(csv_path, index_col=False) for csv_path in csv_paths]
df = pd.concat(frames, ignore_index=True)

# Build a real datetime column from the "Anio"/"Mes"/"Dia" parts
# (joined as "YYYY-M-D" strings, then parsed).
date_parts = ["Anio", "Mes", "Dia"]
df['date'] = pd.to_datetime(
    df[date_parts].apply(lambda parts: '-'.join(parts.values.astype(str)), axis="columns")
)

This is how our dataframe is displayed:

Dataframe of all the words said by each person

Data preprocessing and insertion for the table “participants”

We need to enter data into the table “participants”. For that we take the column “Participante” from our dataframe, and we make a new dataframe with each participant of the “Mañaneras”.

# One row per distinct participant, ordered by how often they spoke.
# The original value_counts().to_frame().reset_index() chain relied on
# pandas < 2.0 column naming (the counts column was named after the
# source column); under pandas >= 2.0 it kept the counts instead of the
# names. rename_axis/reset_index(name=...) works on both versions.
participants = (
    df["Participante"]
    .value_counts()
    .rename_axis("Persons")
    .reset_index(name="count")[["Persons"]]
)

Insert data to table “participants”

We enter all the people who have spoken at the conferences into the database.

# Store every distinct speaker in the participants table.
for _, person_row in participants.iterrows():
    insertRowPersonas(person_row['Persons'])

We read the “participants” table back into a new DataFrame.

# Read the participants table back into a DataFrame (now with the
# auto-generated IDPerson for each name).
cnx = sql.connect('conferences.db')
persons = pd.read_sql_query("SELECT * FROM participants", cnx)
cnx.close()

Then we merge this dataframe with our original one to obtain the ID.

# Attach the numeric IDPerson to every dialog row by left-joining the
# persons table onto the original dataframe via a shared "grouper"
# column (person name on one side, participant name on the other).
persons_keyed = persons.assign(grouper=persons['name'])
dialogs_keyed = df.assign(grouper=df['Participante'])
res = pd.merge(persons_keyed, dialogs_keyed, how='left', on='grouper')
# The helper columns are no longer needed after the join.
res = res.drop(columns=["name", "grouper"])
res.head()

We upload this data to the Database.

# Store every dialog (text, speaker id, date) in the dialogs table.
for _, dialog_row in res.iterrows():
    insertRowDialog(dialog_row['Texto'], dialog_row['IDPerson'], dialog_row['date'])

At the end we create the conference_view to see the data of both tables in one:

# Create the combined view once. The original called the function twice;
# with a plain CREATE VIEW the second call raises
# "view conference_view already exists".
create_conference_view()

# Read the joined participants+dialogs data back into a DataFrame.
cnx = sql.connect('conferences.db')
dfDialogs = pd.read_sql_query("SELECT * FROM conference_view", cnx)
cnx.close()

The next step is to normalize our text. We use the following function to normalize it:

def df_normalize(df_text_rows, column):
    """Normalize the text in *column* of *df_text_rows*, in place.

    Pipeline: strip accents (keeping the tilde on "ñ"), lowercase,
    replace punctuation/newlines with spaces, collapse written laughter,
    and drop Spanish stopwords. The cleaned text is stored in a new
    'FULL_TEXT_NORM' column; the mutated dataframe is also returned.

    Note: the original ended with display(df_text_rows), which only
    exists inside IPython/Jupyter and raises NameError in a plain
    script; returning the dataframe replaces it.
    """
    # Regex that removes the combining accent marks produced by NFD
    # decomposition, except the tilde over "n" (so "ñ" survives).
    accents_re = re.compile(
        r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
        re.I,
    )
    # Symbols and newlines that are replaced with spaces.
    replace_by_spaces_re = re.compile('[\n/(){}\[\]\|@,;\.]')
    # Stopwords for the Spanish language.
    words_empty = set(nltk.corpus.stopwords.words('spanish'))

    clean_text = []
    for text in df_text_rows[column]:
        # Remove accents: decompose (NFD), strip the marks, recompose (NFC).
        text = normalize('NFC', accents_re.sub(r"\1", normalize("NFD", text)))

        # Change text to lowercase.
        text = text.lower()

        # Remove symbols.
        text = replace_by_spaces_re.sub(' ', text)

        # Collapse written laughter ("jajaja", "jejeje", ...).
        # NOTE(review): these patterns look odd (e.g. [ja]+aj\w requires a
        # trailing word character) -- kept exactly as in the original;
        # confirm the intended matches.
        text = re.sub(r'[ja]+aj\w', r'JA', text)
        text = re.sub(r'[je]+ej\w', r'JE', text)
        text = re.sub(r'[ji]+ij\w', r'JI', text)
        text = re.sub(r'[ha]+ah\w', r'HA', text)
        text = re.sub(r'[he]+eh\w', r'HE', text)

        # Eliminate stop words.
        text = ' '.join(word for word in text.split()
                        if word not in words_empty)

        # Keep the processed text.
        clean_text.append(text)

    # The clean texts are added in a new column.
    df_text_rows['FULL_TEXT_NORM'] = clean_text

    return df_text_rows

# Normalize the dialog text; adds the FULL_TEXT_NORM column to dfDialogs.
df_normalize(dfDialogs,'Dialog')

The new column with the standardized text is shown here:

We take the column “FULL_TEXT_NORM” and generate an array with all the words, from there we use the library “WordCloud” and give it as input our array.

from wordcloud import WordCloud, ImageColorGenerator

# Mask image that shapes the wordcloud.
imgMask = np.array(Image.open('amlo.jpg'))

# Generate one long string with every normalized word.
allWords = ' '.join(dialog for dialog in dfDialogs['FULL_TEXT_NORM'])

# Build the cloud on a black background. Use a distinct variable name so
# the WordCloud class itself is not shadowed (the original rebound the
# name WordCloud, making further instantiation impossible).
words_cloud = WordCloud(width=1920, height=1080, background_color="black",
                        random_state=42, max_font_size=200, mask=imgMask,
                        max_words=473).generate(allWords)
image_colors = ImageColorGenerator(imgMask)

# Render, save, then show. savefig must come BEFORE show(): show() closes
# the figure, so the original saved a blank image.
plt.figure(figsize=(16, 9), dpi=120)
plt.imshow(words_cloud.recolor(color_func=image_colors), interpolation='bilinear')
plt.title('WordCloud of the most mentioned words in the Mexican morning conferencess', fontsize=14)
plt.axis('off')
plt.savefig('wordcloudAmloManianera.png')
plt.show()

Final image with the most mentioned words in the morning lectures:

Most mentioned words in the Mexican morning conferences

Now the wordcloud of the most spoken words by Andres Manuel Lopez Obrador:

Most mentioned words in the Mexican morning conferences

Using spacy to generate a wordcloud of the adjectives, verbs, nouns of the 2018 dialog of AMLO.

Creating a word cloud using only nouns can provide a more focused view of the most important objects, people, places, or concepts mentioned in a piece of text. This can be useful for quickly identifying key themes or ideas in the text. Additionally, using only nouns can make the word cloud more visually appealing, as the resulting cloud will typically have a more balanced and natural-looking distribution of words. This can make it easier to understand and interpret the word cloud, as the words will be more evenly distributed and will not be crowded together in a cluttered-looking cloud.

Making use of adjectives, a word cloud can help to highlight the descriptive words used in a text, which can give insight into the tone or sentiment of the writing. Additionally, using only adjectives can make the word cloud more aesthetically pleasing and visually striking, as adjectives are often more colorful and descriptive than other types of words.

By using only verbs, the word cloud can also provide a more dynamic and active view of the text, as it will highlight the actions and events that are central to the story or message being conveyed.

# Creating a new DF with only what the president said during 2018.
# Bug fixes: the mask must come from dfDialogs (df has no 'name' column,
# so the original raised KeyError / misaligned the mask), and the
# conference_view exposes 'Date', not 'Anio', so the year is taken from
# the parsed Date column.
amlo_df = dfDialogs.loc[dfDialogs['name'] == "PRESIDENTE ANDRES MANUEL LOPEZ OBRADOR"]
amlo_df = amlo_df.loc[pd.to_datetime(amlo_df['Date']).dt.year == 2018]

nlp = spacy.load('es_core_news_md')
# Raise the limit so the whole year fits in a single spaCy document.
nlp.max_length = 17840260
# Generate one string with every normalized word the president said.
allWords = ' '.join(amlo_df.FULL_TEXT_NORM.values)
doc = nlp(allWords)

Creating arrays of the adjectives, verbs and nouns.

# Adjectives: alphabetic, non-numeric, non-stopword, non-currency tokens
# tagged ADJ.
adjectives_words = ' '.join(
    token.norm_ for token in doc
    if token.is_alpha and not token.like_num and not token.is_stop
    and not token.is_currency and token.pos_ == 'ADJ'
)

# Verbs: same filters as the other two lists. The original filtered verbs
# only on pos_, inconsistently letting numbers/stopwords/currency tokens
# through.
verbs_words = ' '.join(
    token.norm_ for token in doc
    if token.is_alpha and not token.like_num and not token.is_stop
    and not token.is_currency and token.pos_ == 'VERB'
)

# Nouns: same filters, tokens tagged NOUN.
nouns_words = ' '.join(
    token.norm_ for token in doc
    if token.is_alpha and not token.like_num and not token.is_stop
    and not token.is_currency and token.pos_ == 'NOUN'
)
from wordcloud import WordCloud, ImageColorGenerator

# Mask image that shapes the wordcloud.
imgMask = np.array(Image.open('amlo.jpg'))

# Build the adjectives cloud. Distinct variable name so the WordCloud
# class is not shadowed.
adjectives_cloud = WordCloud(width=1920, height=1080, background_color="black",
                             random_state=42, max_font_size=200, mask=imgMask,
                             max_words=473).generate(adjectives_words)
image_colors = ImageColorGenerator(imgMask)

# Render and save. Each cloud gets its own file name: the original saved
# all three POS clouds to 'wordcloudAmloManianera.png', each overwriting
# the previous one.
plt.figure(figsize=(16, 9), dpi=120)
plt.imshow(adjectives_cloud.recolor(color_func=image_colors), interpolation='bilinear')
plt.title('WordCloud of the most mentioned adjectives in the Mexican morning conferences by the president of Mexico in 2018', fontsize=14)
plt.axis('off')
plt.savefig('wordcloudAmloAdjectives.png')
plt.show()
from wordcloud import WordCloud, ImageColorGenerator

# Mask image that shapes the wordcloud.
imgMask = np.array(Image.open('amlo.jpg'))

# Build the nouns cloud. Distinct variable name so the WordCloud class
# is not shadowed.
nouns_cloud = WordCloud(width=1920, height=1080, background_color="black",
                        random_state=42, max_font_size=200, mask=imgMask,
                        max_words=473).generate(nouns_words)
image_colors = ImageColorGenerator(imgMask)

# Render and save to a noun-specific file: the original saved all three
# POS clouds to 'wordcloudAmloManianera.png', overwriting each other.
plt.figure(figsize=(16, 9), dpi=120)
plt.imshow(nouns_cloud.recolor(color_func=image_colors), interpolation='bilinear')
plt.title('WordCloud of the most mentioned nouns in the Mexican morning conferences by the president of Mexico in 2018', fontsize=14)
plt.axis('off')
plt.savefig('wordcloudAmloNouns.png')
plt.show()
from wordcloud import WordCloud, ImageColorGenerator

# Mask image that shapes the wordcloud.
imgMask = np.array(Image.open('amlo.jpg'))

# Build the verbs cloud. Distinct variable name so the WordCloud class
# is not shadowed.
verbs_cloud = WordCloud(width=1920, height=1080, background_color="black",
                        random_state=42, max_font_size=200, mask=imgMask,
                        max_words=473).generate(verbs_words)
image_colors = ImageColorGenerator(imgMask)

# Render and save to a verb-specific file: the original saved all three
# POS clouds to 'wordcloudAmloManianera.png', overwriting each other.
plt.figure(figsize=(16, 9), dpi=120)
plt.imshow(verbs_cloud.recolor(color_func=image_colors), interpolation='bilinear')
plt.title('WordCloud of the most mentioned verbs in the Mexican morning conferences by the president of Mexico in 2018', fontsize=14)
plt.axis('off')
plt.savefig('wordcloudAmloVerbs.png')
plt.show()

--

--