Natural language analysis of Mexican president’s morning lectures with SQLite and spaCy

Martin Vega
MCD-UNISON
Published in
8 min read · Dec 1, 2022
Most mentioned words in the Mexican morning conferences

In the following text we will explain how to make a wordcloud of the most used words in the morning conferences of President Lopez Obrador.

Wordclouds are a useful tool for visualizing and analyzing text data. They allow you to quickly and easily identify the most frequently used words in a body of text, providing a visual representation of the words and their relative importance. This can be useful for identifying patterns and trends in the text, as well as for generating ideas for further analysis.

Additionally, wordclouds can be a useful tool for communicating the results of text analysis to others. They provide a visually appealing and easy-to-understand representation of the data, which can be useful for presentations or reports.

Overall, wordclouds are a useful tool for text analysis and visualization, and can provide valuable insights into a body of text.

First we create the function to save the data in SQLite.

# Create the database file used throughout the article.
def createDB():
    """Create (or simply open and close) the conferences.db SQLite file."""
    connection = sql.connect("conferences.db")
    connection.commit()
    connection.close()


# Only create the database file when it is not already present.
if not os.path.exists('./conferences.db'):
    createDB()
# Delete a table from the conferences database.
def dropTable(tabla):
    """Drop table *tabla* from conferences.db if it exists.

    A table name is an identifier and cannot be bound as a SQL parameter,
    so instead of interpolating it raw (the original used str.format,
    which is an SQL-injection vector) the name is validated and quoted.

    Raises:
        ValueError: if *tabla* is not a plain alphanumeric/underscore name.
    """
    # Reject anything that is not a simple identifier.
    if not tabla or not tabla.replace('_', '').isalnum():
        raise ValueError(f"invalid table name: {tabla!r}")

    # Connecting to sqlite
    conn = sql.connect('conferences.db')
    cursor = conn.cursor()

    # Dropping the table if it already exists (identifier double-quoted).
    cursor.execute('DROP TABLE IF EXISTS "{}"'.format(tabla))
    print("Table dropped... ")

    # Commit the change and close the connection.
    conn.commit()
    conn.close()
# Create Tables
def createTableParticipants():
    """Create the participants table.

    Uses IF NOT EXISTS so re-running the script does not raise
    "table participants already exists" (the original crashed on rerun).
    """
    conn = sql.connect("conferences.db")
    cursor = conn.cursor()
    cursor.execute("""CREATE TABLE IF NOT EXISTS participants(
    IDPerson INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
    name TEXT NOT NULL,
    unique (name)
    )""")
    conn.commit()
    conn.close()

def createTableDialogs():
    """Create the dialogs table (one row per spoken dialog).

    Uses IF NOT EXISTS so re-running the script does not raise
    "table dialogs already exists". The IDPerson column references the
    participants table; SQLite does not validate the target at CREATE
    time, so creation order does not matter.
    """
    conn = sql.connect("conferences.db")
    cursor = conn.cursor()
    cursor.execute("""CREATE TABLE IF NOT EXISTS dialogs (
    IDDialog INTEGER PRIMARY KEY AUTOINCREMENT,
    Dialog TEXT NOT NULL,
    IDPerson INTEGER NOT NULL,
    Date datetime,
    FOREIGN KEY (IDPerson)
    REFERENCES participants(IDPerson)
    )""")
    conn.commit()
    conn.close()
# Create the two tables. Note: dialogs is created before participants;
# SQLite does not validate foreign-key targets at CREATE time, so the
# out-of-order creation still works.
createTableDialogs()
createTableParticipants()
# Insert Data
def insertRowPersonas(name):
    """Insert one participant row into the participants table.

    Uses a parameterized query instead of an f-string: the original
    interpolated *name* directly into the SQL, which broke on any name
    containing a quote and was an SQL-injection vector.
    """
    conn = sql.connect("conferences.db")
    cursor = conn.cursor()
    cursor.execute("INSERT INTO participants(name) VALUES(?)", (name,))
    conn.commit()
    conn.close()

def insertRowDialog(dialog, IDParticipant, date):
    """Insert one dialog row (text, speaker id, date) into dialogs.

    Uses a parameterized query instead of an f-string: dialog text
    routinely contains quotes, which broke the original interpolated
    SQL and made it an injection vector.
    """
    conn = sql.connect("conferences.db")
    cursor = conn.cursor()
    cursor.execute(
        "INSERT INTO dialogs(Dialog,IDPerson,Date) VALUES(?,?,?)",
        (dialog, IDParticipant, date),
    )
    conn.commit()
    conn.close()
# This function creates a view called conference_view that joins the
# participants and dialogs tables on IDPerson.
def create_conference_view():
    """Create the conference_view view (no-op if it already exists).

    IF NOT EXISTS makes the function idempotent: the original plain
    CREATE VIEW raised an OperationalError when called a second time,
    which the script below actually does.
    """
    conn = sql.connect("conferences.db")
    cursor = conn.cursor()

    cursor.execute("""CREATE VIEW IF NOT EXISTS conference_view AS
    SELECT participants.IDPerson, participants.name, dialogs.Dialog, dialogs.Date
    FROM participants
    INNER JOIN dialogs
    ON participants.IDPerson = dialogs.IDPerson""")

    conn.commit()
    conn.close()
# Create both conference tables in one call.
# NOTE: SQLite has no CREATE PROCEDURE statement at all -- the original
# attempt to define a stored procedure raised sqlite3.OperationalError.
# A plain Python function running a script of statements is the
# SQLite-idiomatic equivalent.
def createConferencesTables():
    """Create the participants and dialogs tables (no-op if they exist)."""
    conn = sql.connect("conferences.db")
    # executescript runs several statements in one call.
    conn.executescript("""
    CREATE TABLE IF NOT EXISTS participants(
    IDPerson INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
    name TEXT NOT NULL,
    unique (name)
    );
    CREATE TABLE IF NOT EXISTS dialogs (
    IDDialog INTEGER PRIMARY KEY AUTOINCREMENT,
    Dialog TEXT NOT NULL,
    IDPerson INTEGER NOT NULL,
    Date datetime,
    FOREIGN KEY (IDPerson)
    REFERENCES participants(IDPerson)
    );
    """)
    conn.commit()
    conn.close()

(Note that SQLite does not support stored procedures at all — the CREATE PROCEDURE statement above is not valid SQLite and will raise an error. To run a series of statements, wrap them in a plain Python function and execute the statements directly instead.)

Second we download the data, these are stored in this link:

That repository contains the transcripts of the stenographic versions of President Andres Manuel López Obrador’s morning conferences since the beginning of the conferences in December 2018 organized in CSVs.

# For downloading Data
import zipfile
import os
import urllib.request

# Url of President Andres Manuel Lopez Obrador's Morning Conferences ( Mañaneras AMLO )
# Link to zip: https://github.com/NOSTRODATA/conferencias_matutinas_amlo
url = 'https://github.com/NOSTRODATA/conferencias_matutinas_amlo/archive/refs/heads/master.zip'
zip_name = 'conferencias_matutinas_amlo-master.zip'
extract_dir = './conferencias_matutinas_amlo-master'

# Guard on the EXTRACTED folder, not the zip: the zip is deleted below,
# so checking for the zip (as the original did) re-downloaded the whole
# archive on every run. urllib.request is stdlib, so the third-party
# `wget` dependency is no longer needed.
if not os.path.exists(extract_dir):
    urllib.request.urlretrieve(url, zip_name)
    # Unzip the archive next to the script.
    with zipfile.ZipFile(zip_name, 'r') as zip_ref:
        zip_ref.extractall("./")
    # Delete the zip file to keep only the extracted folders.
    os.remove(zip_name)

We use python to obtain a dataframe of all the words said by each person speaking at the conferences.

from pathlib import Path

# Collect every per-conference CSV under the extracted repository.
csv_paths = Path("./conferencias_matutinas_amlo-master/").glob('**/mananera*.csv')

# Load each CSV and stack them all into a single dataframe.
frames = [pd.read_csv(csv_path, index_col=False) for csv_path in csv_paths]
df = pd.concat(frames, ignore_index=True)

# Build a real datetime column from the "Anio"/"Mes"/"Dia" parts
# (joined as "YYYY-M-D" strings, then parsed).
date_parts = ["Anio", "Mes", "Dia"]
df['date'] = pd.to_datetime(
    df[date_parts].apply(lambda parts: '-'.join(parts.values.astype(str)), axis="columns")
)

This is how our dataframe is displayed:

Dataframe of all the words said by each person

Data preprocessing and insertion for the table “participants”

We need to enter data into the table “participants”. For that we take the column “Participante” from our dataframe, and we make a new dataframe with each participant of the “Mañaneras”.

# One row per distinct participant, ordered by how often they spoke.
# The original value_counts().to_frame().reset_index() chain relied on
# pandas < 2.0 column naming (the counts column was named after the
# source column); under pandas >= 2.0 it kept the counts instead of the
# names. rename_axis/reset_index(name=...) works on both versions.
participants = (
    df["Participante"]
    .value_counts()
    .rename_axis("Persons")
    .reset_index(name="count")[["Persons"]]
)

Insert data to table “participants”

We enter all the people who have spoken at the conferences into the database.

# Store every distinct speaker in the participants table.
for _, person_row in participants.iterrows():
    insertRowPersonas(person_row['Persons'])

We read the “participants” table back into a new DataFrame.

# Read the participants table back into a DataFrame (now with the
# auto-generated IDPerson for each name).
cnx = sql.connect('conferences.db')
persons = pd.read_sql_query("SELECT * FROM participants", cnx)
cnx.close()

Then we merge this dataframe with our original one to obtain the ID.

# Attach the numeric IDPerson to every dialog row by left-joining the
# persons table onto the original dataframe via a shared "grouper"
# column (person name on one side, participant name on the other).
persons_keyed = persons.assign(grouper=persons['name'])
dialogs_keyed = df.assign(grouper=df['Participante'])
res = pd.merge(persons_keyed, dialogs_keyed, how='left', on='grouper')
# The helper columns are no longer needed after the join.
res = res.drop(columns=["name", "grouper"])
res.head()

We upload this data to the Database.

# Store every dialog (text, speaker id, date) in the dialogs table.
for _, dialog_row in res.iterrows():
    insertRowDialog(dialog_row['Texto'], dialog_row['IDPerson'], dialog_row['date'])

At the end we create the conference_view to see the data of both tables in one:

# Create the combined view once. The original called the function twice;
# with a plain CREATE VIEW the second call raises
# "view conference_view already exists".
create_conference_view()

# Read the joined participants+dialogs data back into a DataFrame.
cnx = sql.connect('conferences.db')
dfDialogs = pd.read_sql_query("SELECT * FROM conference_view", cnx)
cnx.close()

The next step is to normalize our text. We use the following function to normalize it:

def df_normalize(df_text_rows, column):
    """Normalize the text in *column* of *df_text_rows*, in place.

    Pipeline: strip accents (keeping the tilde on "ñ"), lowercase,
    replace punctuation/newlines with spaces, collapse written laughter,
    and drop Spanish stopwords. The cleaned text is stored in a new
    'FULL_TEXT_NORM' column; the mutated dataframe is also returned.

    Note: the original ended with display(df_text_rows), which only
    exists inside IPython/Jupyter and raises NameError in a plain
    script; returning the dataframe replaces it.
    """
    # Regex that removes the combining accent marks produced by NFD
    # decomposition, except the tilde over "n" (so "ñ" survives).
    accents_re = re.compile(
        r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
        re.I,
    )
    # Symbols and newlines that are replaced with spaces.
    replace_by_spaces_re = re.compile('[\n/(){}\[\]\|@,;\.]')
    # Stopwords for the Spanish language.
    words_empty = set(nltk.corpus.stopwords.words('spanish'))

    clean_text = []
    for text in df_text_rows[column]:
        # Remove accents: decompose (NFD), strip the marks, recompose (NFC).
        text = normalize('NFC', accents_re.sub(r"\1", normalize("NFD", text)))

        # Change text to lowercase.
        text = text.lower()

        # Remove symbols.
        text = replace_by_spaces_re.sub(' ', text)

        # Collapse written laughter ("jajaja", "jejeje", ...).
        # NOTE(review): these patterns look odd (e.g. [ja]+aj\w requires a
        # trailing word character) -- kept exactly as in the original;
        # confirm the intended matches.
        text = re.sub(r'[ja]+aj\w', r'JA', text)
        text = re.sub(r'[je]+ej\w', r'JE', text)
        text = re.sub(r'[ji]+ij\w', r'JI', text)
        text = re.sub(r'[ha]+ah\w', r'HA', text)
        text = re.sub(r'[he]+eh\w', r'HE', text)

        # Eliminate stop words.
        text = ' '.join(word for word in text.split()
                        if word not in words_empty)

        # Keep the processed text.
        clean_text.append(text)

    # The clean texts are added in a new column.
    df_text_rows['FULL_TEXT_NORM'] = clean_text

    return df_text_rows

# Normalize the dialog text; adds the FULL_TEXT_NORM column to dfDialogs.
df_normalize(dfDialogs,'Dialog')

The new column with the standardized text is shown here:

We take the column “FULL_TEXT_NORM” and generate an array with all the words, from there we use the library “WordCloud” and give it as input our array.

from wordcloud import WordCloud, ImageColorGenerator

# Mask image that shapes the wordcloud.
imgMask = np.array(Image.open('amlo.jpg'))

# Generate one long string with every normalized word.
allWords = ' '.join(dialog for dialog in dfDialogs['FULL_TEXT_NORM'])

# Build the cloud on a black background. Use a distinct variable name so
# the WordCloud class itself is not shadowed (the original rebound the
# name WordCloud, making further instantiation impossible).
words_cloud = WordCloud(width=1920, height=1080, background_color="black",
                        random_state=42, max_font_size=200, mask=imgMask,
                        max_words=473).generate(allWords)
image_colors = ImageColorGenerator(imgMask)

# Render, save, then show. savefig must come BEFORE show(): show() closes
# the figure, so the original saved a blank image.
plt.figure(figsize=(16, 9), dpi=120)
plt.imshow(words_cloud.recolor(color_func=image_colors), interpolation='bilinear')
plt.title('WordCloud of the most mentioned words in the Mexican morning conferencess', fontsize=14)
plt.axis('off')
plt.savefig('wordcloudAmloManianera.png')
plt.show()

Final image with the most mentioned words in the morning lectures:

Most mentioned words in the Mexican morning conferences

Now the wordcloud of the most spoken words by Andres Manuel Lopez Obrador:

Most mentioned words in the Mexican morning conferences

Using spacy to generate a wordcloud of the adjectives, verbs, nouns of the 2018 dialog of AMLO.

Creating a word cloud using only nouns can provide a more focused view of the most important objects, people, places, or concepts mentioned in a piece of text. This can be useful for quickly identifying key themes or ideas in the text. Additionally, using only nouns can make the word cloud more visually appealing, as the resulting cloud will typically have a more balanced and natural-looking distribution of words. This can make it easier to understand and interpret the word cloud, as the words will be more evenly distributed and will not be crowded together in a cluttered-looking cloud.

Making use of adjectives, a word cloud can help to highlight the descriptive words used in a text, which can give insight into the tone or sentiment of the writing. Additionally, using only adjectives can make the word cloud more aesthetically pleasing and visually striking, as adjectives are often more colorful and descriptive than other types of words.

By using only verbs, the word cloud can also provide a more dynamic and active view of the text, as it will highlight the actions and events that are central to the story or message being conveyed.

# Creating a new DF with only what the president said during 2018.
# Bug fixes: the mask must come from dfDialogs (df has no 'name' column,
# so the original raised KeyError / misaligned the mask), and the
# conference_view exposes 'Date', not 'Anio', so the year is taken from
# the parsed Date column.
amlo_df = dfDialogs.loc[dfDialogs['name'] == "PRESIDENTE ANDRES MANUEL LOPEZ OBRADOR"]
amlo_df = amlo_df.loc[pd.to_datetime(amlo_df['Date']).dt.year == 2018]

nlp = spacy.load('es_core_news_md')
# Raise the limit so the whole year fits in a single spaCy document.
nlp.max_length = 17840260
# Generate one string with every normalized word the president said.
allWords = ' '.join(amlo_df.FULL_TEXT_NORM.values)
doc = nlp(allWords)

Creating arrays of the adjectives, verbs and nouns.

# Adjectives: alphabetic, non-numeric, non-stopword, non-currency tokens
# tagged ADJ.
adjectives_words = ' '.join(
    token.norm_ for token in doc
    if token.is_alpha and not token.like_num and not token.is_stop
    and not token.is_currency and token.pos_ == 'ADJ'
)

# Verbs: same filters as the other two lists. The original filtered verbs
# only on pos_, inconsistently letting numbers/stopwords/currency tokens
# through.
verbs_words = ' '.join(
    token.norm_ for token in doc
    if token.is_alpha and not token.like_num and not token.is_stop
    and not token.is_currency and token.pos_ == 'VERB'
)

# Nouns: same filters, tokens tagged NOUN.
nouns_words = ' '.join(
    token.norm_ for token in doc
    if token.is_alpha and not token.like_num and not token.is_stop
    and not token.is_currency and token.pos_ == 'NOUN'
)
from wordcloud import WordCloud, ImageColorGenerator

# Mask image that shapes the wordcloud.
imgMask = np.array(Image.open('amlo.jpg'))

# Build the adjectives cloud. Distinct variable name so the WordCloud
# class is not shadowed.
adjectives_cloud = WordCloud(width=1920, height=1080, background_color="black",
                             random_state=42, max_font_size=200, mask=imgMask,
                             max_words=473).generate(adjectives_words)
image_colors = ImageColorGenerator(imgMask)

# Render and save. Each cloud gets its own file name: the original saved
# all three POS clouds to 'wordcloudAmloManianera.png', each overwriting
# the previous one.
plt.figure(figsize=(16, 9), dpi=120)
plt.imshow(adjectives_cloud.recolor(color_func=image_colors), interpolation='bilinear')
plt.title('WordCloud of the most mentioned adjectives in the Mexican morning conferences by the president of Mexico in 2018', fontsize=14)
plt.axis('off')
plt.savefig('wordcloudAmloAdjectives.png')
plt.show()
from wordcloud import WordCloud, ImageColorGenerator

# Mask image that shapes the wordcloud.
imgMask = np.array(Image.open('amlo.jpg'))

# Build the nouns cloud. Distinct variable name so the WordCloud class
# is not shadowed.
nouns_cloud = WordCloud(width=1920, height=1080, background_color="black",
                        random_state=42, max_font_size=200, mask=imgMask,
                        max_words=473).generate(nouns_words)
image_colors = ImageColorGenerator(imgMask)

# Render and save to a noun-specific file: the original saved all three
# POS clouds to 'wordcloudAmloManianera.png', overwriting each other.
plt.figure(figsize=(16, 9), dpi=120)
plt.imshow(nouns_cloud.recolor(color_func=image_colors), interpolation='bilinear')
plt.title('WordCloud of the most mentioned nouns in the Mexican morning conferences by the president of Mexico in 2018', fontsize=14)
plt.axis('off')
plt.savefig('wordcloudAmloNouns.png')
plt.show()
from wordcloud import WordCloud, ImageColorGenerator

# Mask image that shapes the wordcloud.
imgMask = np.array(Image.open('amlo.jpg'))

# Build the verbs cloud. Distinct variable name so the WordCloud class
# is not shadowed.
verbs_cloud = WordCloud(width=1920, height=1080, background_color="black",
                        random_state=42, max_font_size=200, mask=imgMask,
                        max_words=473).generate(verbs_words)
image_colors = ImageColorGenerator(imgMask)

# Render and save to a verb-specific file: the original saved all three
# POS clouds to 'wordcloudAmloManianera.png', overwriting each other.
plt.figure(figsize=(16, 9), dpi=120)
plt.imshow(verbs_cloud.recolor(color_func=image_colors), interpolation='bilinear')
plt.title('WordCloud of the most mentioned verbs in the Mexican morning conferences by the president of Mexico in 2018', fontsize=14)
plt.axis('off')
plt.savefig('wordcloudAmloVerbs.png')
plt.show()

--

--