
Analyzing Obama Speeches Since 2004

Aymen El Amri
May 30, 2016

Downloading Content

#!/usr/bin/env python
# coding: utf8
from goose import Goose
import urllib
import lxml.html
import codecs

def get_links(url, domain):
    connection = urllib.urlopen(url)
    dom = lxml.html.fromstring(connection.read())
    # select the url in href for all <a> tags (links)
    for link in dom.xpath('//a/@href'):
        if link.startswith("speech") and link.endswith("htm"):
            yield domain + link

def get_text(url):
    g = Goose()
    article = g.extract(url=url)
    # save each speech in its own file, named after the link hash
    with codecs.open(article.link_hash + ".speech", "w", "utf-8-sig") as text_file:
        text_file.write(article.cleaned_text)

if __name__ == "__main__":
    link = "http://www.americanrhetoric.com/barackobamaspeeches.htm"
    domain = "http://www.americanrhetoric.com/"
    for i in get_links(link, domain):
        get_text(i)

import os
from nltk.corpus import stopwords       # requires: nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer

english_stopwords = stopwords.words('english')

# Concatenate all the downloaded speeches into a single file
for file in os.listdir("."):
    if file.endswith(".speech"):
        os.system("cat " + file + " >> all.speeches")

with codecs.open("all.speeches", "r", "utf-8-sig") as text_file:
    r = text_file.read()

# Remove punctuation
tokenizer = RegexpTokenizer(r'\w+')
_tokens = tokenizer.tokenize(r)

# Get clean tokens (drop English stop words)
tokens = [t for t in _tokens if t.lower() not in english_stopwords]

Analyzing Content

The Lexical Diversity

import matplotlib.pyplot as plt
import numpy as np

# Process lexical diversity: unique tokens as a percentage of all tokens
st = len(set(tokens))
lt = len(tokens)
y = [st * 100.0 / lt]
print(y)

fig = plt.figure()
ax = fig.add_subplot(111)
N = 1
# necessary variables
ind = np.arange(N)
width = 0.7
rect = ax.bar(ind, y, width, color='black')
# axes and labels
ax.set_xlim(-width, len(ind) + width)
ax.set_ylim(0, 100)
ax.set_ylabel('Score')
ax.set_title('Lexical Diversity')
xTickMarks = ['Lexical Diversity Meter']
ax.set_xticks(ind + width)
xtickNames = ax.set_xticklabels(xTickMarks)
plt.setp(xtickNames, rotation=45, fontsize=10)
# add a legend
ax.legend((rect[0],), ('Lexical diversity',))
plt.show()

POS Tags Frequency

import nltk
from collections import Counter

# get tagged tokens
tagged = nltk.pos_tag(tokens)
# count tokens by tag (verb, noun, etc.)
counts = Counter(tag for word, tag in tagged)
# the tags found in the corpus
keys = counts.keys()
y_pos = np.arange(len(keys))
# get the count for each tag
p = [counts[k] for k in keys]
error = np.random.rand(len(keys))
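The snippet above prepares the bar-chart variables (y_pos, p, error) but the plotting call itself does not appear in the text. A minimal sketch of how they could be passed to matplotlib, reusing the plt and np imports from the previous snippet:

# Sketch (not from the original snippet): plot the POS tag counts as a bar chart
plt.figure(figsize=(14, 6))
plt.bar(y_pos, p, align='center', alpha=0.5, yerr=error)
plt.xticks(y_pos, list(keys), rotation=90, fontsize=8)
plt.ylabel('Count')
plt.title('POS tag frequency')
plt.show()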
POS Tag | Description | Example
CC | coordinating conjunction | and
CD | cardinal number | 1, third
DT | determiner | the
EX | existential there | there is
FW | foreign word | d'hoevre
IN | preposition/subordinating conjunction | in, of, like
JJ | adjective | big
JJR | adjective, comparative | bigger
JJS | adjective, superlative | biggest
LS | list marker | 1)
MD | modal | could, will
NN | noun, singular or mass | door
NNS | noun, plural | doors
NNP | proper noun, singular | John
NNPS | proper noun, plural | Vikings
PDT | predeterminer | both the boys
POS | possessive ending | friend's
PRP | personal pronoun | I, he, it
PRP$ | possessive pronoun | my, his
RB | adverb | however, usually, naturally, here, good
RBR | adverb, comparative | better
RBS | adverb, superlative | best
RP | particle | give up
TO | to | to go, to him
UH | interjection | uhhuhhuhh
VB | verb, base form | take
VBD | verb, past tense | took
VBG | verb, gerund/present participle | taking
VBN | verb, past participle | taken
VBP | verb, singular present, non-3rd person | take
VBZ | verb, 3rd person singular present | takes
WDT | wh-determiner | which
WP | wh-pronoun | who, what
WP$ | possessive wh-pronoun | whose
WRB | wh-adverb | where, when
# Top 60 words
dist = nltk.FreqDist(tokens)
dist.plot(60, cumulative=False)

Common Expressions

text = nltk.Text(_tokens)
collocation = text.collocations(num=60)
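Note that in most NLTK versions Text.collocations() prints its results rather than returning them, so the collocation variable above ends up as None. If you want the collocations as an actual Python list, a sketch along these lines should work (the names finder, bigram_measures and top_collocations are just illustrative):

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

# Sketch: score bigrams by PMI and keep the 60 best ones as a list
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(_tokens)
finder.apply_freq_filter(3)  # ignore bigrams that appear fewer than 3 times
top_collocations = finder.nbest(bigram_measures.pmi, 60)
for w1, w2 in top_collocations:
    print(w1 + " " + w2)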

Extracting Nouns, Locations, Organizations And Other Stuff

# ORGANIZATION : Georgia-Pacific Corp., WHO
# PERSON : Eddy Bonte, President Obama
# LOCATION : Murray River, Mount Everest
# DATE : June, 2008-06-29
# TIME : two fifty a m, 1:30 p.m.
# MONEY : 175 million Canadian Dollars, GBP 10.40
# PERCENT : twenty pct, 18.75 %
# FACILITY : Washington Monument, Stonehenge
# GPE : South East Asia, Midlothian
from nltk import ne_chunk
from nltk.tree import Tree

# keep only the named-entity chunks
nouns = [chunk for chunk in ne_chunk(tagged) if isinstance(chunk, Tree)]

persons = []
locations = []
organizations = []
dates = []
times = []
percents = []
facilities = []
gpes = []

for tree in nouns:
    if tree.label() == "PERSON":
        person = ' '.join(c[0] for c in tree.leaves())
        persons.append(person)
    if tree.label() == "LOCATION":
        location = ' '.join(c[0] for c in tree.leaves())
        locations.append(location)
    if tree.label() == "ORGANIZATION":
        organization = ' '.join(c[0] for c in tree.leaves())
        organizations.append(organization)
    if tree.label() == "DATE":
        date = ' '.join(c[0] for c in tree.leaves())
        dates.append(date)
    if tree.label() == "TIME":
        time = ' '.join(c[0] for c in tree.leaves())
        times.append(time)
    if tree.label() == "PERCENT":
        percent = ' '.join(c[0] for c in tree.leaves())
        percents.append(percent)
    if tree.label() == "FACILITY":
        facility = ' '.join(c[0] for c in tree.leaves())
        facilities.append(facility)
    if tree.label() == "GPE":
        gpe = ' '.join(c[0] for c in tree.leaves())
        gpes.append(gpe)
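With the entities collected in those lists, one natural next step (not shown in the original snippet) is to count which people, organizations and places are mentioned most often. A minimal sketch using collections.Counter:

from collections import Counter

# Sketch: most frequently mentioned entities in the speeches
for name, entities in [("Persons", persons),
                       ("Organizations", organizations),
                       ("GPEs", gpes)]:
    print(name)
    for entity, count in Counter(entities).most_common(10):
        print("  " + entity + " : " + str(count))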

Finding Other Possibilities

from nltk.util import bigrams, trigrams, everygrams

bi = bigrams(tokens)
tri = trigrams(tokens)
every = everygrams(_tokens, min_len=20, max_len=20)

# print the first 120 bigrams
bilist = list(bi)[:120]
for element in bilist:
    print(element[0] + " " + element[1])
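The every generator of 20-grams is built but never consumed in the snippet above. If you want to peek at a few of the long n-grams it produces without materializing all of them, a small sketch:

from itertools import islice

# Sketch: print the first three 20-grams from the `every` generator above
for gram in islice(every, 3):
    print(' '.join(gram))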

Let’s Generate A Speech

from pymarkovchain import MarkovChain

mc = MarkovChain()
# build the Markov model once from the full text of the speeches
mc.generateDatabase(r)
# generate a few "speeches"
for i in range(1, 20):
    g = mc.generateString()
    print(g)
