
Automated Keyword Extraction from Articles using NLP

Sowmya Vivek
Dec 17, 2018 · 9 min read

Background

About the dataset

The dataset used here (papers2.txt) is a tab-delimited file of research-paper abstracts: each of its 3,847 rows holds one paper, with the abstract text in the abstract1 column.

High-level approach

The pipeline proceeds in four broad steps: load and explore the raw text, pre-process it (cleaning, stop-word removal, lemmatisation), convert it to a matrix of n-gram counts, and finally apply TF-IDF weighting to pull out the top-scoring keywords for a given document.


Importing the dataset

import pandas
# load the dataset
dataset = pandas.read_csv('papers2.txt', delimiter = '\t')
dataset.head()
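Before moving on, a quick sanity check (my addition, not part of the original walkthrough) confirms that the file parsed as expected: one row per paper, with the abstract text in the abstract1 column.

# Sanity check (my addition): row count and the column we rely on below
print(dataset.shape)    # expect 3,847 rows for this dataset
print(dataset.columns)  # should include 'abstract1'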

Preliminary text exploration

Fetch word count for each abstract

#Fetch wordcount for each abstract
dataset['word_count'] = dataset['abstract1'].apply(lambda x: len(str(x).split(" ")))
dataset[['abstract1','word_count']].head()
##Descriptive statistics of word counts
dataset.word_count.describe()

Most common and uncommon words

#Identify common words
freq = pandas.Series(' '.join(dataset['abstract1']).split()).value_counts()[:20]
freq
[Figure: Most common words]
#Identify uncommon words
freq1 = pandas.Series(' '.join(dataset['abstract1']).split()).value_counts()[-20:]
freq1

Text Pre-processing

[Figure: Objectives of text pre-processing]
[Figure: Text pre-processing]
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()
stem = PorterStemmer()
word = "inversely"print("stemming:",stem.stem(word))
print("lemmatization:", lem.lemmatize(word, "v"))
# Libraries for text preprocessing
import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
#nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
##Creating a list of stop words and adding custom stopwords
stop_words = set(stopwords.words("english"))
##Creating a list of custom stopwords
new_words = ["using", "show", "result", "large", "also", "iv", "one", "two", "new", "previously", "shown"]
stop_words = stop_words.union(new_words)
corpus = []
for i in range(dataset.shape[0]):  # 3,847 abstracts in this dataset
    #Remove punctuations
    text = re.sub('[^a-zA-Z]', ' ', dataset['abstract1'][i])

    #Convert to lowercase
    text = text.lower()

    #remove tags
    text = re.sub("</?.*?>", " <> ", text)

    # remove special characters and digits
    text = re.sub(r"(\d|\W)+", " ", text)

    ##Convert to list from string
    text = text.split()

    ##Stemming (instantiated here, but lemmatisation is what gets applied below)
    ps = PorterStemmer()
    #Lemmatisation
    lem = WordNetLemmatizer()
    text = [lem.lemmatize(word) for word in text if word not in stop_words]
    text = " ".join(text)
    corpus.append(text)
#View corpus item
corpus[222]
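To see what the pre-processing actually did, it helps to compare a raw abstract with its cleaned counterpart (a quick check of my own, using the same index as above):

# Raw vs. cleaned text for the same paper (my addition)
print(dataset['abstract1'][222][:300])  # original abstract
print(corpus[222][:300])                # lowercased, lemmatised, stop words removed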

Data Exploration

#Word cloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
%matplotlib inline

wordcloud = WordCloud(
    background_color='white',
    stopwords=stop_words,
    max_words=100,
    max_font_size=50,
    random_state=42
).generate(" ".join(corpus))  # join the documents; str(corpus) would leak list punctuation into the cloud

print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
fig.savefig("word1.png", dpi=900)
[Figure: Word cloud]

Text preparation

Creating a vector of word counts

from sklearn.feature_extraction.text import CountVectorizer
import re
cv=CountVectorizer(max_df=0.8,stop_words=stop_words, max_features=10000, ngram_range=(1,3))
X=cv.fit_transform(corpus)
list(cv.vocabulary_.keys())[:10]
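The result X is a sparse document-term matrix: one row per abstract, one column per retained n-gram. A quick inspection (my addition) makes the dimensions concrete:

# Inspect the document-term matrix (my addition)
print(X.shape)  # (3847, 10000): abstracts x n-gram features, capped by max_features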

Visualize top N uni-grams, bi-grams & tri-grams

#Most frequently occurring words
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
#Convert most freq words to dataframe for plotting bar plot
top_words = get_top_n_words(corpus, n=20)
top_df = pandas.DataFrame(top_words)
top_df.columns=["Word", "Freq"]
#Barplot of most freq words
import seaborn as sns
sns.set(rc={'figure.figsize':(13,8)})
g = sns.barplot(x="Word", y="Freq", data=top_df)
g.set_xticklabels(g.get_xticklabels(), rotation=30)
[Figure: Bar plot of most frequently occurring uni-grams]
#Most frequently occurring bi-grams
def get_top_n2_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(2,2), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
top2_words = get_top_n2_words(corpus, n=20)
top2_df = pandas.DataFrame(top2_words)
top2_df.columns=["Bi-gram", "Freq"]
print(top2_df)
#Barplot of most freq Bi-grams
import seaborn as sns
sns.set(rc={'figure.figsize':(13,8)})
h=sns.barplot(x="Bi-gram", y="Freq", data=top2_df)
h.set_xticklabels(h.get_xticklabels(), rotation=45)
[Figure: Bar plot of most frequently occurring bi-grams]
#Most frequently occurring tri-grams
def get_top_n3_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(3,3), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
top3_words = get_top_n3_words(corpus, n=20)
top3_df = pandas.DataFrame(top3_words)
top3_df.columns=["Tri-gram", "Freq"]
print(top3_df)
#Barplot of most freq Tri-grams
import seaborn as sns
sns.set(rc={'figure.figsize':(13,8)})
j=sns.barplot(x="Tri-gram", y="Freq", data=top3_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45)
[Figure: Bar plot of most frequently occurring tri-grams]
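The three helper functions above differ only in their CountVectorizer settings, so they can be collapsed into one. The refactor below is my own sketch, not from the original article (note it applies max_features=2000 to uni-grams too, which get_top_n_words did not):

# Generalised helper (my sketch): works for uni-, bi- and tri-grams alike
def get_top_ngrams(corpus, ngram_range=(1, 1), n=None, max_features=2000):
    vec = CountVectorizer(ngram_range=ngram_range, max_features=max_features).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    return sorted(words_freq, key=lambda x: x[1], reverse=True)[:n]

# Equivalent to the bi-gram call above:
top2_words = get_top_ngrams(corpus, ngram_range=(2, 2), n=20)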

Converting to a matrix of integers

from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(X)
# get feature names (on scikit-learn >= 1.0 this method is cv.get_feature_names_out())
feature_names = cv.get_feature_names()

# fetch the document for which keywords need to be extracted
doc=corpus[532]

#generate tf-idf for the given document
tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))
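As an aside, scikit-learn's TfidfVectorizer rolls CountVectorizer and TfidfTransformer into a single step; the sketch below (my addition) would produce equivalent scores with the same settings:

# One-step alternative (my addition): TfidfVectorizer = CountVectorizer + TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_df=0.8, stop_words=stop_words, max_features=10000,
                        ngram_range=(1,3), smooth_idf=True, use_idf=True)
tfidf_matrix = tfidf.fit_transform(corpus)  # same settings as cv plus the TF-IDF weighting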
#Function for sorting tf_idf in descending order
from scipy.sparse import coo_matrix
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""

    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a dict of feature -> score
    #results = zip(feature_vals, score_vals)
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]

    return results
#sort the tf-idf vectors by descending order of scores
sorted_items = sort_coo(tf_idf_vector.tocoo())

#extract only the top n; n here is 5
keywords = extract_topn_from_vector(feature_names, sorted_items, 5)

# now print the results
print("\nAbstract:")
print(doc)
print("\nKeywords:")
for k in keywords:
    print(k, keywords[k])
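To extract keywords for any abstract rather than just index 532, the tail end of the pipeline can be wrapped into a small function (my sketch, reusing the fitted cv, tfidf_transformer and feature_names from above):

# Convenience wrapper (my sketch): keywords for any document in the corpus
def get_keywords(idx, topn=5):
    doc = corpus[idx]
    tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))
    sorted_items = sort_coo(tf_idf_vector.tocoo())
    return extract_topn_from_vector(feature_names, sorted_items, topn)

print(get_keywords(532))  # same document as the worked example above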

Concluding remarks

Starting from raw abstracts, we cleaned and lemmatised the text, explored the most frequent uni-, bi- and tri-grams, and finally used TF-IDF weighting to surface the keywords that best characterise an individual document. The same pipeline can be pointed at any collection of articles with a text column.
