Deep Learning — Natural Language Processing (Part V-b)

Dejan Jovanovic
Jun 11, 2019 · 6 min read
The first step is to build a vocabulary from the training reviews. The function below walks through a directory and adds every training file to the vocabulary:

from os import listdir

# load training documents only in a directory
def process_documents_for_vocabulary(directory, vocabulary):
    # walk through all files in the directory
    i = 0
    print("Total number of files = %s" % len(listdir(directory)))
    for fileName in listdir(directory):
        # keep only training files: skip cv9* (held out for testing)
        # and anything that is not a .txt review
        if fileName.startswith('cv9') or not fileName.endswith('.txt'):
            continue
        # create the full path of the file to open
        path = directory + '/' + fileName
        i += 1
        print("File number = ", i, " - ", path)
        # add to vocabulary
        add_document_to_vocabulary(path, vocabulary)
Each document is loaded into memory with a small helper:

# load documents into memory
def load_document(fileName):
    # open the file as read only
    file = open(fileName, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text
Adding a document to the vocabulary means loading it, cleaning it, and updating the token counts:

# add document to vocabulary
def add_document_to_vocabulary(fileName, vocabulary):
    # load document
    document = load_document(fileName)
    # clean document
    tokens = clean_tokens(document)
    # update counts
    vocabulary.update(tokens)
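Under the hood, Counter.update does the counting. A quick illustrative sketch, using toy tokens rather than real review data:

from collections import Counter

# toy example of how repeated updates accumulate token counts
vocab = Counter()
vocab.update(['great', 'movie', 'great'])
vocab.update(['movie'])
print(vocab)   # -> Counter({'great': 2, 'movie': 2})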
Cleaning a document comes down to four steps, implemented in clean_tokens below:

  1. Remove tokens that are just punctuation
  2. Remove tokens that contain numbers
  3. Remove tokens that have only one character
  4. Remove stop words, i.e. tokens that don't carry much meaning, such as 'or' and 'and'
# clean and tokenize
def clean_tokens(document):
    # split document into tokens by white space
    tokens = document.split()
    # punctuation removal
    remove_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [remove_punctuation.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens
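As a quick sanity check (my own example, not part of the original script, and assuming the imports from the full program below), here is what clean_tokens does to a short lowercase sentence:

sample = "this movie is great and the acting was great"
print(clean_tokens(sample))   # -> ['movie', 'great', 'acting', 'great']
# note: tokens are not lowercased, so a capitalized stop word
# such as 'The' would slip past the stop-word filter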
Rare tokens carry little signal, so the vocabulary is trimmed to tokens that occur at least twice:

# constants
minimumOccurence = 2

# keep only tokens that occur at least minimumOccurence times
tokens = [token for token, count in vocabulary.items() if count >= minimumOccurence]
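A minimal sketch of what this filter does, on a toy Counter (illustrative data only):

from collections import Counter

# toy vocabulary: 'zeitgeist' appears only once and is dropped
toy = Counter({'movie': 10, 'great': 3, 'zeitgeist': 1})
kept = [token for token, count in toy.items() if count >= 2]
print(kept)   # -> ['movie', 'great']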

Finally, the trimmed vocabulary is written to a file, one token per line:

# save list to file
def save_list(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()
Putting all of the pieces together, here is the complete program:

from os import listdir
from nltk.corpus import stopwords
from collections import Counter
import re
import string

# constants
vocabularyFileName = 'vocabulary.txt'
negativeDirectory = 'review_polarity/txt_sentoken/neg'
positiveDirectory = 'review_polarity/txt_sentoken/pos'
minimumOccurence = 2

# initialize the vocabulary counter
vocabulary = Counter()

# save list to file
def save_list(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

# load documents into memory
def load_document(fileName):
    # open the file as read only
    file = open(fileName, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# clean and tokenize
def clean_tokens(document):
    # split document into tokens by white space
    tokens = document.split()
    # punctuation removal
    remove_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [remove_punctuation.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

# add document to vocabulary
def add_document_to_vocabulary(fileName, vocabulary):
    # load document
    document = load_document(fileName)
    # clean document
    tokens = clean_tokens(document)
    # update counts
    vocabulary.update(tokens)

# load training documents only in a directory
def process_documents_for_vocabulary(directory, vocabulary):
    # walk through all files in the directory
    i = 0
    print("Total number of files = %s" % len(listdir(directory)))
    for fileName in listdir(directory):
        # keep only training files: skip cv9* (held out for testing)
        # and anything that is not a .txt review
        if fileName.startswith('cv9') or not fileName.endswith('.txt'):
            continue
        # create the full path of the file to open
        path = directory + '/' + fileName
        i += 1
        print("File number = ", i, " - ", path)
        # add to vocabulary
        add_document_to_vocabulary(path, vocabulary)

# add all documents to vocabulary
process_documents_for_vocabulary(negativeDirectory, vocabulary)
process_documents_for_vocabulary(positiveDirectory, vocabulary)

# Vocabulary size
print("Vocabulary size: ", len(vocabulary))

# keep only tokens that occur at least minimumOccurence times
tokens = [token for token, count in vocabulary.items() if count >= minimumOccurence]

# save vocabulary to the file for later use
save_list(tokens, vocabularyFileName)

### End of first step ###

### Report ###
print('*********************************************')
print('Report')
print('---------------------------------------------')
# print the size of vocabulary
print("Vocabulary size: ", len(vocabulary))
# how many tokens do we have now
print("Reduced vocabulary size: ", len(tokens))

Summary

I hope you enjoyed this read. Now that the vocabulary is prepared, the next steps are to tokenize the reviews against this vocabulary and to build a Deep Learning model for sentiment analysis. We will explore these topics in the next story.
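Since the saved vocabulary is the starting point for the next story, here is a minimal sketch of loading it back from vocabulary.txt (the load_vocabulary helper is my own, not part of the original script):

# load the saved vocabulary back into memory as a set of tokens
# (hypothetical helper, mirrors load_document above)
def load_vocabulary(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return set(text.split())

vocabulary_tokens = load_vocabulary('vocabulary.txt')
print("Loaded vocabulary size: ", len(vocabulary_tokens))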

