Deep Learning — Natural Language Processing (Part V-b)

Dejan Jovanovic
Jun 11
Building a vocabulary is the first step in preparing the movie review dataset for modeling. The complete code for this step is listed below: it loads every training review, cleans and tokenizes the text, counts how often each token occurs, filters out rare tokens, and saves the resulting vocabulary to a file.

from os import listdir
from nltk.corpus import stopwords
from collections import Counter
import re
import string

# constants
vocabularyFileName = 'vocabulary.txt'
negativeDirectory = 'review_polarity/txt_sentoken/neg'
positiveDirectory = 'review_polarity/txt_sentoken/pos'
minimumOccurence = 2

# save list to file
def save_list(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

# load a document into memory
def load_document(fileName):
    # open the file as read only
    file = open(fileName, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# clean and tokenize
def clean_tokens(document):
    # split document into tokens by white space
    tokens = document.split()
    # punctuation removal
    remove_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [remove_punctuation.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens
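
# for example, clean_tokens('the movie, was surprisingly good!') would return
# ['movie', 'surprisingly', 'good']: punctuation is stripped, the stop words
# 'the' and 'was' are dropped, and only alphabetic tokens longer than one
# character are kept (note that tokens are not lowercased here)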

# add document to vocabulary
def add_document_to_vocabulary(fileName, vocabulary):
    # load document
    document = load_document(fileName)
    # clean document
    tokens = clean_tokens(document)
    # update counts
    vocabulary.update(tokens)

# process only the training documents in a directory
def process_documents_for_vocabulary(directory, vocabulary):
    # iterate over all files in the directory
    i = 0
    print("Total number of files = %d" % len(listdir(directory)))
    for fileName in listdir(directory):
        # skip the held-out test files (cv9xx) and any non-text files
        if fileName.startswith('cv9') or not fileName.endswith('.txt'):
            continue
        # create the full path of the file to open
        path = directory + '/' + fileName
        i += 1
        print("File number = ", i, " - ", path)
        # add to vocabulary
        add_document_to_vocabulary(path, vocabulary)

# define vocabulary
vocabulary = Counter()

# add all documents to vocabulary
process_documents_for_vocabulary(negativeDirectory, vocabulary)
process_documents_for_vocabulary(positiveDirectory, vocabulary)

# Vocabulary size
print("Vocabulary size: ", len(vocabulary))

# keep only tokens that occur at least minimumOccurence times
tokens = [word for word, count in vocabulary.items() if count >= minimumOccurence]
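# with minimumOccurence = 2 this drops every word that appears only once in
# the whole corpus; raising the threshold shrinks the vocabulary further at
# the cost of losing rarer words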

# save vocabulary to the file for later use
save_list(tokens, vocabularyFileName)

### End of first step ###

### Report ###
print('*********************************************')
print('Report')
print('---------------------------------------------')
# print the size of vocabulary
print("Vocabulary size: ", len(vocabulary))
# how many tokens do we have now
print("Reduced vocabulary size: ", len(tokens))

Summary

In this part we built the vocabulary for the movie review sentiment dataset. Each training review was loaded, cleaned, and tokenized; the token counts were accumulated in a Counter; tokens occurring fewer than two times were filtered out; and the resulting vocabulary was saved to vocabulary.txt for use in the next part of this series.
