Deep Learning: Natural Language Processing (Part V-d)

Dejan Jovanovic
Jul 17, 2019

Working with Text

from keras.layers import Embedding
embedding_layer = Embedding(25767, 100)
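
The Embedding layer is essentially a lookup table that turns each integer word index into a dense vector which is learned together with the rest of the network. As a rough sketch of what it produces (the word indices below are invented; only the vocabulary size of 25,767 and the 100-dimensional output come from the snippet above), a batch of padded integer sequences comes out as a 3D tensor of shape (batch size, sequence length, embedding size):

import numpy as np
from keras.models import Sequential
from keras.layers import Embedding

# stand-alone toy model: just the embedding lookup, nothing else
model = Sequential()
model.add(Embedding(25767, 100, input_length=4))
model.compile('rmsprop', 'mse')

# two invented "documents", already integer-encoded and padded to length 4
toy_batch = np.array([[12, 7, 256, 0],
                      [3, 98, 0, 0]])
vectors = model.predict(toy_batch)
print(vectors.shape)   # (2, 4, 100): batch x sequence length x embedding size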
# testing vs training split
training_split = 0.9
# create the model
def create_model(vocabulary_size, max_length):
    # define network
    model = Sequential()
    model.add(Embedding(vocabulary_size, 250,
                        input_length=max_length))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    # summarize defined model
    model.summary()
    return model
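
In this network the embedded sequence is scanned by a one-dimensional convolution with 32 filters and a kernel size of 8 (so each filter looks at 8 consecutive word vectors), downsampled with max pooling, flattened, and passed through a small dense layer before the single sigmoid unit that scores the review. A quick sketch of how it might be instantiated; the numbers here are placeholders, the full script derives the real vocabulary_size from the Tokenizer and max_length from the longest cleaned training review:

# hypothetical values for illustration only
model = create_model(vocabulary_size=25767, max_length=1380)

The complete script, from loading and cleaning the reviews through training, evaluation, and prediction on new text, follows.
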
from os import listdir
from os import path
from nltk.corpus import stopwords
from numpy import array
from numpy import random
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from math import floor
from random import shuffle
import re
import string

seed = 7
random.seed(seed)

vocabulary_fileName = "vocabulary.txt"

# testing vs training split
training_split = 0.9

# load file into memory
def load_document(fileName):
    # open the file as read only
    file = open(fileName, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# get list of files from specified directory
def get_file_list_from_dir(datadirectory):
    # get list of all files in the specified directory
    all_files = listdir(path.abspath(datadirectory))
    # make sure we only get the review files, which end with .txt
    data_files = list(filter(lambda file: file.endswith('.txt'),
                             all_files))
    return data_files

# split file list into training and testing lists
def get_training_and_testing_set(file_list):
    split_index = floor(len(file_list) * training_split)
    training = file_list[:split_index]
    testing = file_list[split_index:]
    return training, testing

# clean and tokenize
def clean_tokens(document):
    # split document into tokens by white space
    tokens = document.split()
    # punctuation removal
    remove_punctuation = re.compile('[%s]' %
                                    re.escape(string.punctuation))
    tokens = [remove_punctuation.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

# load document, clean it and return line of tokens
def document_to_line(fileName, vocabulary):
    # load the document
    document = load_document(fileName)
    # clean the document
    tokens = clean_tokens(document)
    # filter the tokens by vocabulary
    tokens = [x for x in tokens if x in vocabulary]
    return ' '.join(tokens)

# process all documents in the folder
def process_documents(directory, fileList, vocabulary):
    lines = list()
    # go over all files in the directory
    for fileName in fileList:
        # create the full path of the file to be opened
        # (named so it does not shadow the imported os.path module)
        filePath = directory + '/' + fileName
        # load and clean the data
        line = document_to_line(filePath, vocabulary)
        # add to list
        lines.append(line)
    return lines

# load and clean a dataset
def load_clean_dataset(vocabulary):
    # get positive feedback file list
    positiveFileList = get_file_list_from_dir('./review_polarity/txt_sentoken/pos')
    # get negative feedback file list
    negativeFileList = get_file_list_from_dir('./review_polarity/txt_sentoken/neg')
    # shuffle files
    shuffle(positiveFileList)
    shuffle(negativeFileList)

    # get training and testing file lists
    posTraining, posTesting = get_training_and_testing_set(positiveFileList)
    negTraining, negTesting = get_training_and_testing_set(negativeFileList)

    # load documents
    negativeFeedbackTraining = process_documents('./review_polarity/txt_sentoken/neg',
                                                 negTraining, vocabulary)
    positiveFeedbackTraining = process_documents('./review_polarity/txt_sentoken/pos',
                                                 posTraining, vocabulary)
    negativeFeedbackTesting = process_documents('./review_polarity/txt_sentoken/neg',
                                                negTesting, vocabulary)
    positiveFeedbackTesting = process_documents('./review_polarity/txt_sentoken/pos',
                                                posTesting, vocabulary)
    # keep the document order in sync with the labels below:
    # negative reviews first (label 0), positive reviews second (label 1)
    trainingDocuments = negativeFeedbackTraining + positiveFeedbackTraining
    testingDocuments = negativeFeedbackTesting + positiveFeedbackTesting
    # prepare labels
    trainingLabels = array([0 for _ in range(len(negativeFeedbackTraining))] +
                           [1 for _ in range(len(positiveFeedbackTraining))])
    testingLabels = array([0 for _ in range(len(negativeFeedbackTesting))] +
                          [1 for _ in range(len(positiveFeedbackTesting))])
    return trainingDocuments, trainingLabels, testingDocuments, testingLabels

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# integer encode and pad documents
def encode_docs(tokenizer, max_length, documents):
    # integer encode
    encoded = tokenizer.texts_to_sequences(documents)
    # pad sequence
    padded = pad_sequences(encoded, maxlen=max_length,
                           padding='post')
    return padded

# create the model
def create_model(vocabulary_size, max_length):
    # define network
    model = Sequential()
    model.add(Embedding(vocabulary_size, 100,
                        input_length=max_length))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    # summarize defined model
    model.summary()
    return model

# turn the document into clean tokens
def clean_doc(doc, vocabulary):
    # split doc into tokens by white space
    tokens = doc.split()
    # prepare regular expression for filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # filter out tokens that are not part of the vocabulary
    tokens = [word for word in tokens if word in vocabulary]
    tokens = ' '.join(tokens)
    return tokens

def predict_sentiment(review, vocabulary, tokenizer,
                      max_length, model):
    # clean review
    line = clean_doc(review, vocabulary)
    # encode and pad the cleaned review
    padded = encode_docs(tokenizer, max_length, [line])
    # predict sentiment: the sigmoid output is the estimated
    # probability that the review is positive (label 1)
    what = model.predict(padded, verbose=0)
    percent_pos = what[0, 0]
    if round(percent_pos) == 0:
        return (1 - percent_pos), "NEGATIVE"
    return percent_pos, "POSITIVE"

# load the vocabulary
vocabulary = load_document(vocabulary_fileName)
vocabulary = set(vocabulary.split())

# load all reviews
training_docs, ytrain, test_docs, ytest = load_clean_dataset(vocabulary)

# calculate maximum sequence length
max_length = max([len(s.split()) for s in training_docs])

# create the tokenizer
tokenizer = create_tokenizer(training_docs)

# Vocabulary size
vocabulary_size = len(tokenizer.word_index)+1

# encode data
Xtrain = encode_docs(tokenizer, max_length, training_docs)
Xtest = encode_docs(tokenizer, max_length, test_docs)

model = create_model(vocabulary_size, max_length)
model.fit(Xtrain, ytrain, epochs=10, verbose=2)

# evaluate the model
loss, accuracy = model.evaluate(Xtest, ytest, verbose=0)
print("-------------------------------------------------")
print('Test accuracy: %.2f' % (accuracy*100) + '%')
print("Vocabulary size: ", vocabulary_size)
print('Maximum length: %d' % max_length)
print("-------------------------------------------------")

# test positive text
text = 'Best movie ever! It was great, I recommend it.'
percent, sentiment = predict_sentiment(text, vocabulary, tokenizer,
max_length, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment,
percent*100))
# test negative text
text = 'This is a bad movie.'
percent, sentiment = predict_sentiment(text, vocabulary, tokenizer,
max_length, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment,
percent*100))
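
To make the output of predict_sentiment concrete: with the labelling used above (0 = negative, 1 = positive), the sigmoid layer returns the estimated probability that the review is positive, and the function turns that into a label plus a confidence score. A tiny worked example with an invented prediction value:

percent_pos = 0.13   # invented sigmoid output for illustration
# round(0.13) == 0, so the review is reported as NEGATIVE
# with a confidence of 1 - 0.13 = 0.87, i.e. 87%;
# a value of 0.91 would instead be reported as POSITIVE at 91%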

Summary

In this part we assembled an end-to-end sentiment classifier for the movie review polarity dataset: the reviews are cleaned, filtered against the saved vocabulary, integer-encoded and padded with a Keras Tokenizer, and then classified by an Embedding layer followed by a one-dimensional convolutional network, trained for ten epochs and evaluated on the held-out 10% of the files.
