Deep Learning — Natural Language Processing (Part V-c)

Dejan Jovanovic
Jul 3, 2019 · 6 min read
Running the complete program (listed below) on the movie review polarity dataset produces the following model summary, training log, and sample predictions:

_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
dense_1 (Dense)              (None, 50)                1288450
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 51
=================================================================
Total params: 1,288,501
Trainable params: 1,288,501
Non-trainable params: 0
_________________________________________________________________
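As a quick sanity check on these numbers: the first Dense layer has one weight per input feature per unit, plus one bias per unit, so 25,768 input features × 50 units + 50 biases = 1,288,450 parameters, and the output layer adds 50 weights + 1 bias = 51 more, for 1,288,501 in total. The input width of 25,768 is implied by the summary (the Keras Tokenizer reserves index 0, so it equals the tokenizer's word count plus one); your exact value may differ slightly depending on how your vocabulary file was built.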
Epoch 1/10
- 5s - loss: 0.4819 - acc: 0.7794
Epoch 2/10
- 3s - loss: 0.0702 - acc: 0.9911
Epoch 3/10
- 3s - loss: 0.0179 - acc: 1.0000
Epoch 4/10
- 3s - loss: 0.0072 - acc: 1.0000
Epoch 5/10
- 3s - loss: 0.0038 - acc: 1.0000
Epoch 6/10
- 3s - loss: 0.0022 - acc: 1.0000
Epoch 7/10
- 3s - loss: 0.0014 - acc: 1.0000
Epoch 8/10
- 3s - loss: 9.5990e-04 - acc: 1.0000
Epoch 9/10
- 3s - loss: 7.0107e-04 - acc: 1.0000
Epoch 10/10
- 3s - loss: 5.3340e-04 - acc: 1.0000
Test Accuracy: 92.50

Training accuracy reaches 100% by the third epoch while test accuracy stays at 92.50%, an early sign that the model is overfitting the training reviews.
Review: [Best movie ever! It was great, I recommend it.]
Sentiment: POSITIVE (56.636%)
Review: [This is a bad movie.]
Sentiment: NEGATIVE (66.628%)

The printed percentage is the model's confidence in the predicted label; note how modest it is even on these unambiguous toy reviews. Here is the complete code listing:
from os import listdir
from nltk.corpus import stopwords
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
import re
import string

# name of the vocabulary file built earlier in this series
vocabulary_fileName = "vocabulary.txt"

# load file into memory
def load_document(fileName):
    # open the file as read only and read all text
    with open(fileName, 'r') as file:
        text = file.read()
    return text

# clean and tokenize
def clean_tokens(document):
    # split document into tokens by white space
    tokens = document.split()
    # punctuation removal
    remove_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [remove_punctuation.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

# load document, clean it and return line of tokens
def document_to_line(fileName, vocabulary):
    # load the document
    document = load_document(fileName)
    # clean the document
    tokens = clean_tokens(document)
    # filter the tokens by vocabulary
    tokens = [x for x in tokens if x in vocabulary]
    return ' '.join(tokens)

# process all documents in the folder
def process_documents(directory, vocabulary, isTraining):
    lines = list()
    # go over all files in the directory
    for fileName in listdir(directory):
        # reviews named cv9* are held out as the test set
        if isTraining and fileName.startswith('cv9'):
            continue
        if not isTraining and not fileName.startswith('cv9'):
            continue
        # create the full path of the file to be opened
        path = directory + '/' + fileName
        # load and clean the data
        line = document_to_line(path, vocabulary)
        # add to list
        lines.append(line)
    return lines

# load and clean a dataset
def load_clean_dataset(vocabulary, isTraining):
    # load documents
    negativeFeedback = process_documents(
        './review_polarity/txt_sentoken/neg',
        vocabulary, isTraining)
    positiveFeedback = process_documents(
        './review_polarity/txt_sentoken/pos',
        vocabulary, isTraining)
    documents = negativeFeedback + positiveFeedback
    # prepare labels: 0 = negative, 1 = positive, matching the document order
    labels = array([0 for _ in range(len(negativeFeedback))] +
                   [1 for _ in range(len(positiveFeedback))])
    return documents, labels

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# create the model
def create_model(n_words):
    # define network
    model = Sequential()
    model.add(Dense(50, input_shape=(n_words,), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    # summarize defined model
    model.summary()
    return model

# classify a review as positive or negative
def predict_sentiment(review, vocabulary, tokenizer, model):
    # clean the review text
    tokens = clean_tokens(review)
    # filter by vocabulary
    tokens = [w for w in tokens if w in vocabulary]
    # convert to line
    line = ' '.join(tokens)
    # encode as a binary bag-of-words vector
    encoded = tokenizer.texts_to_matrix([line], mode='binary')
    # the sigmoid output is the probability of the positive class
    prediction = model.predict(encoded, verbose=0)
    percent_pos = prediction[0, 0]
    if round(percent_pos) == 0:
        return (1 - percent_pos), "NEGATIVE"
    return percent_pos, "POSITIVE"

# load the vocabulary
vocabulary = load_document(vocabulary_fileName)
vocabulary = set(vocabulary.split())

# vocabulary size
print("Vocabulary size: ", len(vocabulary))

# load all reviews
training_docs, ytrain = load_clean_dataset(vocabulary, True)
test_docs, ytest = load_clean_dataset(vocabulary, False)

# create the tokenizer
tokenizer = create_tokenizer(training_docs)
# encode data
Xtrain = tokenizer.texts_to_matrix(training_docs, mode='binary')
Xtest = tokenizer.texts_to_matrix(test_docs, mode='binary')

# define the model
n_words = Xtrain.shape[1]
print("n_words:", n_words)
model = create_model(n_words)

# fit the network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)

# evaluate
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %.2f' % (acc * 100))

# test positive text
text = 'Best movie ever! It was great, I recommend it.'
percent, sentiment = predict_sentiment(text, vocabulary, tokenizer, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent * 100))

# test negative text
text = 'This is a bad movie.'
percent, sentiment = predict_sentiment(text, vocabulary, tokenizer, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent * 100))
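One practical note: re-fitting the tokenizer and retraining the network every time you want to score a new review is wasteful. Persistence is not part of the listing above, but a minimal sketch of saving and reloading both artifacts might look as follows; the file names model.h5 and tokenizer.pkl are arbitrary choices of mine, and model.save requires the h5py package.

import pickle
from keras.models import load_model

# save the trained model and the fitted tokenizer
model.save('model.h5')
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle)

# later, in a fresh session: reload both and score new reviews
# (the vocabulary set can be re-loaded from vocabulary.txt as in the listing)
model = load_model('model.h5')
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)
percent, sentiment = predict_sentiment('What a wonderful film.',
                                       vocabulary, tokenizer, model)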

Summary

I hope you enjoyed this read. We have seen that a simple MLP model does not provide fully satisfactory results: it looks good on the held-out test data (92.50% accuracy), but once you start using it on real-world reviews, accuracy drops noticeably, as the modest confidence on the two sample reviews above already hints. In the next story we will continue exploring different models, techniques, and network architectures to improve the accuracy of the sentiment analysis model for movie reviews.

