Deep Learning — Natural Language Processing (Part V-c)

Dejan Jovanovic
Jul 3 · 6 min read
# (the imports used by these snippets appear in the complete listing at the end)
vocabulary_fileName = "vocabulary.txt"

# load a file into memory
def load_document(fileName):
    # open the file as read only and read all text
    with open(fileName, 'r') as file:
        text = file.read()
    return text


# load the vocabulary
vocabulary = load_document(vocabulary_fileName)
vocabulary = set(vocabulary.split())


# clean and tokenize a raw document
def clean_tokens(document):
    # split document into tokens by white space
    tokens = document.split()
    # remove punctuation from each token
    remove_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [remove_punctuation.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    # filter out single-character tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens
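
As a quick illustration (the sample sentence and expected output are mine, not from the original article), punctuation, stop words, and one-character tokens are stripped. Note that the tokens are not lower-cased, so a capitalized "This" slips past the lowercase stop-word list:

sample = "This movie wasn't great, but the acting was superb!"
print(clean_tokens(sample))
# ['This', 'movie', 'wasnt', 'great', 'acting', 'superb']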


# load a document, clean it and return a line of tokens
def document_to_line(fileName, vocabulary):
    # load the document
    document = load_document(fileName)
    # clean the document
    tokens = clean_tokens(document)
    # keep only tokens that are in the vocabulary
    tokens = [x for x in tokens if x in vocabulary]
    return ' '.join(tokens)


# process all documents in a folder
def process_documents(directory, vocabulary, isTraining):
    lines = list()
    # go over all files in the directory
    for fileName in listdir(directory):
        # when training, skip reviews held out for the test set;
        # when testing, skip everything else
        if isTraining and fileName.startswith('cv9'):
            continue
        if not isTraining and not fileName.startswith('cv9'):
            continue
        # create the full path of the file to be opened
        path = directory + '/' + fileName
        # load and clean the data
        line = document_to_line(path, vocabulary)
        # add to list
        lines.append(line)
    return lines
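
The split keys off the file names: in the review_polarity dataset each class folder holds 1,000 reviews named cv000_*.txt through cv999_*.txt, so holding back every name that starts with cv9 reserves 100 reviews per class for testing (a 90/10 split). A quick sanity check, assuming that standard layout:

# count the held-out negative reviews (illustrative check)
from os import listdir
neg_files = listdir('./review_polarity/txt_sentoken/neg')
held_out = [f for f in neg_files if f.startswith('cv9')]
print(len(neg_files), len(held_out))  # expected: 1000 100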


# load and clean a dataset
def load_clean_dataset(vocabulary, isTraining):
    # load documents
    negativeFeedback = process_documents('./review_polarity/txt_sentoken/neg', vocabulary, isTraining)
    positiveFeedback = process_documents('./review_polarity/txt_sentoken/pos', vocabulary, isTraining)
    documents = negativeFeedback + positiveFeedback
    # prepare labels in the same order: 0 = negative, 1 = positive
    labels = array([0 for _ in range(len(negativeFeedback))] + [1 for _ in range(len(positiveFeedback))])
    return documents, labels

# load all reviews
training_docs, ytrain = load_clean_dataset(vocabulary, True)
test_docs, ytest = load_clean_dataset(vocabulary, False)
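
Before the reviews can feed a Dense network they must become fixed-width vectors. The program uses the Keras Tokenizer with texts_to_matrix in 'binary' mode, which produces one column per known word (plus a reserved column 0) holding 1 if the word occurs in the document. A minimal sketch on a toy corpus of my own (not from the article):

from keras.preprocessing.text import Tokenizer

toy = Tokenizer()
toy.fit_on_texts(['best movie ever', 'bad movie'])
# word_index here: {'movie': 1, 'best': 2, 'ever': 3, 'bad': 4}
print(toy.texts_to_matrix(['bad movie'], mode='binary'))
# [[0. 1. 0. 0. 1.]]  -> the 'movie' and 'bad' columns are set

Training the network then prints the model summary and fit log shown below.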
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
dense_1 (Dense)              (None, 50)                1288450
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 51
=================================================================
Total params: 1,288,501
Trainable params: 1,288,501
Non-trainable params: 0
_________________________________________________________________
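
The dense_1 parameter count pins down the input width: a Dense layer with 50 units over n_words inputs carries 50 * n_words + 50 weights, so 1,288,450 parameters imply n_words = 25,768 (with Keras defaults column 0 is reserved, leaving 25,767 distinct words in the fitted index). A quick check:

# invert dense_1's parameter count: params = 50 * n_words + 50
n_words = (1288450 - 50) // 50
print(n_words)  # 25768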
Epoch 1/10
- 5s - loss: 0.4819 - acc: 0.7794
Epoch 2/10
- 3s - loss: 0.0702 - acc: 0.9911
Epoch 3/10
- 3s - loss: 0.0179 - acc: 1.0000
Epoch 4/10
- 3s - loss: 0.0072 - acc: 1.0000
Epoch 5/10
- 3s - loss: 0.0038 - acc: 1.0000
Epoch 6/10
- 3s - loss: 0.0022 - acc: 1.0000
Epoch 7/10
- 3s - loss: 0.0014 - acc: 1.0000
Epoch 8/10
- 3s - loss: 9.5990e-04 - acc: 1.0000
Epoch 9/10
- 3s - loss: 7.0107e-04 - acc: 1.0000
Epoch 10/10
- 3s - loss: 5.3340e-04 - acc: 1.0000
Test Accuracy: 92.50
Review: [Best movie ever! It was great, I recommend it.]
Sentiment: POSITIVE (56.636%)
Review: [This is a bad movie.]
Sentiment: NEGATIVE (66.628%)
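
Training accuracy saturates at 100% by epoch 3 while the held-out accuracy is 92.50%, so the network is memorizing the 1,800 training reviews. One optional refinement, not in the original program, is early stopping on a validation split; a sketch with illustrative settings:

from keras.callbacks import EarlyStopping

# hold out 10% of the training matrix and stop training
# once validation loss stops improving
early_stop = EarlyStopping(monitor='val_loss', patience=2)
model.fit(Xtrain, ytrain, epochs=10, verbose=2,
          validation_split=0.1, callbacks=[early_stop])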
The complete source code for this example is listed below.

from os import listdir
from nltk.corpus import stopwords
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense
import re
import string

vocabulary_fileName = "vocabulary.txt"

# load a file into memory
def load_document(fileName):
    # open the file as read only and read all text
    with open(fileName, 'r') as file:
        text = file.read()
    return text

# clean and tokenize a raw document
def clean_tokens(document):
    # split document into tokens by white space
    tokens = document.split()
    # remove punctuation from each token
    remove_punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [remove_punctuation.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    # filter out single-character tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

# load a document, clean it and return a line of tokens
def document_to_line(fileName, vocabulary):
    # load the document
    document = load_document(fileName)
    # clean the document
    tokens = clean_tokens(document)
    # keep only tokens that are in the vocabulary
    tokens = [x for x in tokens if x in vocabulary]
    return ' '.join(tokens)

# process all documents in a folder
def process_documents(directory, vocabulary, isTraining):
    lines = list()
    # go over all files in the directory
    for fileName in listdir(directory):
        # when training, skip reviews held out for the test set;
        # when testing, skip everything else
        if isTraining and fileName.startswith('cv9'):
            continue
        if not isTraining and not fileName.startswith('cv9'):
            continue
        # create the full path of the file to be opened
        path = directory + '/' + fileName
        # load and clean the data
        line = document_to_line(path, vocabulary)
        # add to list
        lines.append(line)
    return lines


# load and clean a dataset
def load_clean_dataset(vocabulary, isTraining):
    # load documents
    negativeFeedback = process_documents('./review_polarity/txt_sentoken/neg', vocabulary, isTraining)
    positiveFeedback = process_documents('./review_polarity/txt_sentoken/pos', vocabulary, isTraining)
    documents = negativeFeedback + positiveFeedback
    # prepare labels in the same order: 0 = negative, 1 = positive
    labels = array([0 for _ in range(len(negativeFeedback))] + [1 for _ in range(len(positiveFeedback))])
    return documents, labels

# fit a tokenizer on the training documents
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# define and compile the model
def create_model(n_words):
    # define network: one hidden layer over the bag-of-words input
    model = Sequential()
    model.add(Dense(50, input_shape=(n_words,), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize defined model
    model.summary()
    return model

# classify a raw review as positive or negative
def predict_sentiment(review, vocabulary, tokenizer, model):
    # clean the review text
    tokens = clean_tokens(review)
    # filter by vocabulary
    tokens = [w for w in tokens if w in vocabulary]
    # convert to line
    line = ' '.join(tokens)
    # encode
    encoded = tokenizer.texts_to_matrix([line], mode='binary')
    # predict sentiment: the sigmoid output is the probability of the positive class
    prediction = model.predict(encoded, verbose=0)
    percent_pos = prediction[0, 0]
    if round(percent_pos) == 0:
        return (1 - percent_pos), "NEGATIVE"
    return percent_pos, "POSITIVE"

# load the vocabulary
vocabulary = load_document(vocabulary_fileName)
vocabulary = set(vocabulary.split())

# Vocabulary size
print("Vocabulary size: ", len(vocabulary))

# load all reviews
training_docs, ytrain = load_clean_dataset(vocabulary, True)
test_docs, ytest = load_clean_dataset(vocabulary, False)

# create the tokenizer
tokenizer = create_tokenizer(training_docs)
# encode data
Xtrain = tokenizer.texts_to_matrix(training_docs, mode='binary')
Xtest = tokenizer.texts_to_matrix(test_docs, mode='binary')

# define the model
n_words = Xtrain.shape[1]

print("n_words:", n_words)

model = create_model(n_words)
# fit the network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)

# evaluate
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %.2f' % (acc*100))

# test positive text
text = 'Best movie ever! It was great, I recommend it.'
percent, sentiment = predict_sentiment(text, vocabulary, tokenizer, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

# test negative text
text = 'This is a bad movie.'
percent, sentiment = predict_sentiment(text, vocabulary, tokenizer, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))
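
As a further check, any string can be run through the same helper. The review below is a hypothetical example of my own; the score depends on the trained weights, so no output is shown:

text = 'The plot was thin but the acting was wonderful.'
percent, sentiment = predict_sentiment(text, vocabulary, tokenizer, model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))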

Summary

This part completed the movie review sentiment analyzer: reviews were cleaned, filtered against the prepared vocabulary, encoded as binary bag-of-words vectors with the Keras Tokenizer, and fed to a small two-layer Dense network. The model reached 92.50% accuracy on the held-out cv9 reviews and classified both sample sentences correctly.


