Text Classification with scikit-learn on Khmer Documents

https://www.flickr.com/photos/mikecogh/12083974574

Steps

  1. Data loading and tagging — parse data using segmentation on Khmer text
  2. Extract features from the document using TFIDF
  3. Run a few different ML algorithms and compare the results
  4. Save the chosen model and load it to run it in production

1. Data Loading and Tagging

select id, title, body, category from dbo.article 
WHERE category IS NOT NULL
(docIds, doc_titles, doc_contents, categories) = getArticles();
របស់យើងប្រសើរជាងមុន។ => របស់ យើង ប្រសើរ ជាង មុន។

2. Extracting Features

from sklearn.feature_extraction.text import TfidfVectorizer

def tokenizersplit(text):
    """Tokenize pre-segmented Khmer text by splitting on whitespace.

    The Khmer documents are word-segmented upstream (spaces inserted
    between words), so a plain whitespace split is sufficient here.
    """
    return text.split()

# Build and fit the TF-IDF feature extractor, then transform the corpus.
# (The original pasted two statements onto one line and mixed the names
# `tfidf` and `tfidf_vect`; unified to `tfidf` here.)
tfidf = TfidfVectorizer(
    tokenizer=tokenizersplit,  # our whitespace tokenizer instead of the default
    encoding='utf-8',          # handle Khmer Unicode characters
    min_df=2,                  # a term must appear in at least 2 documents
    ngram_range=(1, 2),        # extract unigrams and bigrams
    max_features=25000,        # cap the vocabulary size
)
tfidf.fit(df['text'])
tfidf.transform(df['text'])
  • encoding: is set to ‘utf-8’ to handle Khmer Unicode characters
  • min_df: ignore terms that occur in fewer documents than the given value. A value of 2 means a term must appear in at least 2 documents to be counted.
  • ngram_range: ngram you want to extract (more detail below)
  • max_features: max number of features
Naive Bayes accuracy:         0.63, 0.63
Logistic Regression accuracy: 0.90, 0.96
SVM accuracy: 0.50, 0.50
Random Forest accuracy: 0.83, 0.96
Naive Bayes accuracy:         0.60, 0.60
Logistic Regression accuracy: 0.77, 0.80
SVM accuracy: 0.50, 0.50
Random Forest accuracy: 0.83, 0.93

3. Evaluate Performance with Different Classifiers

Accuracy on different algorithms on 3 different training size

Performance analysis

from sklearn import model_selection, linear_model, metrics

# Split the dataset into training and validation sets (65/35).
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    df['text'], df['cat'], test_size=0.35, random_state=0)

# Train logistic regression on the TF-IDF features.
# NOTE(review): xtrain_tfidf / xvalid_tfidf are assumed to be the TF-IDF
# transforms of train_x / valid_x computed earlier — confirm against the
# training-process section below.
m = linear_model.LogisticRegression()
m.fit(xtrain_tfidf, train_y)
y_pred = m.predict(xvalid_tfidf)

# Per-class precision/recall/F1 on the validation set.
# (The original line was truncated mid-call; closing parens restored.)
print(metrics.classification_report(valid_y, y_pred))
'''
## 85 docs for training, 15 docs for validation set
precision recall f1-score support

no_accident 0.93 1.00 0.96 13
accident 1.00 0.92 0.96 13

avg / total 0.96 0.96 0.96 26

## 65 docs on training, 35 docs for validation set
precision recall f1-score support

no_accident 0.80 1.00 0.89 20
accident 1.00 0.71 0.83 17

avg / total 0.89 0.86 0.86 37
'''

4. Save the Model and Load It to Run in Production

import pickle

# --- Save the fitted TF-IDF vectorizer ---
tfidf = TfidfVectorizer(tokenizer=tokenizersplit, encoding='utf-8')
tfidf.fit(df.text)
with open('feature_100.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# --- Train and save the final model ---
model = linear_model.LogisticRegression()
model.fit(features, labels)
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

# --- Load both back (e.g. in production) ---
# tokenizersplit must be defined before unpickling the vectorizer: the
# pickle stores only a reference to the function, not its code.
def tokenizersplit(text):
    """Whitespace tokenizer required to unpickle the TF-IDF vectorizer."""
    return text.split()

with open('feature_100.pkl', 'rb') as f:
    tfidf = pickle.load(f)
...
with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

Step-by-Step Detail of the Training Process

  1. Load training documents
# 1. Load training documents and word-segment the Khmer text.
(docIds, docTitles, docBodies, categories) = getArticles()
(token_bodies, token_titles) = tokenizeDocs(docTitles, docBodies)

# Concatenate title and body with a space into one text field per document.
# (range() — the original xrange is Python 2 only.)
tokenText = [token_titles[i] + " " + token_bodies[i] for i in range(len(token_titles))]

import pandas as pd

# Column name is the string 'id' — the original passed the builtin id
# function as the dict key, producing a column literally keyed on it.
df = pd.DataFrame({'id': docIds})
df['text'] = tokenText
df['cat'] = categories

# 2. Fit TF-IDF and persist it for reuse in production.
tfidf = TfidfVectorizer(tokenizer=tokenizersplit, encoding='utf-8')
tfidf.fit(df.text)

import pickle
with open('feature_100.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

features = tfidf.transform(df.text)

from sklearn import model_selection, preprocessing

# 3. Split the dataset into training and validation sets (70/30).
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    df['text'], df['cat'], test_size=0.30, random_state=1)

# 4. Label-encode the target variable. Fit on the training labels only,
# then reuse the SAME encoder for validation — refitting (as the original
# did) could remap labels to different integers.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(valid_y)

# TF-IDF features for both splits (original used the undefined name
# `tfidf_vect` here; unified to `tfidf`).
xtrain_tfidf = tfidf.transform(train_x)
xvalid_tfidf = tfidf.transform(valid_x)
from sklearn import metrics, linear_model, naive_bayes, svm, ensemble
import xgboost  # third-party gradient boosting, not part of scikit-learn

def train_model(classifier, trains, t_labels, valids, v_labels):
    """Fit *classifier* on the training data and return validation accuracy.

    Args:
        classifier: any estimator with fit/predict (sklearn-style API).
        trains: training feature matrix.
        t_labels: training labels.
        valids: validation feature matrix.
        v_labels: validation labels.

    Returns:
        Accuracy score (fraction of correct predictions) on the validation set.
    """
    # Fit the training dataset on the classifier.
    classifier.fit(trains, t_labels)

    # Predict the labels on the validation dataset.
    predictions = classifier.predict(valids)

    # (y_true, y_pred) order per the sklearn convention; the result is the
    # same either way for plain accuracy.
    return metrics.accuracy_score(v_labels, predictions)
# Compare several classifiers on the same train/validation split.
# (print() calls — the original used Python 2 print statements.)

# Naive Bayes
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf, valid_y)
print("NB accuracy: ", accuracy)   # 94%, 65%, 60%, 60%

# Logistic Regression
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf, valid_y)
print("LR accuracy: ", accuracy)   # 96%, 84%, 94%, 100%, 97%

# SVM
accuracy = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf, valid_y)
print("SVM accuracy: ", accuracy)  # 54%, 48%, 48%

# Random Forest
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf, valid_y)
print("RF accuracy: ", accuracy)   # 94%, 97%, 94%, 85%

# Extreme Gradient Boosting (xgboost, not from scikit-learn); the sparse
# matrices are converted to CSC format for xgboost.
# FIX: the original call dropped the valid_y argument entirely.
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc(), valid_y)
print("Xgb accuracy: ", accuracy)  # 82%, 91%, 92%

# Convert cat ("accident"/"no_accident") into category_id 0/1.
df['category_id'] = df['cat'].factorize(sort=True)[0]
labels = df.category_id

# Train the final model on ALL documents and persist it.
features = tfidf.transform(df.text)
model = linear_model.LogisticRegression()
model.fit(features, labels)

import pickle
with open("model.pkl", 'wb') as f:
    pickle.dump(model, f)

Step-by-Step Guide to Running in Production

# 1. Fetch new (unlabeled) articles and word-segment the Khmer text.
(all_documents, doctitles, docIds) = getNewArticles()
(tokenized_documents, tokenized_document_title) = tokenizeDocs(all_documents, doctitles)

# Concatenate title and body with a space into one text field per document,
# using the TOKENIZED text so features match the segmented training data
# (the original concatenated the raw, unsegmented strings here).
tokenText = [tokenized_document_title[i] + " " + tokenized_documents[i]
             for i in range(len(doctitles))]

import pandas as pd

# 'id' string column key — the original used the builtin id as the key.
df = pd.DataFrame({'id': docIds})
df['text'] = tokenText

# Needed to unpickle feature_100.pkl: the pickle references this function
# by name, so it must be defined before loading.
def tokenizersplit(text):
    """Whitespace tokenizer used when the TF-IDF vectorizer was fitted."""
    return text.split()

import pickle  # moved above first use (original imported it after pickle.load)

# Load the fitted TF-IDF vectorizer and transform the new documents.
with open('feature_100.pkl', 'rb') as f:
    tfidf = pickle.load(f)
features = tfidf.transform(df.text)

# Load the trained classifier and tag each document.
with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
y_pred = loaded_model.predict(features)
df['tag'] = y_pred
print(df[['id', 'tag']])

Conclusion

Updates

# 20% validation set
NB accuracy: 0.960784313725
LR accuracy: 0.941176470588
SVM accuracy: 0.666666666667
RF accuracy: 0.911764705882
Xgb accuracy: 0.980392156863
# 25% validation set
LR accuracy: 0.934210526316
SVM accuracy: 0.684210526316
RF accuracy: 0.960526315789
Xgb accuracy: 0.986842105263
# 30% validation set
NB accuracy: 0.947368421053
LR accuracy: 0.940789473684
SVM accuracy: 0.644736842105
RF accuracy: 0.953947368421
Xgb accuracy: 0.960526315789

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Phylypo Tum

Phylypo Tum

Software Engineer and ML Enthusiast