This was very helpful in getting me started. I wanted to work with a corpus using data stored in a .csv file (it could be the result of a database query as well). Here are my modifications.
# Train and evaluate a bag-of-words Naive Bayes text classifier on data
# loaded from a CSV file.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Author's note: text in column 1, classifier (label) in column 2.
data = pd.read_csv('your.csv')

# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the
# supported way to obtain the underlying values as an ndarray.
numpy_array = data.to_numpy()
X = numpy_array[:, 0]  # first column: documents
Y = numpy_array[:, 1]  # second column: labels

# Hold out 40% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.4, random_state=42)

# Token counts (English stop words removed) -> TF-IDF weighting ->
# multinomial Naive Bayes.
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf = text_clf.fit(X_train, Y_train)

predicted = text_clf.predict(X_test)
# Fraction of held-out rows whose predicted label equals the true label.
# Left as a bare expression so a notebook/REPL displays the value.
np.mean(predicted == Y_test)