MLearning.ai
Published in

MLearning.ai

Disastrous Tweets Classification using BERT

source
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
# Keras layer classes. The imported names are kept on the same statement as
# `import`; splitting them onto the next line without parentheses is a syntax error.
from tensorflow.keras.layers import Embedding, Dense, SpatialDropout1D, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import seaborn as sns
import matplotlib.pyplot as plt
# NOTE: `data` must already hold the tweets DataFrame at this point; the step
# that loads it is not shown in this listing.

# Column names of the dataset.
data.columns
# [out]>> Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

# Number of rows and columns.
data.shape
# [out]>> (7613, 5)  # dataset has 7613 rows and 5 columns

# Per-column non-null counts and dtypes. `info` has to be *called*; the bare
# attribute only evaluates to the bound method and prints no summary.
data.info()
# [out]>
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 7613 entries, 0 to 7612
# Data columns (total 5 columns):
#  #   Column    Non-Null Count  Dtype
# ---  ------    --------------  -----
#  0   id        7613 non-null   int64
#  1   keyword   7552 non-null   object
#  2   location  5080 non-null   object
#  3   text      7613 non-null   object
#  4   target    7613 non-null   int64
# dtypes: int64(2), object(3)
# memory usage: 297.5+ KB

data.describe()  # description of numerical data
figure 1
# Describe the object-dtype (text/categorical) columns themselves.
# Calling describe() on `data.dtypes[data.dtypes == 'object']` would summarise
# the dtype labels (count of object columns), not the values in those columns.
cat_data = data.select_dtypes(include='object').describe()
cat_data  # description of categorical features
figure 2
# Number of rows per class in the label column.
data['target'].value_counts()
# [out]>>
# 0    4342
# 1    3271
# Name: target, dtype: int64
# Bar chart of how many tweets fall into each target class.
plt.figure(figsize=(8, 6))
sns.set_style(style='darkgrid')
# Pass the vector as `x=`: recent seaborn releases treat the first positional
# argument of categorical plots as `data`, not as the x variable.
sns.countplot(x=data['target'])
plt.title('Disastrous and Non-Disastrous Tweets')
plt.show()
figure 3
# Pie chart of the share of each target class.
plt.figure(figsize=(6, 8))
sns.set_style("darkgrid")
# This is a separate statement; it must not share a line with set_style().
data['target'].value_counts().plot.pie(autopct='%0.2f%%')
plt.title("Percentage Contribution")
plt.xlabel("percent contribution")
plt.ylabel("target")
plt.show()
figure 4
import preprocess_kgptalkie as akhil
# Build the feature frame; the columns read from it below include
# `char_counts` and `stopwords_counts`.
df = akhil.get_basic_features(data)
df.head()
# Density of the `char_counts` feature over all tweets.
# `fill=` replaces the deprecated `shade=` keyword of kdeplot.
sns.kdeplot(df['char_counts'], fill=True, color='green')
plt.show()
figure 5
# Overlay the `char_counts` density of the two target classes:
# red for target == 1, green for target == 0.
plt.figure(figsize=(6, 8))
# `fill=` replaces the deprecated `shade=` keyword of kdeplot.
sns.kdeplot(df.loc[df['target'] == 1, 'char_counts'], color='red', fill=True)
sns.kdeplot(df.loc[df['target'] == 0, 'char_counts'], color='green', fill=True)
plt.show()
Figure 6
# Distribution of stop-word counts for both classes.
# Read from `df`, the frame returned by get_basic_features(), and name both
# axes explicitly: recent seaborn releases no longer accept x positionally.
sns.boxplot(x=df['target'], y=df['stopwords_counts'])
plt.show()
figure 7

# Let's see how the hashtag counts are spread in both situations.
# NOTE(review): the feature column is spelled `hashtag_counts` here (the
# original `hastag_counts` looks like a typo) - confirm against df.columns.
sns.violinplot(x=df['target'], y=df['hashtag_counts'])
plt.show()
figure 8
# Word frequencies over the whole `text` column; the first 20 entries are plotted.
# NOTE(review): assumes the result is ordered from most to least frequent - confirm.
freq_occuring = akhil.get_word_freqs(data, 'text')
top_20 = freq_occuring[:20]
# x/y are passed by keyword: recent seaborn releases reject them positionally.
sns.barplot(x=top_20.index, y=top_20.values)
plt.xticks(rotation=70)
plt.show()
figure 9
# Least 20 occurring words.
# Take the *last* 20 entries; `[:20]` would repeat the top-20 chart.
# NOTE(review): assumes `freq_occuring` is ordered from most to least frequent - confirm.
least_20 = freq_occuring[-20:]
# x/y are passed by keyword: recent seaborn releases reject them positionally.
sns.barplot(x=least_20.index, y=least_20.values)
plt.xticks(rotation=70)
plt.show()
figure 10
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Shared Porter stemmer instance; `cleaner` below calls `ps.stem`.
ps=PorterStemmer()
# Make the NLTK stop-word corpus available locally.
nltk.download('stopwords')
# NOTE(review): this rebinds `stopwords` from the imported corpus module to a
# set of English stop words, so running this line a second time in the same
# session would fail (a set has no `.words`). The resulting set is not
# referenced anywhere else in the visible code.
stopwords=set(stopwords.words('english'))
def cleaner(text, stem=None):
    """Normalise one tweet into a space-separated string of stemmed words.

    Steps, in order:
      1. drop every token that contains a digit (alphanumeric words),
      2. replace every remaining non-letter character with a space,
      3. lower-case the text,
      4. stem each word individually,
      5. keep only words longer than two characters.

    Args:
        text: The raw tweet text.
        stem: Optional callable mapping a single word to its stem. When
            omitted, the module-level Porter stemmer ``ps.stem`` is used.

    Returns:
        The cleaned words joined by single spaces ("" if nothing survives).
    """
    if stem is None:
        # Resolved at call time so the module-level stemmer is picked up.
        stem = ps.stem

    # Must run before the non-letter pass below: once digits have been turned
    # into spaces there are no alphanumeric tokens left to match.
    cleaned = re.sub(r'\w*\d\w*', " ", text)
    # Punctuation, URL separators, symbols etc. all become spaces.
    cleaned = re.sub(r'[^a-zA-Z]', " ", cleaned)
    cleaned = cleaned.lower()

    # Stem word by word; a stemmer given the whole sentence treats it as one
    # token. `split()` with no argument also discards surrounding whitespace.
    stemmed = (stem(word) for word in cleaned.split())
    return " ".join(word for word in stemmed if len(word) > 2)
# Overwrite the text column with the cleaned version of every tweet.
data['text'] = data['text'].apply(cleaner)
# Let's check some of the cleaned text: the first ten rows.
data['text'].iloc[:10]
figure 11
from wordcloud import WordCloud,STOPWORDS
# Word frequencies restricted to the rows with target == 1.
dataset = akhil.get_word_freqs(data[data['target'] == 1], 'text')
print(dataset.index)
# Size each word by its frequency. Joining the unique words into one string
# and calling generate() would throw the counts away (every word occurs once).
# NOTE(review): assumes `dataset` is a pandas Series mapping word -> count - confirm.
word_cloud = WordCloud(max_font_size=60, background_color='white').generate_from_frequencies(
    dataset.to_dict()
)
plt.imshow(word_cloud)
plt.axis('off')
plt.show()
figure 12
from wordcloud import WordCloud,STOPWORDS
# Word frequencies restricted to the rows with target == 0.
dataset = akhil.get_word_freqs(data[data['target'] == 0], 'text')
print(dataset.index)
# Size each word by its frequency. Joining the unique words into one string
# and calling generate() would throw the counts away (every word occurs once).
# NOTE(review): assumes `dataset` is a pandas Series mapping word -> count - confirm.
word_cloud = WordCloud(max_font_size=60, background_color='white').generate_from_frequencies(
    dataset.to_dict()
)
plt.imshow(word_cloud)
plt.axis('off')
plt.show()
figure 13
import ktrain
from ktrain import text
# Prepare the data for BERT. The call returns the training pair, the
# validation pair and the preprocessor; the whole unpacking target has to sit
# on the assignment line, otherwise the names are read before they exist.
(x_train, y_train), (x_test, y_test), preprocess = text.texts_from_df(
    data,
    text_column='text',
    label_columns='target',
    maxlen=50,
    preprocess_mode='bert',
)

# BERT classifier built for the preprocessed training data.
model = text.text_classifier(
    'bert',
    train_data=(x_train, y_train),
    preproc=preprocess,
)

# Learner wrapping the model together with its training and validation data.
learner = ktrain.get_learner(
    model,
    train_data=(x_train, y_train),
    val_data=(x_test, y_test),
    batch_size=64,
)

# ktrain's one-cycle training entry point is `fit_onecycle`
# (`for_onecycle` does not exist on the learner).
learner.fit_onecycle(lr=1e-5, epochs=4)
Figure 14
# Predictor bundling the trained model with its preprocessor so that raw
# strings can be passed in directly.
prediction = ktrain.get_predictor(learner.model, preprocess)
# A dedicated name keeps the `data` DataFrame from being overwritten by the sample.
sample_tweets = ["US did this! Loudly crying faceLoudly crying face"]
prediction.predict(sample_tweets)
# [out]>> 'not-Disastrous'

--

--

Data scientists must think like artists when searching for a solution and writing a piece of code. ⚪️ Artists enjoy working on interesting problems, even if there is no obvious answer ⚪️ linktr.ee/mlearning 🔵 Follow to join our 28K+ Unique DAILY Readers 🟠

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store