Published in MLearning.ai

Sentiment Classification Using DistilBERT on a Custom Dataset

import requests
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen as uReq
import numpy as np
import pandas as pd
print("Enter the product of your own choice")
product_name=input()
web_link='https://www.flipkart.com/search?q='
url=web_link+product_name.replace(" ","")
url_client=uReq(url) #client requesting access to the url from the server
page_html=url_client.read() #reading the page content
url_client.close() #closing the connection
soup=bs(page_html,'html.parser')
data_class=soup.find_all('div',{"class":"_1AtVbE col-12-12"})
data=data_class[0] #taking the first product from the complete list shown on the UI
#list of things I am going to scrape from the website
product_names=[] #name of product
comment_header=[] #comment heading
comments=[] #full description of comment
ratings=[] #rating of product
user_name=[] #person who has commented about the product
likes=[] #people agreeing with the comment
dislikes=[] #people who don't agree with the comment
region=[] #location from where the user has commented
import requests
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
search_product=input()
url="https://www.flipkart.com/search?q="+search_product.replace(" ","")
uClient=uReq(url) #opening the url
page_html=uClient.read() #reading the page
uClient.close() #closing the connection -- a must when scraping
soup=BeautifulSoup(page_html,"html.parser") #parsing the html of the search results page
ratings=[]
headings=[]
user_comments=[]
names=[]
likes_count=[]
region=[]
n_data=soup.find_all("div",class_="_1AtVbE col-12-12") #finding all the products present on the requested page
data=n_data[2]
for page in range(1, 45):
    #product link taken from the search page; the page query parameter walks through successive review pages
    click_data = "https://www.flipkart.com" + data.div.div.div.a["href"] + "&page=" + str(page)
    req = requests.get(click_data)
    soup_next = BeautifulSoup(req.text, "html.parser")
    comment_boxes = soup_next.find_all("div", class_="col _2wzgFH")
    for comment in comment_boxes:
        rating = comment.find("div", {"class": "_3LWZlK _1BLPMq"})
        if rating is not None:  # if a rating is present
            ratings.append(float(rating.text))
        else:  # if the rating is missing, add a null value
            ratings.append(np.nan)
        heading = comment.find("p", class_="_2-N8zT")
        if heading is not None:
            headings.append(heading.text)
        else:
            headings.append(np.nan)
        user_comment = comment.find("div", class_="t-ZTKy")
        if user_comment is not None:
            user_comments.append(user_comment.text)
        else:
            user_comments.append(np.nan)
        name = comment.find("p", class_="_2sc7ZR _2V5EHH")
        if name is not None:
            names.append(name.text)
        else:
            names.append(np.nan)
        like_count = comment.find("span", class_="_3c3Px5")
        if like_count is not None:
            likes_count.append(like_count.text)
        else:
            likes_count.append(np.nan)
    region.extend(p.text for p in soup_next.find_all('p',{"class":"_2mcZGG"})) #location tags for the reviews on this page
dict1={"header":headings,"comments":user_comments,"user name":names,"number of likes":likes_count,"ratings":ratings} #saving the scraped columns into a dictionary
df=pd.DataFrame(dict1)
df.to_csv("final_result1.csv")
data=pd.read_csv('custom_data.csv')
data.head()
figure 1
data.shape #number of rows and column in dataset
[out]>> (1850, 5)
#convert star ratings into binary sentiment labels: 0 for ratings above 3, 1 otherwise
def rating_into_sentiment(x):
    if x>3:
        return 0
    else:
        return 1
data['ratings']=data['ratings'].apply(rating_into_sentiment)
data.head(10)
figure 2
data.isnull().values.any()
[out]>> False
from utils import char_count
from utils import word_count
from utils import stop_word_count
from utils import email_removal
from utils import mention_count
from utils import numeric_digit_count
from utils import upper_case_count
from utils import lower_case_conversion
from utils import cont_to_exp
from utils import remove_mul_space,spelling_correction,remove_spec_char
from utils import remove_stop_words,remove_ac_char
from utils import base_root_form,remove_common_words,remove_rare_words
data['cont_to_exp']=data['comments'].apply(cont_to_exp)
data['upper_case_count']=data['comments'].apply(upper_case_count)
data['stop_word_count']=data['comments'].apply(stop_word_count)
data['word_count']=data['comments'].apply(word_count)
data['char_count']=data['comments'].apply(char_count)
data['comments']=data['comments'].apply(remove_mul_space)
data['comments']=data['comments'].apply(remove_spec_char)
data['comments']=data['comments'].apply(remove_stop_words)
data['comments']=data['comments'].apply(remove_ac_char)
data['comments']=data['comments'].apply(base_root_form)
figure 3
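The text-cleaning helpers imported from utils above come from the author's own helper module, which is not shown in the article. A minimal sketch of what two of them might look like, assuming NLTK's English stopword list; the names are taken from the imports, but the bodies are assumptions rather than the author's actual code:

#hypothetical sketch of two helpers from the utils module -- not the original implementation
from nltk.corpus import stopwords  # assumes nltk.download('stopwords') has been run

STOP_WORDS = set(stopwords.words('english'))

def char_count(text):
    #count characters, ignoring spaces
    return len(str(text).replace(" ", ""))

def remove_stop_words(text):
    #keep only tokens that are not common English stop words
    return " ".join(w for w in str(text).split() if w.lower() not in STOP_WORDS)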
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
sns.set_style('darkgrid')
sns.kdeplot(data[data['ratings']==0]['number of likes'],shade=True,color='red')
sns.kdeplot(data[data['ratings']==1]['number of likes'],shade=True,color='green')
plt.title("like affecting the sentiment")
plt.show()
figure 5
plt.figure(figsize=(6,8))
sns.set_style("ticks")
data['ratings'].value_counts().plot.pie(autopct='%0.2f%%')
plt.title("Percentage Contribution")
plt.xlabel("percent contribution")
plt.ylabel("target")
plt.show()
figure 6
sns.set_style('darkgrid')
sns.kdeplot(data[data['ratings']==0]['char_count'],shade=True,color='red')
sns.kdeplot(data[data['ratings']==1]['char_count'],shade=True,color='green')
plt.title("like affecting the sentiment")
plt.show()
figure 7
sns.set_style('darkgrid')
sns.countplot(data['ratings'])
plt.show()
figure 8
import ktrain
from ktrain import text
from sklearn.model_selection import train_test_split
data_train,data_test=train_test_split(data,test_size=0.2,random_state=1)
#tokenize the comments and build DistilBERT-compatible train/validation sets
train,test,preprocess=text.texts_from_df(data_train,
                                         text_column='comments',
                                         label_columns='ratings',
                                         preprocess_mode='distilbert',
                                         maxlen=100)
model=text.text_classifier('distilbert',train_data=train,preproc=preprocess,verbose=1)
learner=ktrain.get_learner(model,train_data=train,val_data=test,batch_size=32)
learner.fit_onecycle(lr=2e-5,epochs=4)
figure 9
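After training, ktrain's learner can also print a classification report on the validation data; a short sketch assuming the train, test and preprocess objects created above (learner.validate and preprocess.get_classes() are ktrain helpers):

#print precision, recall and F1 on the validation split
learner.validate(class_names=preprocess.get_classes())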
predictor=ktrain.get_predictor(learner.model,preprocess)
from google.colab import drive
drive.mount('/content/drive')
predictor.save('/content/drive/MyDrive/distilbert')
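Once saved, the predictor can be reloaded in a later session without retraining; a minimal sketch assuming the same Drive path used above:

#reload the saved DistilBERT predictor from Google Drive in a new session
reloaded_predictor = ktrain.load_predictor('/content/drive/MyDrive/distilbert')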
data='''Let me get straight to the point.From day 1 this phone hangs.multitasking performance is very bad.. sometimes wifi gets disconnected automatically..apps like whatsapp take 15 seconds to open.and flipkart 20-25 seconds.In the time of usb type c,we are getting micro usb.charging takes a long time..and 2 gb ram is not sufficient to run apps..phone is good for just to make calls..and do light works.not meant for multi tasking'''
def prediction(data):
    #predictor.predict on a single string returns one class name;
    #the string compared here must match one of predictor.get_classes()
    if predictor.predict(data)=='rating':
        return 'Positive Sentiment'
    else:
        return 'Negative Sentiment'
prediction(data)
figure 10
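The string compared inside prediction() has to match one of the class names ktrain generated from the ratings column, so it is worth checking them explicitly; a quick check using ktrain's get_classes() accessor:

#list the label names the predictor was trained with, so the comparison string
#in prediction() can be set to the one that marks positive reviews
print(predictor.get_classes())
#raw label predicted for the review above
print(predictor.predict(data))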
