INSAID
Published in

INSAID

Extract data from any webpage using this technique!

In this article, we are going to take a look at how to scrape data from the web using BeautifulSoup and Selenium, and how to clean the resulting data for further analysis.

INTRODUCTION

Selenium is a browser-automation tool, whereas BeautifulSoup is not — it is a library for parsing HTML and XML documents.

APPLICATION

WEB SCRAPING JOB POSTINGS USING BEAUTIFULSOUP

from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
soup = BeautifulSoup(response.text,'html.parser')
#Fetching Job titles
req = soup.select('div h2[itemprop="name"]')
#Cleaning the data using list comprehension
titles = [r.text for r in req]
#getting rid of the pipe symbol
titles1 = [t.replace("|","") for t in titles]
#getting rid of any extra spaces
titles = [t.replace(" ", "") for t in titles1]
#fetching all the results that contains the employer name
results = soup.find_all('div', class_='jobCard_jobCard_cName__mYnow')
#Getting only the text from the results
cleanresults = [o.text for o in results]
sub_str = "Hiring"
#Splitting data based on condition and getting first element
companies = [o.split(sub_str)[0] for o in cleanresults]
#Collecting the Job Locations results
results = soup.find_all('div', class_='jobCard_jobCard_lists__fdnsc')
#fetching the text from the results collected
results = [l.div.text for l in results]
#replacing + sign with ,
locations = [l.replace("+", ",") for l in results]
#Using regex to get rid of any numbers
pattern = r'[0-9]'
locations = [re.sub(pattern, '', l) for l in locations]
#Fetching the experience requirements
experience = [l.find_all('div')[-1].text for l in loc]
#Gettig the number of vacancies
vacancies = soup.find_all('ul', class_='jobCard_jobCard_jobDetail__jD82J')
#Cleaning up the vacancies results
vac = [v.text.split("Positions")[0][-3:] for v in vacancies ]
vac = [v.replace('lar', '1') for v in vac]
strpattern = r'[a-z]'
vac = [re.sub(strpattern, '', l) for l in vac]
vacancies = [v.replace(' ','') for v in vac]
df = pd.DataFrame({'Titles':titles, 'Firm Name': companies, 
'Job Location':location, 'Experience':experience, 'Positions': vacancies})
df = df.drop_duplicates(subset=['Titles'])
df['Positions'] = df['Positions'].astype('int32')
#Creating a New Column 
df['Category'] = ['Fresher' if '0' in i else 'Experienced' for i in df['Experience']]

WEB SCRAPING AMAZON PRODUCTS USING SELENIUM

from selenium import webdriver
from time import sleep

driver_path = 'chromedriver.exe'
# Launch Chrome. NOTE(review): `executable_path` is removed in Selenium 4 —
# newer versions expect webdriver.Chrome(service=Service(driver_path)).
browser = webdriver.Chrome(executable_path=driver_path)
# Go to the Amazon India home page and maximize the window.
browser.get('https://www.amazon.in')
browser.maximize_window()

# Locate the search box and the search button.
input_search = browser.find_element_by_id('twotabsearchtextbox')
search_button = browser.find_element_by_xpath("(//input[@type='submit'])[1]")
# Type the query, give the page a moment, then run the search.
input_search.send_keys("Smartphones under 50000")
sleep(2)
search_button.click()

# Accumulators for the scraped columns.
products = []
prices = []
numReviews = []

# FIX: in the original listing every statement below sat at top level
# (the indentation was lost), so the per-page scraping and the pagination
# click were not actually inside the loop. They belong in the loop body.
for i in range(10):
    print('Scraping page', i + 1)
    # Collect the product names, prices and review counts on this page.
    product = browser.find_elements_by_xpath(
        "//span[@class='a-size-medium a-color-base a-text-normal']")
    price = browser.find_elements_by_xpath("//span[@class='a-price-whole']")
    numReview = browser.find_elements_by_xpath(
        "//span[@class='a-size-base s-underline-text']")
    # Pull the text out of each located element.
    for p in product:
        products.append(p.text)
    for pr in price:
        prices.append(pr.text)
    for n in numReview:
        numReviews.append(n.text)
    # Advance to the next results page.
    # NOTE(review): this locator targets the whole pagination strip, not the
    # "Next" link specifically — confirm it actually advances; a more precise
    # locator would be //a[contains(@class,'s-pagination-next')].
    next_button = browser.find_element_by_xpath(
        "//span[@class='s-pagination-strip']")
    next_button.click()
    # Wait for the next page to load before scraping it.
    sleep(2)

# Close the browser once all pages have been scraped.
browser.quit()
# Three parallel columns of raw scraped text, ready for a DataFrame.
data = [products, prices, numReviews]
#Creating a dataframe
import pandas as pd

# Build a DataFrame from the three scraped lists (transpose so each list
# becomes a column rather than a row).
df = pd.DataFrame(data).T
df.columns = ['Products', 'Prices', 'NumReviews']
# Rows where any of the three source lists was shorter end up as NaN — drop them.
df.dropna(inplace=True)
# The first word of a product title is taken as the brand name.
df['Brand_Name'] = [i.split(' ')[0] for i in df.Products]
# Strip thousands separators, then convert the prices to integers.
df['Prices'] = [i.replace(',', '') for i in df.Prices]
df['Prices'] = df['Prices'].astype('int64')
# Same cleanup for the review counts. (FIX: the original comments wrongly
# said these two lines operated on the prices column.)
df['NumReviews'] = [i.replace(',', '') for i in df.NumReviews]
df['NumReviews'] = df['NumReviews'].astype('int64')

import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart: how many scraped phones each brand contributes.
df['Brand_Name'].value_counts().plot(kind='bar', figsize=(22, 7))
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.xlabel('Brands')
plt.ylabel('Count')
plt.title('Mobile Phones countplot, Brand Wise')
plt.show()

# Density estimate of the price distribution.
plt.figure(figsize=(22, 7))
sns.kdeplot(df['Prices'])
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.title('Mobile Prices distribution')
plt.show()

# Histogram of the review counts.
plt.figure(figsize=(22, 7))
sns.histplot(df['NumReviews'])
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.title('NumReviews distribution')
plt.show()

# Mean price per brand. (FIX: the original reused the KDE plot's title
# "Mobile Prices distribution" for this brand-level bar chart.)
plt.figure(figsize=(22, 7))
sns.barplot(x=df['Brand_Name'], y=df['Prices'])
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.title('Average Mobile Price, Brand Wise')
plt.show()

CONCLUSION

Final Thoughts and Closing Comments

--

--

INSAID is India’s leading powerhouse in Data Science & AI research and education. INSAID provides world-class programs and certifications to working professionals across 300+ companies https://www.insaid.co/.

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
INSAID

One of India’s leading institutions providing world-class Data Science & AI programs for working professionals with a mission to groom Data leaders of tomorrow!