Automated Web Parser In Python Using Azure Functions & Selenium

Swapnil Jindal
6 min read · Oct 4, 2023

--

What is an Azure Function?

Azure Functions is a serverless computing service provided by Microsoft Azure.

Steps To Create a Function App in Python

This tutorial will show how to build an Azure Function project in Python from scratch and part 2 will talk about deploying it on Azure using Docker, Azure Container Registry & Azure Pipelines.

I’ll be using VSCode as my IDE and MacOS as my OS.

  1. Open an empty directory in VSCode and run the following command from Terminal.
func init --worker-runtime python --docker -m V2

func init: Initializes the directory with a new function project.

--worker-runtime python: Sets the runtime language to Python.

--docker: Gives you a boilerplate Dockerfile that can be used to deploy to Functions App on Azure. We’ll be updating this later on in our series.

-m V2: Uses the v2 Python programming model.

I have function app version 4.0.5198 installed on my system

Once you run the command, your file structure will look something like

2. Initialize a new HTTP Trigger — open function_app.py and add the below code

@app.route(route="SearchAmazonProducts", auth_level=func.AuthLevel.ANONYMOUS)
def SearchAmazonProducts(req: func.HttpRequest) -> func.HttpResponse:
    """Minimal anonymous HTTP trigger used to verify the function app runs."""
    # Plain string literal: the original used an f-string with no placeholders.
    return func.HttpResponse("This HTTP triggered function executed successfully.")

3. Create a Python virtual environment and activate it. I am using python3.10

python3.10 -m venv .venv && source .venv/bin/activate # create and activate a new virtual environment

4. Start the function app

func start

You’d see the following output

Open the URL: http://localhost:7071/api/SearchAmazonProducts in a new browser tab and you’ll see the output as “This HTTP triggered function executed successfully.”

Parsing Amazon Using Selenium & Beautifulsoup

Let’s convert the code into an automated web parser using Selenium and BeautifulSoup libraries.

1. Open requirements.txt, and add the Python libraries we would need to make this happen. Your final requirements.txt would look like

azure-functions
beautifulsoup4
selenium
webdriver_manager

2. With your virtual environment still active, install the dependencies by running pip install.

pip3.10 install -r requirements.txt

3. You can also select your current Python interpreter by pressing CMD+SHIFT+P typing “Python: Select Interpreter” and choosing the Python version from the virtual environment we installed in the previous steps. This will ensure all the error squiggly lines are gone from your code.

4. Onto the action:

Add the following as imports

import logging
import re
import json

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

Function to initialize the Chrome driver


def get_chrome_driver():
    """Start a headless Chrome WebDriver configured for server-side scraping.

    Returns:
        A ready ``webdriver.Chrome`` instance, or ``None`` when the driver
        fails to start (the failure is logged with its traceback).
    """
    logging.info("Initializing chrome driver")

    try:
        # Desktop-browser UA string so Amazon serves the regular desktop markup.
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.198 Safari/537.36 Edg/95.0.1020.30"

        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")  # commonly required when running in containers
        chrome_options.add_argument("--disable-dev-shm-usage")  # avoid small /dev/shm inside Docker
        chrome_options.add_argument("--window-size=1920x1080")
        chrome_options.add_argument(f"user-agent={user_agent}")

        driver = webdriver.Chrome(options=chrome_options)
        logging.info("Chrome driver initialized successfully")
        return driver
    except Exception:
        # The original bare ``except`` also swallowed SystemExit/KeyboardInterrupt
        # and fell off the end, returning None implicitly. Log the traceback and
        # make the None return explicit so callers can check for it.
        logging.exception("Error initializing Chrome driver")
        return None

Function to get the page url’s source code

def get_page_source(driver, url):
    """Navigate *driver* to *url* and return the rendered page source.

    Args:
        driver: An initialized Selenium WebDriver.
        url: Absolute URL of the page to load.

    Returns:
        The page's HTML source as a string.
    """
    driver.get(url)

    # Read ``page_source`` once: the original accessed the property a second
    # time inside the log call, which issues another round-trip to the browser.
    # Lazy %-formatting avoids building the (very large) log string eagerly.
    source = driver.page_source
    logging.info("Amazon page source code: %s", source)

    return source

Function to parse the source code as received from the previous function

def parse_data_for_term(driver, query, pages_to_parse=1):
    """Scrape Amazon search results for *query* across result pages.

    Args:
        driver: Initialized Selenium WebDriver.
        query: Search term to query on amazon.com.
        pages_to_parse: Number of result pages to walk (default 1).

    Returns:
        A list of 5-tuples ``(asin, title, star_rating, review_count, price)``.
        Fields that could not be extracted remain empty strings.
    """
    data_items = []

    for page_num in range(pages_to_parse):
        url = "https://www.amazon.com/s?k=" + query + "&page=" + str(page_num + 1)
        try:
            content = get_page_source(driver, url)
            soup = BeautifulSoup(content, "html.parser")
            # find_all is the current name; findAll is a deprecated alias.
            all_products = soup.find_all("div", attrs={"class": "s-result-item"})

            for product in all_products:
                asin = product["data-asin"]
                title = ""
                star_rating = ""
                review_count = ""
                price = ""

                # Result divs without a data-asin are ads/spacers — skip them.
                if asin != "":
                    try:
                        item = product.find(
                            "a", {"href": re.compile(asin), "class": "a-text-normal"}
                        )
                        title = item.find("span").text.strip()
                        star_rating = (
                            product.find("i", {"class": "a-icon-star-small"})
                            .find("span")
                            .text.strip()
                            .split(" ")[0]
                        )
                        # The original passed a dict with a duplicate "href" key;
                        # Python keeps only the last one, so the ASIN pattern was
                        # silently discarded. Match the review link alone.
                        review_count = clean_text(
                            product.find("a", {"href": re.compile("Reviews")})
                            .find("span")
                            .text.strip()
                            .split(" ")[0]
                        )
                        price = (
                            product.find("span", {"class": "a-price"})
                            .find("span")
                            .text.strip()
                        )
                    except Exception as exc:
                        # A missing sub-element (find() -> None) leaves the
                        # remaining fields as "" for this product.
                        logging.error("Failed to extract product fields: %s", exc)

                    data_items.append((asin, title, star_rating, review_count, price))
        except Exception as error:
            # A failed page must not discard results already collected: the
            # original returned a malformed 4-tuple here, which would crash the
            # caller's 5-field access. Log and move on to the next page.
            logging.error(error)
    return data_items

Updating the main HttpTrigger as below


@app.route(route="SearchAmazonProducts", auth_level=func.AuthLevel.ANONYMOUS)
def SearchAmazonProducts(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP trigger: scrape Amazon search results and return them as JSON.

    Query parameters:
        search: Search term (required; missing/empty yields ``data: null``).
        pages: Number of result pages to parse (optional, defaults to 1).
    """
    query = req.params.get("search")
    pages_param = req.params.get("pages")
    # The original condition was inverted — it called int(None)/int("") when
    # the parameter was ABSENT (crashing) and ignored it when present. It also
    # used ``is ""``, an identity check against a literal. Parse when supplied,
    # otherwise default to a single page.
    pages_to_parse = int(pages_param) if pages_param else 1

    if not query:
        # No search term: respond with an empty payload instead of scraping.
        response = {"success": True, "data": None}
        return func.HttpResponse(
            json.dumps(response),
            status_code=200,
        )

    driver = get_chrome_driver()
    try:
        asins_data = parse_data_for_term(driver, query, pages_to_parse)
    finally:
        # The original leaked a Chrome process per request; always shut the
        # driver down (it may be None if initialization failed).
        if driver is not None:
            driver.quit()

    data_array = [
        {
            "asin": asin,
            "title": title,
            "rating": rating,
            "reviewCount": review_count,
            "price": price,
        }
        for asin, title, rating, review_count, price in asins_data
    ]
    response = {"success": True, "data": data_array}

    return func.HttpResponse(
        json.dumps(response),
        status_code=200,
        headers={"Content-Type": "application/json"},
    )

Additional functions

def clean_text(text):
    """Strip every non-alphanumeric character from *text*.

    Used to normalize scraped fragments such as review counts
    ("1,234" -> "1234").
    """
    # Raw string for the regex; the parameter is renamed so it no longer
    # shadows the builtin ``str``.
    return re.sub(r"[^A-Za-z0-9]+", "", text)

5. Save the code and run it using

func start

6. Open the URL

http://localhost:7071/api/SearchAmazonProducts?search=toys&pages=1

You’ll see the following response:

You can change the search term and the pages you’d like to parse on Amazon for your purposes.

Selenium is using headless Chrome to automate this scraping in the background. So you won’t see the Chrome browser launch.

The script can be adjusted as per needs.

Complete function_app.py would look something like this

import azure.functions as func
import logging
import re
import json

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

# Module-level Functions app instance; the @app.route decorators below
# register the HTTP triggers on it.
app = func.FunctionApp()


def get_chrome_driver():
    """Start a headless Chrome WebDriver configured for server-side scraping.

    Returns:
        A ready ``webdriver.Chrome`` instance, or ``None`` when the driver
        fails to start (the failure is logged with its traceback).
    """
    logging.info("Initializing chrome driver")

    try:
        # Desktop-browser UA string so Amazon serves the regular desktop markup.
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.198 Safari/537.36 Edg/95.0.1020.30"

        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")  # commonly required when running in containers
        chrome_options.add_argument("--disable-dev-shm-usage")  # avoid small /dev/shm inside Docker
        chrome_options.add_argument("--window-size=1920x1080")
        chrome_options.add_argument(f"user-agent={user_agent}")

        driver = webdriver.Chrome(options=chrome_options)
        logging.info("Chrome driver initialized successfully")
        return driver
    except Exception:
        # The original bare ``except`` also swallowed SystemExit/KeyboardInterrupt
        # and fell off the end, returning None implicitly. Log the traceback and
        # make the None return explicit so callers can check for it.
        logging.exception("Error initializing Chrome driver")
        return None


def get_page_source(driver, url):
    """Navigate *driver* to *url* and return the rendered page source.

    Args:
        driver: An initialized Selenium WebDriver.
        url: Absolute URL of the page to load.

    Returns:
        The page's HTML source as a string.
    """
    driver.get(url)

    # Read ``page_source`` once: the original accessed the property a second
    # time inside the log call, which issues another round-trip to the browser.
    # Lazy %-formatting avoids building the (very large) log string eagerly.
    source = driver.page_source
    logging.info("Amazon page source code: %s", source)

    return source


def parse_data_for_term(driver, query, pages_to_parse=1):
    """Scrape Amazon search results for *query* across result pages.

    Args:
        driver: Initialized Selenium WebDriver.
        query: Search term to query on amazon.com.
        pages_to_parse: Number of result pages to walk (default 1).

    Returns:
        A list of 5-tuples ``(asin, title, star_rating, review_count, price)``.
        Fields that could not be extracted remain empty strings.
    """
    data_items = []

    for page_num in range(pages_to_parse):
        url = "https://www.amazon.com/s?k=" + query + "&page=" + str(page_num + 1)
        try:
            content = get_page_source(driver, url)
            soup = BeautifulSoup(content, "html.parser")
            # find_all is the current name; findAll is a deprecated alias.
            all_products = soup.find_all("div", attrs={"class": "s-result-item"})

            for product in all_products:
                asin = product["data-asin"]
                title = ""
                star_rating = ""
                review_count = ""
                price = ""

                # Result divs without a data-asin are ads/spacers — skip them.
                if asin != "":
                    try:
                        item = product.find(
                            "a", {"href": re.compile(asin), "class": "a-text-normal"}
                        )
                        title = item.find("span").text.strip()
                        star_rating = (
                            product.find("i", {"class": "a-icon-star-small"})
                            .find("span")
                            .text.strip()
                            .split(" ")[0]
                        )
                        # The original passed a dict with a duplicate "href" key;
                        # Python keeps only the last one, so the ASIN pattern was
                        # silently discarded. Match the review link alone.
                        review_count = clean_text(
                            product.find("a", {"href": re.compile("Reviews")})
                            .find("span")
                            .text.strip()
                            .split(" ")[0]
                        )
                        price = (
                            product.find("span", {"class": "a-price"})
                            .find("span")
                            .text.strip()
                        )
                    except Exception as exc:
                        # A missing sub-element (find() -> None) leaves the
                        # remaining fields as "" for this product.
                        logging.error("Failed to extract product fields: %s", exc)

                    data_items.append((asin, title, star_rating, review_count, price))
        except Exception as error:
            # A failed page must not discard results already collected: the
            # original returned a malformed 4-tuple here, which would crash the
            # caller's 5-field access. Log and move on to the next page.
            logging.error(error)
    return data_items


@app.route(route="SearchAmazonProducts", auth_level=func.AuthLevel.ANONYMOUS)
def SearchAmazonProducts(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP trigger: scrape Amazon search results and return them as JSON.

    Query parameters:
        search: Search term (required; missing/empty yields ``data: null``).
        pages: Number of result pages to parse (optional, defaults to 1).
    """
    query = req.params.get("search")
    pages_param = req.params.get("pages")
    # The original condition was inverted — it called int(None)/int("") when
    # the parameter was ABSENT (crashing) and ignored it when present. It also
    # used ``is ""``, an identity check against a literal. Parse when supplied,
    # otherwise default to a single page.
    pages_to_parse = int(pages_param) if pages_param else 1

    if not query:
        # No search term: respond with an empty payload instead of scraping.
        response = {"success": True, "data": None}
        return func.HttpResponse(
            json.dumps(response),
            status_code=200,
        )

    driver = get_chrome_driver()
    try:
        asins_data = parse_data_for_term(driver, query, pages_to_parse)
    finally:
        # The original leaked a Chrome process per request; always shut the
        # driver down (it may be None if initialization failed).
        if driver is not None:
            driver.quit()

    data_array = [
        {
            "asin": asin,
            "title": title,
            "rating": rating,
            "reviewCount": review_count,
            "price": price,
        }
        for asin, title, rating, review_count, price in asins_data
    ]
    response = {"success": True, "data": data_array}

    return func.HttpResponse(
        json.dumps(response),
        status_code=200,
        headers={"Content-Type": "application/json"},
    )


def clean_text(text):
    """Strip every non-alphanumeric character from *text*.

    Used to normalize scraped fragments such as review counts
    ("1,234" -> "1234").
    """
    # Raw string for the regex; the parameter is renamed so it no longer
    # shadows the builtin ``str``.
    return re.sub(r"[^A-Za-z0-9]+", "", text)

In the next chapter, we will be looking at deploying this on Azure using Docker, Azure Containers, and Azure Pipelines.

--

--