Build SEO site Analyzer using Python

Arif Rahman
Zetta Tech
Published in
8 min readJan 29, 2023
Photo by Luke Chesser on Unsplash

Background

When I analyze a website to produce data reports, I need a real-time scraper that analyzes the target site, predicts its keywords, and counts its words. So I built some real-time analysis tools using Python and the Flask micro-framework.

Project Setup

First, install the Poetry package manager to manage the project's Python packages. Let's start setting up the project.

Create Virtual Environment

create virtual environment

python -m venv venv
venv\Scripts\activate # on windows
source venv/bin/activate # on mac or linux

Install Required Library

pip install poetry

Required Library

Install the required libraries below:

  • requests
  • pyseoanalyzer
  • Beautifulsoup4
  • flask
  • celery
  • backoff

Create the core

First, create the core of the project foundation like this

# src/core/factory.py
import os
from flask import Flask

from core import init_celery
from apps.home.views import home_bp

def create_app(**kwargs) -> Flask:
    """Application factory: build and configure the Flask app.

    The configuration class is chosen from the ``MODE`` environment
    variable: ``"development"`` loads ``core.config.DevConfig``; any other
    value (or no value) loads ``core.config.ProdConfig``.

    Args:
        **kwargs: Optional settings. A truthy ``celery`` entry is handed to
            ``init_celery`` together with the new app.

    Returns:
        Flask: The app with ``home_bp`` registered.
    """
    flask_app: Flask = Flask(__name__)

    # Pick the configuration object for the current environment.
    development = os.environ.get("MODE") == "development"
    if development:
        print("Run In Development Mode")
        config_path = "core.config.DevConfig"
    else:
        print("Run In Production Mode")
        config_path = "core.config.ProdConfig"
    flask_app.config.from_object(config_path)

    # Celery integration is opt-in: only wire it up when one was passed.
    celery = kwargs.get("celery")
    if celery:
        init_celery(app=flask_app, celery=celery)

    flask_app.register_blueprint(home_bp)

    return flask_app

Setup celery config

The next step is to set up the Celery config.

# src/core/__init__.py
import os

from celery import Celery
from flask import Flask
from typing import Any

def celery_app(app_name=__name__) -> Celery:
    """Create the Celery application for the current environment.

    The broker and result-backend URLs are read from the environment. When
    ``MODE`` is ``"development"`` the ``CELERY_BROKER_URL_DEV`` and
    ``CELERY_RESULT_BACKEND_DEV`` variables are used; otherwise
    ``CELERY_BROKER_URL`` and ``CELERY_RESULT_BACKEND``.

    Args:
        app_name: Name given to the Celery application. Defaults to this
            module's ``__name__``.

    Returns:
        Celery: The configured Celery application.
    """
    if os.environ.get("MODE") == "development":
        print("Broker development")
        suffix = "_DEV"
    else:
        print("Broker Production")
        suffix = ""

    redis_broker = os.environ.get(f"CELERY_BROKER_URL{suffix}")
    redis_backend = os.environ.get(f"CELERY_RESULT_BACKEND{suffix}")

    # The constructor keyword for the broker URL is ``broker``. The previous
    # ``redis_broker=`` keyword is not a Celery constructor parameter, so the
    # URL was never used as the broker.
    return Celery(app_name, backend=redis_backend, broker=redis_broker)


def init_celery(celery: Celery, app: Flask):
    """Bind a Celery application to a Flask application.

    The broker and result-backend URLs are copied into ``app.config``, the
    Flask config is pushed into ``celery.conf``, and ``celery.Task`` is
    replaced by a subclass that runs every task inside
    ``app.app_context()``.

    Args:
        celery (Celery): Celery application to configure.
        app (Flask): Flask application whose config and context are used.

    Returns:
        Celery: The same ``celery`` object, now configured.
    """
    broker_url = os.environ.get("CELERY_BROKER_URL")
    result_backend = os.environ.get("CELERY_RESULT_BACKEND")

    # Agree with ``celery_app``: in development the ``*_DEV`` variables take
    # precedence. Previously the production URLs were always applied here.
    # The production variables stay as a fallback when a ``*_DEV`` one is
    # unset.
    if os.environ.get("MODE") == "development":
        broker_url = os.environ.get("CELERY_BROKER_URL_DEV") or broker_url
        result_backend = (
            os.environ.get("CELERY_RESULT_BACKEND_DEV") or result_backend
        )

    # Skip unset variables so a missing one cannot overwrite a URL that was
    # already configured with ``None``.
    settings = {"broker_url": broker_url, "result_backend": result_backend}
    app.config.update(
        {key: value for key, value in settings.items() if value is not None}
    )

    celery.conf.update(app.config)

    task_base = celery.Task

    class ContextTask(task_base):
        """Task base class that executes inside the Flask app context."""

        abstract = True

        def __call__(self, *args: Any, **kwargs: Any) -> Any:
            with app.app_context():
                return task_base.__call__(self, *args, **kwargs)

    celery.Task = ContextTask
    return celery


# Module-level Celery application, created when this module is imported,
# using the default application name of ``celery_app``.
celery_ext = celery_app()

After creating the Celery extension, let's build the configuration for the project.

# src/core/config.py 

import os
from typing import Literal
from pathlib import Path

from dotenv import dotenv_values

# Location of the ``.env`` file: three parent levels up from this file.
dotenv_path = os.path.join(Path(__file__).resolve().parent.parent.parent, '.env')

# Parsed key/value pairs from the ``.env`` file.
tes = dotenv_values(dotenv_path)

# ``dotenv_values`` only returns the parsed mapping; it does not modify the
# process environment. The settings in this module are read with
# ``os.environ.get``, so the values were previously never picked up.
# ``setdefault`` exports them while letting variables that are already set
# in the real environment win.
for _name, _value in tes.items():
    if _value is not None:
        os.environ.setdefault(_name, _value)

class BaseConfig:
    """Settings shared by every environment."""

    # Read from the ``SECRET_KEY`` environment variable; ``None`` when the
    # variable is unset. (A bare ``Literal`` is not a valid annotation.)
    SECRET_KEY: "str | None" = os.environ.get("SECRET_KEY")


class DevConfig(BaseConfig):
    """Development settings: debugging switched on."""

    DEBUG: bool = True
    FLASK_DEBUG: bool = True


class ProdConfig(BaseConfig):
    """Production settings: debugging off, Celery URLs from the environment."""

    DEBUG: bool = False
    # NOTE(review): Flask's ``config.from_object`` is documented to copy only
    # UPPERCASE attributes, so this lowercase name is probably not loaded
    # into ``app.config`` -- confirm before relying on it.
    broker_url: "str | None" = os.environ.get("CELERY_BROKER_URL")
    RESULT_BACKEND: "str | None" = os.environ.get("CELERY_RESULT_BACKEND")

Build Scraper Module

In this case, we need to rebuild the scraper module so that it produces custom results for our API.

Building Crawler Instance

# src/modules/crawler.py

from seoanalyzer.website import Website
from xml.dom import minidom

from modules.scrape import PageCrawler
from modules.helper import http

class WebCrawler(Website):
    """Site crawler that analyses every page with ``PageCrawler``.

    Extends ``seoanalyzer``'s ``Website`` with a ``scrape_img`` flag that is
    forwarded to each ``PageCrawler`` it creates.
    """

    def __init__(self, base_url, sitemap, analyze_headings, analyze_extra_tags, follow_links, scrape_img):
        super().__init__(base_url, sitemap, analyze_headings, analyze_extra_tags, follow_links)
        self.scrape_img = scrape_img

    def _queue_sitemap_urls(self):
        """Append every URL listed in ``self.sitemap`` to ``self.page_queue``.

        ``.xml`` sitemaps contribute the text of each ``<loc>`` element and
        ``.txt`` sitemaps one URL per line. Any other extension is fetched
        but contributes nothing.
        """
        page = http.get(self.sitemap)
        if self.sitemap.endswith('xml'):
            xmldoc = minidom.parseString(page.data.decode('utf-8'))
            for node in xmldoc.getElementsByTagName('loc'):
                self.page_queue.append(self.get_text_from_xml(node.childNodes))
        elif self.sitemap.endswith('txt'):
            # ``splitlines`` copes with ``\r\n`` endings; blank lines are
            # dropped so empty strings are never queued as URLs.
            for line in page.data.decode('utf-8').splitlines():
                url = line.strip()
                if url:
                    self.page_queue.append(url)

    def crawl(self):
        """Crawl the site and accumulate per-page results on this object.

        URLs from the sitemap (if any) are queued first, followed by
        ``base_url``. Each URL not yet crawled is analysed with
        ``PageCrawler``. Its word, bigram and trigram counts are added to
        the site totals and its links are appended to the queue. When
        ``follow_links`` is false the crawl stops after the first analysed
        page.
        """
        if self.sitemap:
            self._queue_sitemap_urls()

        self.page_queue.append(self.base_url)

        # ``page_queue`` grows while it is being iterated: links discovered
        # on a page are appended below and visited by this same loop.
        for url in self.page_queue:
            if url in self.crawled_urls:
                print("Crawled URL: {}".format(url))
                continue
            print("process URL to crawl: {}".format(url))

            page = PageCrawler(
                url=url,
                base_url=self.base_url,
                analyze_headings=self.analyze_headings,
                analyze_extra_tags=self.analyze_extra_tags,
                scrape_img=self.scrape_img,
            )

            # Never leave the site that is being audited.
            if page.parsed_url.netloc != page.base_domain.netloc:
                continue

            page.analyze()

            self.content_hashes[page.content_hash].add(page.url)

            for w in page.wordcount:
                self.wordcount[w] += page.wordcount[w]

            for b in page.bigrams:
                self.bigrams[b] += page.bigrams[b]

            for t in page.trigrams:
                self.trigrams[t] += page.trigrams[t]

            self.page_queue.extend(page.links)

            self.crawled_pages.append(page)
            self.crawled_urls.add(page.url)

            # Only ``follow_links`` decides whether the crawl continues. The
            # former ``if not self.scrape_img: break`` ended the crawl after
            # one page whenever image scraping was off, although that flag
            # is unrelated to link following.
            if not self.follow_links:
                break

Next, build a page crawler to analyze the site page by page.

# src/modules/scrape.py

# file to process page
import re
import hashlib
import requests

from seoanalyzer.page import Page
from bs4 import BeautifulSoup
from urllib3.exceptions import HTTPError, MaxRetryError
from urllib.parse import urlsplit
from typing import Any

from modules.helper import http


class PageCrawler(Page):
    """Single-page analyser built on ``seoanalyzer``'s ``Page``.

    In addition to the base analysis it collects the page's internal,
    external and broken links and, when ``scrape_img`` is set, the ``alt``
    text of every image.
    """

    def __init__(
        self,
        url: str,
        base_url: str,
        analyze_headings: bool = False,
        analyze_extra_tags: bool = False,
        scrape_img: bool = False,
    ) -> None:
        super().__init__(url, base_url, analyze_headings, analyze_extra_tags)
        self.scrape_img: bool = scrape_img
        self.analyze_headings: bool = analyze_headings
        self.analyze_extra_tags: bool = analyze_extra_tags

        # Link buckets filled by the ``scrape_*_link`` methods.
        self.internal_links: list = []
        self.broken_link: list = []
        self.external_link: list = []

        # Always created so ``scrape_image`` works even when it is called
        # directly on an instance built with ``scrape_img=False``.
        self.images: list = []

        if self.analyze_headings:
            self.headings = {}
        if analyze_extra_tags:
            self.additional_info = {}

    @staticmethod
    def _absolute_hrefs(bs: BeautifulSoup) -> list:
        """Return the ``href`` of every anchor pointing at an http(s) URL.

        Anchors without an ``href`` attribute are skipped. Previously their
        ``None`` value reached ``startswith`` and raised ``AttributeError``.
        """
        hrefs = []
        for anchor in bs.find_all("a", href=True):
            href = anchor["href"]
            if href.startswith(("http://", "https://")):
                hrefs.append(href)
        return hrefs

    @staticmethod
    def _fetch_status(url: str):
        """Return the HTTP status code for ``url``, or ``None`` if unreachable.

        The pooled client is tried first. If it fails, the ``requests``
        based fallback is used. The two clients expose the status code under
        different attribute names (``status`` and ``status_code``), so both
        are checked.
        """
        try:
            response = http.get(url)
        except HTTPError:
            try:
                response = http.get_without_redirect(url)
            except requests.RequestException:
                return None

        status = getattr(response, "status", None)
        if status is None:
            status = getattr(response, "status_code", None)
        return status

    def scrape_external_link(self, bs: BeautifulSoup):
        """Collect absolute links whose host differs from this page's host.

        Args:
            bs (BeautifulSoup): parsed page

        Returns:
            list: ``self.external_link`` after the new links were appended
        """
        hostname = urlsplit(self.url).hostname
        for href in self._absolute_hrefs(bs):
            if urlsplit(href).hostname != hostname:
                self.external_link.append(href)

        return self.external_link

    def scrape_broken_link(self, bs: BeautifulSoup) -> list:
        """Collect absolute links that answer 404 or cannot be fetched.

        Every absolute link on the page is requested, so this is the slow
        part of the analysis.

        Args:
            bs (BeautifulSoup): parsed page

        Returns:
            list: ``self.broken_link`` after the new links were appended
        """
        for href in self._absolute_hrefs(bs):
            status = self._fetch_status(href)
            # ``None`` means neither client could reach the URL at all; such
            # a link is reported as broken instead of aborting the analysis.
            if status is None or status == 404:
                self.broken_link.append(href)

        return self.broken_link

    def scrape_internal_link(self, bs: BeautifulSoup) -> list:
        """Collect absolute links that point at this page's own host.

        Args:
            bs (BeautifulSoup): parsed page

        Returns:
            list: ``self.internal_links`` after the new links were appended
        """
        hostname = urlsplit(self.url).hostname
        for href in self._absolute_hrefs(bs):
            if urlsplit(href).hostname == hostname:
                self.internal_links.append(href)

        return self.internal_links

    def scrape_image(self, bs: BeautifulSoup) -> list:
        """Collect the ``alt`` text of every ``<img>`` on the page.

        Images without an ``alt`` attribute contribute ``None``.

        Args:
            bs (BeautifulSoup): parsed page

        Returns:
            list: ``self.images`` after the new entries were appended
        """
        for img in bs.find_all("img"):
            self.images.append(img.get("alt"))

        return self.images

    def talk(self) -> dict[str, Any]:
        """Return the analysis results as a printable dictionary.

        ``headings``, ``additional_info`` and ``images`` are only included
        when the matching option was enabled on this instance.

        Returns:
            dict: report for this page
        """
        context: dict = {
            "url": self.url,
            "title": self.title,
            "description": self.description,
            "word_count": self.total_word_count,
            "keywords": self.sort_freq_dist(self.keywords, limit=5),
            "bigrams": self.bigrams,
            "trigrams": self.trigrams,
            "broken links": self.broken_link,
            "internal links": self.internal_links,
            "external links": self.external_link,
            "warnings": self.warnings,
            "content_hash": self.content_hash,
        }

        if self.analyze_headings:
            context["headings"] = self.headings
        if self.analyze_extra_tags:
            context["additional_info"] = self.additional_info

        if self.scrape_img:
            context["images"] = self.images

        return context

    def analyze(self, raw_html=None):
        """Analyze the page and populate the warnings list.

        When ``raw_html`` is not supplied the page is downloaded from
        ``self.url`` first. The download is skipped, with a warning, when
        the URL has no http(s) or ``//`` prefix, is outside the base domain,
        cannot be fetched, or has an unsupported content type.

        Args:
            raw_html: HTML to analyse instead of downloading ``self.url``.

        Returns:
            ``True`` when the page was analysed, ``None`` when it was
            skipped.
        """
        if not raw_html:
            # only allow http:// https:// and //
            if not self.url.startswith(("http://", "https://", "//")):
                self.warn(f"{self.url} does not appear to have a valid protocol.")
                return

            if self.url.startswith("//"):
                self.url = f"{self.base_domain.scheme}:{self.url}"

            if self.parsed_url.netloc != self.base_domain.netloc:
                self.warn(f"{self.url} is not part of {self.base_domain.netloc}.")
                return

            try:
                page = http.get(self.url)
            except HTTPError as e:
                self.warn(f"Returned {e}")
                return

            encoding = "ascii"

            if "content-type" in page.headers:
                encoding = page.headers["content-type"].split("charset=")[-1]

            if encoding.lower() not in ("text/html", "text/plain", "utf-8"):
                self.warn(f"Can not read {encoding}")
                return

            try:
                raw_html = page.data.decode("utf-8")
            except UnicodeDecodeError:
                # A ``text/html`` response without a charset may not be
                # UTF-8; warn instead of aborting the whole crawl.
                self.warn(f"Can not decode {self.url} as utf-8")
                return

        self.content_hash = hashlib.sha1(raw_html.encode("utf-8")).hexdigest()

        # remove comments, they screw with BeautifulSoup
        clean_html = re.sub(r"<!--.*?-->", r"", raw_html, flags=re.DOTALL)

        # Text statistics use the lower-cased document; link and image
        # scraping use the unmodified one so URLs keep their case.
        soup_lower = BeautifulSoup(clean_html.lower(), "html.parser")
        soup_unmodified = BeautifulSoup(clean_html, "html.parser")

        texts = soup_lower.findAll(text=True)
        visible_text = list(filter(self.visible_tags, texts))

        self.process_text(visible_text)

        self.populate(soup_lower)

        self.analyze_title()
        self.analyze_description()
        self.analyze_og(soup_lower)
        self.analyze_a_tags(soup_unmodified)
        self.analyze_img_tags(soup_lower)
        self.analyze_h1_tags(soup_lower)

        # link classification added on top of the base analysis
        self.scrape_broken_link(soup_unmodified)
        self.scrape_internal_link(soup_unmodified)
        self.scrape_external_link(soup_unmodified)

        if self.analyze_headings:
            self.analyze_heading_tags(soup_unmodified)
        if self.analyze_extra_tags:
            self.analyze_additional_tags(soup_unmodified)

        if self.scrape_img:
            self.scrape_image(soup_unmodified)

        return True

### Build Analyzer Instance

To run the scraper module, let's build the analyzer instance that combines the custom modules.

# src/modules/analyzer.py
import time

from typing import Any

from operator import itemgetter
from modules.crawler import WebCrawler


def analyze(url, sitemap_url=None, analyze_headings=False, analyze_extra_tags=False, follow_links=True, scrape_img=True) -> dict[str, Any]:
    """Crawl a site with ``WebCrawler`` and summarise the results.

    Args:
        url: Base URL of the site to crawl.
        sitemap_url: Optional sitemap URL passed to the crawler.
        analyze_headings: Passed through to the crawler.
        analyze_extra_tags: Passed through to the crawler.
        follow_links: Passed through to the crawler.
        scrape_img: Passed through to the crawler.

    Returns:
        dict: ``pages`` (one ``talk()`` report per crawled page),
        ``keywords`` (words, bigrams and trigrams counted more than four
        times, highest count first), ``errors`` (always an empty list
        here), ``total_time`` (seconds elapsed) and ``duplicate_pages``
        (groups of URLs sharing a content hash).
    """
    started = time.time()

    def elapsed():
        return time.time() - started

    report = {'pages': [], 'keywords': [], 'errors': [], 'total_time': elapsed()}

    crawler = WebCrawler(
        base_url=url,
        sitemap=sitemap_url,
        analyze_headings=analyze_headings,
        analyze_extra_tags=analyze_extra_tags,
        follow_links=follow_links,
        scrape_img=scrape_img,
    )
    crawler.crawl()

    report['pages'].extend(page.talk() for page in crawler.crawled_pages)

    # A content hash shared by more than one URL marks duplicated pages.
    report['duplicate_pages'] = [
        list(urls) for urls in crawler.content_hashes.values() if len(urls) > 1
    ]

    # Words, bigrams and trigrams are ranked separately, filtered by the
    # same threshold, then merged and ranked once more by count.
    candidates = []
    for counts in (crawler.wordcount, crawler.bigrams, crawler.trigrams):
        ranked = sorted(counts.items(), key=itemgetter(1), reverse=True)
        candidates.extend(
            {'word': term, 'count': count} for term, count in ranked if count > 4
        )

    report['keywords'] = sorted(candidates, key=itemgetter('count'), reverse=True)

    report['total_time'] = elapsed()

    return report

### Adding Helper Modules

Add this `helper.py` to provide HTTP utilities for the scraper module.

# src/modules/helper.py

import certifi
import requests

from urllib3 import PoolManager
from urllib3 import Timeout
from urllib3.response import HTTPResponse
from urllib3.util import Retry
from typing import Any



class Http:
    """Small HTTP client around a urllib3 ``PoolManager`` with retries.

    Requests go through a shared connection pool that sends a browser-like
    ``User-Agent``, verifies certificates against the ``certifi`` bundle and
    retries on the configured status codes.
    """

    def __init__(self):
        user_agent: dict[str, str] = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
        }
        retry_options: dict[str, Any] = {
            "total": 10,  # maximum number of retries
            "redirect": 100,  # maximum number of redirects to follow
            # list of HTTP status codes to retry on
            "status_forcelist": [429, 500, 502, 503, 504],
        }
        # list of HTTP methods to retry on
        retry_methods = ["HEAD", "TRACE", "GET", "PUT", "OPTIONS", "DELETE"]
        try:
            # ``method_whitelist`` was renamed to ``allowed_methods`` in
            # urllib3 1.26 and the old name was removed in 2.0.
            self.retry_strategy: Retry = Retry(
                allowed_methods=retry_methods, **retry_options
            )
        except TypeError:
            # urllib3 older than 1.26 only knows the old keyword.
            self.retry_strategy = Retry(
                method_whitelist=retry_methods, **retry_options
            )
        self.http: PoolManager = PoolManager(
            timeout=Timeout(connect=1.0, read=2.0),
            cert_reqs="CERT_REQUIRED",
            ca_certs=certifi.where(),
            headers=user_agent,
            retries=self.retry_strategy,
        )

    def get(self, url) -> Any:
        """Send a GET request through the pooled client."""
        return self.http.request("GET", url)

    def get_without_redirect(self, url: str) -> Any:
        """Fetch ``url`` with ``requests`` instead of the pooled client.

        Returns a ``requests`` response, whose status code is exposed as
        ``status_code`` (not ``status`` as on urllib3 responses).

        NOTE(review): despite the name, ``requests`` follows redirects by
        default and that behaviour is kept here -- confirm what callers
        expect.
        NOTE(review): ``verify=False`` disables TLS certificate
        verification. It is kept because this is the existing behaviour,
        but it should be reviewed.
        """
        # Same (connect, read) limits as the pooled client: ``requests``
        # waits indefinitely when no timeout is given.
        res = requests.get(url, verify=False, timeout=(1.0, 2.0))
        return res

    def post(self, url: str, data: dict[str, Any]) -> Any:
        """Send a POST request with ``data`` passed as ``fields``."""
        return self.http.request("POST", url, fields=data)

    def put(self, url: str, data: dict[str, Any]) -> HTTPResponse:
        """Send a PUT request with ``data`` passed as ``fields``."""
        return self.http.request("PUT", url, fields=data)

    def delete(self, url: str) -> HTTPResponse:
        """Send a DELETE request."""
        return self.http.request("DELETE", url)


# Shared module-level client instance, created when this module is imported.
http = Http()

Well done — the scraper module is ready to use, with reusable custom functions. In the next step, let's build the web app using Flask.

Build Web Apps Using Flask

In a previous step we already set up the Flask application, so in this step we build the routes and views of the web app.

# src/apps/home/views.py

import json

# from seoanalyzer import analyze
from flask import Blueprint, render_template, request, jsonify
from typing import Any


from modules.analyzer import analyze

# Blueprint serving the audit page, with its own template and static folders.
home_bp: Blueprint = Blueprint(
    "home", __name__, template_folder="templates", static_folder="static"
)


@home_bp.route("/", methods=["GET", "POST"])
def index():
    """Render the audit form and, on POST, the audit results.

    A POST reads the ``website`` form field, analyses that single URL
    (links are not followed) and passes one row per analysed page to the
    template as ``datas``. A GET, or a POST with an empty field, renders
    the bare form.
    """
    if request.method != "POST":
        return render_template("index.html")

    query = (request.form.get("website") or "").strip()
    if not query:
        # Nothing to audit: do not hand an empty URL to the crawler.
        return render_template("index.html")

    analyzer = analyze(
        url=query,
        follow_links=False,
    )

    # Start from an empty list. The previous ``[dict[str, Any]]`` literal
    # put the type object itself into the list, which became a bogus first
    # row in the table.
    results: list[dict[str, Any]] = []

    for page in analyzer["pages"]:
        results.append(
            {
                "word count": page["word_count"],
                "page title": page["title"],
                # The key is spelled exactly as index.html looks it up, so
                # the "Meta Description" column is filled.
                "meta discription": page["description"],
            }
        )
        for keywords in page["keywords"]:
            print(keywords)

    return render_template("index.html", datas=results)

### Design The Templates

After writing the views, let's create a template using Bootstrap in index.html, located in the src/apps/home/templates directory.

<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>SEO Audit Tools</title>
  <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.3/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-rbsA2VBKQhggwzxH7pPCaAqO46MgnOM80zW1RWuH61DGLwZJEdK2Kadq2F9CUG65" crossorigin="anonymous">
</head>
<body>
  {# Audit form: posts the target site back to this same URL. #}
  <div class="container justify-content-center mx-auto">
    <h1 class="text-center">Seo Audit Tools</h1>
    <form action="" method="post">
      <div class="input-group">
        <input name="website" type="search" class="form-control rounded" placeholder="example.com" aria-label="Search" aria-describedby="search-addon" />
        <button type="submit" class="btn btn-warning text-white">Audit</button>
      </div>
    </form>
  </div>
  {# Results table: one row per entry of `datas`. #}
  <div class="container">
    <table class="table">
      <thead>
        <tr>
          <th scope="col">No.</th>
          <th scope="col">Word Count</th>
          <th scope="col">Page Title</th>
          <th scope="col">Meta Description</th>
          <th scope="col">Details Page</th>
        </tr>
      </thead>
      <tbody>
        {# The loop wraps only the rows; </tbody> is closed once, after it. #}
        {% for data in datas %}
        <tr>
          <th scope="row">{{ loop.index }}</th>
          <td>{{ data['word count'] }}</td>
          <td>{{ data['page title'] }}</td>
          {# NOTE(review): the view must supply this exact key (note the spelling) for the column to be filled. #}
          <td>{{ data['meta discription'] }}</td>
          <td><a class="btn btn-outline-primary btn-sm" href="">See Details</a></td>
        </tr>
        {% endfor %}
      </tbody>
    </table>
  </div>
  <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.2.3/dist/js/bootstrap.bundle.min.js" integrity="sha384-kenU1KFdBIe4zVF0s0G1M5b4hcpxyD9F7jL+jjXkk+Q2h455rYXK/7HAuoJl+0I4" crossorigin="anonymous"></script>
</body>
</html>

Conclusion

This post discussed how to create custom modules based on third-party Python modules. The project also supports Docker; please check it out on GitHub for more amazing projects. Good luck!

--

--