Web Crawler in Python

#! /usr/bin/python
class AppCrawler:
    """Skeleton of the app crawler; the crawl logic is not implemented yet.

    Attributes:
        starting_url: URL the crawl is meant to start from.
        depth: Depth value supplied by the caller (unused by this skeleton).
        apps: List intended to hold the apps found; starts out empty.
    """

    def __init__(self, starting_url, depth):
        """Remember the crawl parameters and start with no collected apps.

        Args:
            starting_url: URL of the first page to visit.
            depth: Requested crawl depth.
        """
        self.starting_url = starting_url
        self.depth = depth
        self.apps = []

    def crawl(self):
        """Placeholder for the crawl logic; currently does nothing.

        Returns:
            None.
        """
        # All the crawl logic goes here.
        return

    def get_app_from_link(self, link):
        """Placeholder for extracting app data from ``link``; does nothing.

        Args:
            link: URL of the page to read.

        Returns:
            None.
        """
        # Get information from link
        return
# Do something with the data. We'll just be printing it.
# We can do some analytics or whatever.
class App:
    """Plain record holding the fields scraped for one app.

    Values are stored exactly as given; nothing is validated or converted.
    """

    def __init__(self, name, developer, price, links):
        """Store the scraped fields.

        Args:
            name: App name text.
            developer: Developer text.
            price: Price text.
            links: Links associated with the app, stored as given.
        """
        self.name = name
        self.developer = developer
        self.price = price
        self.links = links

    def __str__(self):
        """Return a three-line, CRLF-terminated summary of the app.

        The format is ``Name: ...``, ``Developer: ...``, ``Price: ...``.
        ``links`` is not part of the summary.
        """
        text = u"Name: {0}\r\nDeveloper: {1}\r\nPrice: {2}\r\n".format(
            self.name, self.developer, self.price)
        # Python 2 requires __str__ to return a byte string, Python 3 a text
        # string; ``str is bytes`` is only true on Python 2.
        if str is bytes:
            return text.encode('UTF-8')
        return text

def main():
    """Build a crawler for the hard-coded start URL, run it, and print each app."""
    # Execution starts here
    crawler = AppCrawler('https://itunes.apple.com/us/app/candy-crush-saga/id553834731?mt=8', 0)
    crawler.crawl()
    for app in crawler.apps:
        # The parenthesised form prints the same text on Python 2 and is
        # also valid on Python 3, where the bare print statement is not.
        print(app)


if __name__ == "__main__":
    main()
from lxml import html
import requests
def crawl(self):
    """Process the starting URL by handing it to ``get_app_from_link``.

    This is a method of the crawler class shown on its own; ``self`` must
    provide ``starting_url`` and ``get_app_from_link``.

    Returns:
        None.
    """
    # All the crawl logic goes here.
    self.get_app_from_link(self.starting_url)

def get_app_from_link(self, link):
    """Fetch the page at ``link``, parse the app fields, and record an App.

    This is a method of the crawler class shown on its own. The related
    links found on the page are printed, then an ``App`` built from the
    parsed fields is appended to ``self.apps``.

    Args:
        link: URL of the app page to download.

    Returns:
        None.

    Raises:
        IndexError: If the name, developer, or price query matches nothing.
    """
    # Get information from link
    start_page = requests.get(link)
    tree = html.fromstring(start_page.text)
    name = tree.xpath('//h1[@itemprop="name"]/text()')[0]
    developer = tree.xpath('//div[@class="left"]/h2/text()')[0]
    price = tree.xpath('//div[@itemprop="price"]/text()')[0]
    links = tree.xpath('//div[@class="center-stack"]//*/a[@class="name"]/@href')
    # A separate loop name keeps the ``link`` parameter intact.
    for related_link in links:
        print(related_link)
    # App expects the full list of links, not a single URL.
    app = App(name, developer, price, links)
    self.apps.append(app)

#! /usr/bin/python
from lxml import html
import requests
class AppCrawler:
    """Crawler that downloads one app page and records the parsed App.

    Attributes:
        starting_url: URL of the page that ``crawl`` fetches.
        depth: Depth value supplied by the caller (not used by this version).
        apps: List of ``App`` objects collected so far.
    """

    def __init__(self, starting_url, depth):
        """Remember the crawl parameters and start with no collected apps.

        Args:
            starting_url: URL of the first page to visit.
            depth: Requested crawl depth.
        """
        self.starting_url = starting_url
        self.depth = depth
        self.apps = []

    def crawl(self):
        """Fetch and record the app at ``starting_url``.

        Returns:
            None.
        """
        # All the crawl logic goes here.
        self.get_app_from_link(self.starting_url)

    def get_app_from_link(self, link):
        """Download ``link``, parse the app fields, and append an App to ``apps``.

        Args:
            link: URL of the app page to download.

        Returns:
            None.

        Raises:
            IndexError: If the name, developer, or price query matches nothing.
        """
        # Get information from link
        # The imported module is ``requests``; ``request`` is undefined.
        start_page = requests.get(link)
        tree = html.fromstring(start_page.text)
        name = tree.xpath('//h1[@itemprop="name"]/text()')[0]
        developer = tree.xpath('//div[@class="left"]/h2/text()')[0]
        price = tree.xpath('//div[@itemprop="price"]/text()')[0]
        links = tree.xpath('//div[@class="center-stack"]//*/a[@class="name"]/@href')
        # App expects the list of related links, not the page's own URL.
        app = App(name, developer, price, links)
        self.apps.append(app)
# Do something with the data. We'll just be printing it.
# We can do some analytics or whatever.
class App:
    """Plain record holding the fields scraped for one app.

    Values are stored exactly as given; nothing is validated or converted.
    """

    def __init__(self, name, developer, price, links):
        """Store the scraped fields.

        Args:
            name: App name text.
            developer: Developer text.
            price: Price text.
            links: Links associated with the app, stored as given.
        """
        self.name = name
        self.developer = developer
        self.price = price
        self.links = links

    def __str__(self):
        """Return a three-line, CRLF-terminated summary of the app.

        The format is ``Name: ...``, ``Developer: ...``, ``Price: ...``.
        ``links`` is not part of the summary.
        """
        text = u"Name: {0}\r\nDeveloper: {1}\r\nPrice: {2}\r\n".format(
            self.name, self.developer, self.price)
        # Python 2 requires __str__ to return a byte string, Python 3 a text
        # string; ``str is bytes`` is only true on Python 2.
        if str is bytes:
            return text.encode('UTF-8')
        return text

def main():
    """Build a crawler for the hard-coded start URL, run it, and print each app."""
    # Execution starts here
    crawler = AppCrawler('https://itunes.apple.com/us/app/candy-crush-saga/id553834731?mt=8', 0)
    crawler.crawl()
    for app in crawler.apps:
        # The parenthesised form prints the same text on Python 2 and is
        # also valid on Python 3, where the bare print statement is not.
        print(app)


if __name__ == "__main__":
    main()
Name: Candy Crush Saga
Developer: By King.com Limited
Price: Free

# Fragment: attribute set-up for the depth-aware crawler. The same lines form
# the body of AppCrawler.__init__ in the full listing further down; the
# enclosing ``def`` line is not part of this excerpt.
self.starting_url = starting_url  # URL of the first page to fetch
self.depth = depth  # bound that current_depth is compared against while crawling
self.current_depth = 0  # depth level currently being processed
self.depth_links = []  # one list of links per depth level, indexed by current_depth
self.apps = []  # apps collected so far
# Fragment: the depth loop that belongs inside AppCrawler.crawl. For every
# level below ``self.depth`` it visits each link stored for that level,
# records the resulting app, and gathers the links for the next level.
while self.current_depth < self.depth:
    # Start each level with a fresh list; without this the name is undefined.
    current_links = []
    for link in self.depth_links[self.current_depth]:
        current_app = self.get_app_from_link(link)
        current_links.extend(current_app.links)
        # The collection attribute is ``apps``; ``self.app`` does not exist.
        self.apps.append(current_app)
    self.current_depth += 1
    self.depth_links.append(current_links)

#! /usr/bin/python
from lxml import html
import requests
import time
class AppCrawler:
    """Crawler that follows related-app links level by level.

    Attributes:
        starting_url: URL of the first page to fetch.
        depth: Number of link levels to follow beyond the starting page.
        current_depth: Level currently being processed; starts at 0.
        depth_links: One list of links per level, indexed by level.
        apps: ``App`` objects collected so far, in visit order.
    """

    def __init__(self, starting_url, depth):
        """Remember the crawl parameters and reset all crawl state.

        Args:
            starting_url: URL of the first page to visit.
            depth: How many levels of related links to follow.
        """
        self.starting_url = starting_url
        self.depth = depth
        self.current_depth = 0
        self.depth_links = []
        self.apps = []

    def crawl(self):
        """Fetch the starting app, then follow its links up to ``depth`` levels.

        Every fetched app is appended to ``apps``. The links found on the
        apps of one level become the pages visited on the next level. There
        is a one-second pause after each followed link.

        Returns:
            None.
        """
        # All the crawl logic goes here.
        app = self.get_app_from_link(self.starting_url)
        self.apps.append(app)
        self.depth_links.append(app.links)
        while self.current_depth < self.depth:
            # Start each level with a fresh list of next-level links.
            current_links = []
            for link in self.depth_links[self.current_depth]:
                current_app = self.get_app_from_link(link)
                current_links.extend(current_app.links)
                self.apps.append(current_app)
                # Pause between requests so the server is not hammered.
                time.sleep(1)
            self.current_depth += 1
            self.depth_links.append(current_links)

    def get_app_from_link(self, link):
        """Download ``link`` and return an App built from the parsed page.

        Args:
            link: URL of the app page to download.

        Returns:
            An ``App`` carrying the name, developer, price, and the list of
            related links found on the page.

        Raises:
            IndexError: If the name, developer, or price query matches nothing.
        """
        # Get information from link
        # The imported module is ``requests``; ``request`` is undefined.
        start_page = requests.get(link)
        tree = html.fromstring(start_page.text)
        name = tree.xpath('//h1[@itemprop="name"]/text()')[0]
        developer = tree.xpath('//div[@class="left"]/h2/text()')[0]
        price = tree.xpath('//div[@itemprop="price"]/text()')[0]
        links = tree.xpath('//div[@class="center-stack"]//*/a[@class="name"]/@href')
        # crawl() iterates over ``app.links``, so the list must be passed here.
        app = App(name, developer, price, links)
        return app
# Do something with the data. We'll just be printing it.
# We can do some analytics or whatever.
class App:
    """Plain record holding the fields scraped for one app.

    Values are stored exactly as given; nothing is validated or converted.
    """

    def __init__(self, name, developer, price, links):
        """Store the scraped fields.

        Args:
            name: App name text.
            developer: Developer text.
            price: Price text.
            links: Links associated with the app, stored as given.
        """
        self.name = name
        self.developer = developer
        self.price = price
        self.links = links

    def __str__(self):
        """Return a three-line, CRLF-terminated summary of the app.

        The format is ``Name: ...``, ``Developer: ...``, ``Price: ...``.
        ``links`` is not part of the summary.
        """
        text = u"Name: {0}\r\nDeveloper: {1}\r\nPrice: {2}\r\n".format(
            self.name, self.developer, self.price)
        # Python 2 requires __str__ to return a byte string, Python 3 a text
        # string; ``str is bytes`` is only true on Python 2.
        if str is bytes:
            return text.encode('UTF-8')
        return text

def main():
    """Build a crawler for the hard-coded start URL, run it, and print each app."""
    # Execution starts here
    crawler = AppCrawler('https://itunes.apple.com/us/app/candy-crush-saga/id553834731?mt=8', 0)
    crawler.crawl()
    for app in crawler.apps:
        # The parenthesised form prints the same text on Python 2 and is
        # also valid on Python 3, where the bare print statement is not.
        print(app)


if __name__ == "__main__":
    main()

Sricharan Chiruvolu

Written by

MSc CS student @ TU Munich

Welcome to a place where words matter. On Medium, smart voices and original ideas take center stage - with no ads in sight. Watch
Follow all the topics you care about, and we’ll deliver the best stories for you to your homepage and inbox. Explore
Get unlimited access to the best stories on Medium — and support writers while you’re at it. Just $5/month. Upgrade