Escaping IP Blocking with Rotating Proxies in Python — a Yell.com Case Study
A common problem in web scraping is IP blocking: the target site blocks our IP address after we make too many requests. There are several ways to bypass it:
- Using Proxy
- Using VPN
With these two methods, we can prevent our IP from being blocked by the destination website when we send many requests.
Using Virtual Private Network (VPN)
In this case we need a VPN. There are several ways to get one, such as buying a subscription or using a free service. We then activate the VPN on our host machine and use it while scraping.
import requests
from base64 import b64encode
# Proxy routing table passed to requests via the ``proxies=`` keyword:
# maps URL scheme -> proxy URL. NOTE(review): only plain-HTTP traffic is
# routed here — an 'https' entry would be needed for HTTPS requests to go
# through the proxy as well.
proxy = {
'http': 'http://173.208.208.74:60099'
}
class HTTPProxyAuth(requests.auth.HTTPBasicAuth):
    """Like requests.auth.HTTPBasicAuth, but adds a Proxy-Authorization header.

    Authenticating proxies expect the credentials in the Proxy-Authorization
    header rather than the regular Authorization header.
    """

    def __call__(self, r):
        # BUG FIX: b64encode() requires bytes on Python 3, so the credential
        # string must be encoded first; the result is then decoded back to
        # str so the header reads "Basic dXNlcjpwYXNz" and not "Basic b'...'".
        credentials = ('%s:%s' % (self.username, self.password)).encode('utf-8')
        auth_s = b64encode(credentials).decode('ascii')
        r.headers['Proxy-Authorization'] = 'Basic %s' % auth_s
        return r
# BUG FIX: the old requests API used here (return_response=False, r.send(),
# r.response, and the Python 2 ``print`` statement) was removed from the
# library long ago.  In modern requests we simply pass the auth object via
# the ``auth=`` keyword and it is applied to each request.
auth = HTTPProxyAuth('user', 'password')

response = requests.get('http://httpbin.org/', proxies=proxy, auth=auth)
print(response.text)

response = requests.get('http://httpbin.org/', proxies=proxy, auth=auth)
print(response.text)
Using Proxy with Selenium
Here is a scraping setup that uses an authenticated proxy with Selenium:
import os
import random
import zipfile
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
def get_driver(use_proxy=False, user_agent: list = None, ip_proxy: str = None, port: int = None, username: str = None,
               password: str = None, debug=True):
    """Selenium setup: build a Chrome WebDriver, optionally behind a proxy.

    Args:
        use_proxy: When True, install a Chrome extension that routes all
            traffic through an authenticated HTTP proxy.
        user_agent: Optional list of user-agent strings; one is chosen at
            random for the session. Skipped when empty or None (the original
            crashed on the None default).
        ip_proxy: Proxy host (required when use_proxy is True).
        port: Proxy port.
        username: Proxy authentication username.
        password: Proxy authentication password.
        debug: When False, Chrome runs with the --headless flag.

    Returns:
        A configured selenium.webdriver.Chrome instance.
    """
    options = Options()
    if use_proxy:
        print(f'Using Proxy: {ip_proxy}:{port}')
        options.add_extension(_build_proxy_plugin(ip_proxy, port, username, password))
    else:
        # Incognito is only used in the non-proxy branch, as in the original:
        # Chrome disables extensions in incognito windows by default.
        options.add_argument(argument='--incognito')
    # BUG FIX: the proxy branch previously added 'argument=<ua>' instead of
    # 'user-agent=<ua>', so the user agent was never applied when proxying.
    if user_agent:
        options.add_argument(argument=f'user-agent={random.choice(user_agent)}')
    if not debug:
        options.add_argument(argument='--headless')
    return webdriver.Chrome(ChromeDriverManager(path='temp/driver_path').install(), options=options)


def _build_proxy_plugin(ip_proxy, port, username, password):
    """Write a Chrome extension zip that configures an authenticated HTTP
    proxy (chrome.proxy fixed_servers + onAuthRequired credentials) and
    return the path to the zip file."""
    # BUG FIX: makedirs with exist_ok so a missing 'temp' parent directory
    # does not raise FileNotFoundError (mkdir only creates the last segment).
    os.makedirs('temp/plugins', exist_ok=True)
    manifest_json = """
    {
        "version": "1.0.0",
        "manifest_version": 2,
        "name": "Chrome Proxy",
        "permissions": [
            "proxy",
            "tabs",
            "unlimitedStorage",
            "storage",
            "<all_urls>",
            "webRequest",
            "webRequestBlocking"
        ],
        "background": {
            "scripts": ["background.js"]
        },
        "minimum_chrome_version":"22.0.0"
    }
    """
    background_js = """
    var config = {
        mode: "fixed_servers",
        rules: {
            singleProxy: {
                scheme: "http",
                host: "%s",
                port: parseInt(%s)
            },
            bypassList: ["localhost"]
        }
    };
    chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});
    function callbackFn(details) {
        return {
            authCredentials: {
                username: "%s",
                password: "%s"
            }
        };
    }
    chrome.webRequest.onAuthRequired.addListener(
        callbackFn,
        {urls: ["<all_urls>"]},
        ['blocking']
    );
    """ % (ip_proxy, port, username, password)
    plugin_file = 'temp/plugins/proxy_auth.zip'
    with zipfile.ZipFile(plugin_file, 'w') as zp:
        zp.writestr("manifest.json", manifest_json)
        zp.writestr("background.js", background_js)
    return plugin_file
Conclusion
With these techniques we can scrape without having to worry about our IP being blocked.