From 2e8e920dd4676d0245326f9f15cdb4bab12ff1fc Mon Sep 17 00:00:00 2001
From: la55u
Date: Wed, 4 Jul 2018 16:05:48 +0200
Subject: [PATCH 1/5] Added a way to filter proxies by protocol + fixed
 PremProxy provider

Proxies can now be filtered with an optional parameter of the RequestProxy
constructor. To support this, each parser has to set the protocol for every
proxy object it builds. Protocols are stored in the ProxyObject.Protocol enum.

PremProxy: parsing was broken because the port numbers are now 'encrypted'
and no longer stored in the CSS. They are instead obtained from a javascript
file that holds a function mapping keys to port numbers.
---
 .../requests/parsers/FreeProxyParser.py       | 14 +++--
 ...amairProxyParser.py => PremProxyParser.py} | 51 +++++++++----------
 .../requests/parsers/UrlParser.py             |  4 ++
 .../requests/parsers/jsunpacker.py            | 39 ++++++++++++++
 .../requests/proxy/ProxyObject.py             |  7 +++
 .../requests/proxy/requestProxy.py            | 27 ++++++----
 6 files changed, 101 insertions(+), 41 deletions(-)
 rename http_request_randomizer/requests/parsers/{SamairProxyParser.py => PremProxyParser.py} (73%)
 create mode 100644 http_request_randomizer/requests/parsers/jsunpacker.py

diff --git a/http_request_randomizer/requests/parsers/FreeProxyParser.py b/http_request_randomizer/requests/parsers/FreeProxyParser.py
index 1ff496f..112a26d 100644
--- a/http_request_randomizer/requests/parsers/FreeProxyParser.py
+++ b/http_request_randomizer/requests/parsers/FreeProxyParser.py
@@ -4,7 +4,7 @@
 from bs4 import BeautifulSoup
 
 from http_request_randomizer.requests.parsers.UrlParser import UrlParser
-from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel
+from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel, Protocol
 
 logger = logging.getLogger(__name__)
 __author__ = 'pgaref'
@@ -30,7 +30,7 @@ def parse_proxyList(self):
             headings = [th.get_text() for th in table.find("tr").find_all("th")]
 
             datasets = []
-            for row in table.find_all("tr")[1:]:
+            for row in table.find_all("tr")[1:-1]:
                 dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
                 if dataset:
                     datasets.append(dataset)
@@ -57,6 +57,7 @@ def create_proxy_object(self, dataset):
         port = None
         anonymity = AnonymityLevel.UNKNOWN
         country = None
+        protocols = []
         for field in dataset:
             if field[0] == 'IP Address':
                 # Make sure it is a Valid IP
@@ -71,8 +72,11 @@ def create_proxy_object(self, dataset):
                 anonymity = AnonymityLevel.get(field[1].strip())  # String strip()
             elif field[0] == 'Country':
                 country = field[1].strip()  # String strip()
-        return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
+            elif field[0] == 'Https':
+                if field[1].strip().lower() == 'yes': protocols.extend([Protocol.HTTP, Protocol.HTTPS])
+                elif field[1].strip().lower() == 'no': protocols.append(Protocol.HTTP)
+        return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country, protocols=protocols)
 
     def __str__(self):
-        return "FreeProxy Parser of '{0}' with required bandwidth: '{1}' KBs" \
-            .format(self.url, self.minimum_bandwidth_in_KBs)
+        return "{0} parser of '{1}' with required bandwidth: '{2}' KBs" \
+            .format(self.id, self.url, self.minimum_bandwidth_in_KBs)
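NOTE: The 'Https' branch above tags every proxy with the protocols it is
known to support. A minimal sketch of what the parser now produces (the
field values here are hypothetical; the constructor signature is the one
introduced in this patch):

    from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel, Protocol

    # A row whose 'Https' column said 'yes' gets both protocols
    proxy = ProxyObject(source='FreeProxy', ip='138.197.136.46', port='3128',
                        anonymity_level=AnonymityLevel.UNKNOWN, country='US',
                        protocols=[Protocol.HTTP, Protocol.HTTPS])
    assert Protocol.HTTPS in proxy.protocols  # survives an HTTPS-only filter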
diff --git a/http_request_randomizer/requests/parsers/SamairProxyParser.py b/http_request_randomizer/requests/parsers/PremProxyParser.py
similarity index 73%
rename from http_request_randomizer/requests/parsers/SamairProxyParser.py
rename to http_request_randomizer/requests/parsers/PremProxyParser.py
index 7b3d8c4..afeb06e 100644
--- a/http_request_randomizer/requests/parsers/SamairProxyParser.py
+++ b/http_request_randomizer/requests/parsers/PremProxyParser.py
@@ -3,16 +3,18 @@
 import requests
 from bs4 import BeautifulSoup
 
+from http_request_randomizer.requests.parsers.jsunpacker import JsUnpacker
 from http_request_randomizer.requests.parsers.UrlParser import UrlParser
-from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel
+from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel, Protocol
 
 logger = logging.getLogger(__name__)
 __author__ = 'pgaref'
 
 
 # Samair Proxy now renamed to: premproxy.com
-class SamairProxyParser(UrlParser):
+class PremProxyParser(UrlParser):
 
     def __init__(self, id, web_url, timeout=None):
+        self.base_url = web_url
         web_url += "/list/"
         UrlParser.__init__(self, id=id, web_url=web_url, timeout=timeout)
@@ -32,33 +34,30 @@ def parse_proxyList(self):
                 # Return proxies parsed so far
                 return curr_proxy_list
             content = response.content
-            soup = BeautifulSoup(content, "html.parser")
-            # css provides the port number so we reverse it
-            # for href in soup.findAll('link'):
-            #     if '/styles/' in href.get('href'):
-            #         style = "http://www.samair.ru" + href.get('href')
-            #         break
-            # css = requests.get(style).content.split('\n')
-            # css.pop()
-            # ports = {}
-            # for l in css:
-            #     p = l.split(' ')
-            #     key = p[0].split(':')[0][1:]
-            #     value = p[1].split('\"')[1]
-            #     ports[key] = value
+            soup = BeautifulSoup(content, "html.parser", from_encoding="iso-8859-1")
+            # js file contains the values for the ports
+            jsUrl = ''
+            for script in soup.findAll('script'):
+                if '/js/' in script.get('src'):
+                    jsUrl = self.base_url + script.get('src')
+                    #logger.debug('Found script url: '+jsUrl)
+                    break
+            jsUnpacker = JsUnpacker(jsUrl)
+            ports = jsUnpacker.get_ports()
             table = soup.find("div", attrs={"id": "proxylist"})
             # The first tr contains the field names.
             headings = [th.get_text() for th in table.find("tr").find_all("th")]
-            for row in table.find_all("tr")[1:]:
+            for row in table.find_all("tr")[1:-1]:
                 td_row = row.find("td")
-                # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
-                proxy_obj = self.create_proxy_object(row)
+                portKey = td_row.find('span', attrs={'class':True}).get('class')[0]
+                port = ports[portKey]
+                proxy_obj = self.create_proxy_object(row, port)
                 # Make sure it is a Valid Proxy Address
-                if proxy_obj is not None and UrlParser.valid_ip_port(td_row.text):
+                if proxy_obj is not None and UrlParser.valid_ip(proxy_obj.ip) and UrlParser.valid_port(port):
                     curr_proxy_list.append(proxy_obj)
                 else:
-                    logger.debug("Proxy Invalid: {}".format(td_row.text))
+                    logger.debug("Proxy Invalid: {}".format(proxy_obj.to_str()))
         except AttributeError as e:
             logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
         except KeyError as e:
@@ -87,7 +86,7 @@ def get_pagination_set(self):
         page_set.add("")
         return page_set
 
-    def create_proxy_object(self, row):
+    def create_proxy_object(self, row, port):
         for td_row in row.findAll("td"):
             if td_row.attrs['data-label'] == 'IP:port ':
                 text = td_row.text.strip()
@@ -96,13 +95,13 @@ def create_proxy_object(self, row, port):
                 if not UrlParser.valid_ip(ip):
                     logger.debug("IP with Invalid format: {}".format(ip))
                     return None
-                port = text.split(":")[1]
             elif td_row.attrs['data-label'] == 'Anonymity Type: ':
                 anonymity = AnonymityLevel.get(td_row.text.strip())
             elif td_row.attrs['data-label'] == 'Country: ':
                 country = td_row.text.strip()
-        return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
+        protocols = [Protocol.HTTP]
+        return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country, protocols=protocols)
 
     def __str__(self):
-        return "SemairProxy Parser of '{0}' with required bandwidth: '{1}' KBs" \
-            .format(self.url, self.minimum_bandwidth_in_KBs)
\ No newline at end of file
+        return "{0} parser of '{1}' with required bandwidth: '{2}' KBs" \
+            .format(self.id, self.url, self.minimum_bandwidth_in_KBs)
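NOTE: PremProxy no longer prints ports in the HTML; each row carries a
<span> whose CSS class is a key into the port map recovered from the packed
javascript. A runnable sketch of that lookup, using a hypothetical row and
port map (real values come from JsUnpacker, added below):

    from bs4 import BeautifulSoup

    row_html = '<tr><td data-label="IP:port ">191.252.61.28<span class="x2f4"></span></td></tr>'
    ports = {'x2f4': '80'}  # hypothetical key:port pair

    td_row = BeautifulSoup(row_html, 'html.parser').find('td')
    port_key = td_row.find('span', attrs={'class': True}).get('class')[0]
    print(port_key, ports[port_key])  # -> x2f4 80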
diff --git a/http_request_randomizer/requests/parsers/UrlParser.py b/http_request_randomizer/requests/parsers/UrlParser.py
index a0f0e09..f76f281 100644
--- a/http_request_randomizer/requests/parsers/UrlParser.py
+++ b/http_request_randomizer/requests/parsers/UrlParser.py
@@ -76,3 +76,7 @@ def valid_ip_port(address):
         if not match:
             return False
         return True
+
+    @staticmethod
+    def valid_port(port):
+        return 1 <= int(port) <= 65535
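NOTE: valid_port() assumes its argument is numeric; int(port) raises
ValueError on garbage input. A defensive variant (an assumption, not part
of this patch) would be:

    def valid_port(port):
        # Reject non-numeric input instead of raising
        try:
            return 1 <= int(port) <= 65535
        except (TypeError, ValueError):
            return False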
+ """ + # TODO: it might not be necessary to unpack the js code + + def __init__(self, jsFileUrl): + r = requests.get(jsFileUrl) + encrypted = r.text.strip() + encrypted = '(' + encrypted.split('}(')[1][:-1] + unpacked = eval('self.unpack' +encrypted) # string of the js code in unpacked form + matches = re.findall(r".*?\('\.([a-zA-Z0-9]{1,6})'\).*?\((\d+)\)", unpacked) + self.ports = dict((key, port) for key, port in matches) + #logger.debug('portmap: '+str(self.ports)) + + def baseN(self, num,b,numerals="0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"): + return ((num == 0) and numerals[0]) or (self.baseN(num // b, b, numerals).lstrip(numerals[0]) + numerals[num % b]) + + def unpack(self, p, a, c, k, e=None, d=None): + while (c): + c-=1 + if (k[c]): + p = re.sub("\\b" + self.baseN(c, a) + "\\b", k[c], p) + return p + + def get_port(self, key): + return self.ports[key] + + def get_ports(self): + return self.ports diff --git a/http_request_randomizer/requests/proxy/ProxyObject.py b/http_request_randomizer/requests/proxy/ProxyObject.py index fe48b7e..707f9ee 100644 --- a/http_request_randomizer/requests/proxy/ProxyObject.py +++ b/http_request_randomizer/requests/proxy/ProxyObject.py @@ -82,3 +82,10 @@ def get(cls, name): return cls(name) except ValueError: return cls.UNKNOWN + +class Protocol(Enum): + UNKNOWN = 0 + HTTP = 1 + HTTPS = 2 + SOCS4 = 3 + SOCS5 = 4 diff --git a/http_request_randomizer/requests/proxy/requestProxy.py b/http_request_randomizer/requests/proxy/requestProxy.py index 4ed6723..f349d90 100644 --- a/http_request_randomizer/requests/proxy/requestProxy.py +++ b/http_request_randomizer/requests/proxy/requestProxy.py @@ -10,11 +10,12 @@ from requests.exceptions import ConnectionError from requests.exceptions import ReadTimeout +from http_request_randomizer.requests.proxy.ProxyObject import Protocol from http_request_randomizer.requests.errors.ProxyListException import ProxyListException from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser from http_request_randomizer.requests.parsers.ProxyForEuParser import ProxyForEuParser from http_request_randomizer.requests.parsers.RebroWeeblyParser import RebroWeeblyParser -from http_request_randomizer.requests.parsers.SamairProxyParser import SamairProxyParser +from http_request_randomizer.requests.parsers.PremProxyParser import PremProxyParser from http_request_randomizer.requests.useragent.userAgent import UserAgentManager __author__ = 'pgaref' @@ -29,7 +30,7 @@ class RequestProxy: - def __init__(self, web_proxy_list=[], sustain=False, timeout=5): + def __init__(self, web_proxy_list=[], sustain=False, timeout=5, protocol=Protocol.HTTP): self.userAgent = UserAgentManager() self.logger = logging.getLogger() self.logger.addHandler(handler) @@ -40,9 +41,9 @@ def __init__(self, web_proxy_list=[], sustain=False, timeout=5): ##### parsers = list([]) parsers.append(FreeProxyParser('FreeProxy', 'http://free-proxy-list.net', timeout=timeout)) - parsers.append(ProxyForEuParser('ProxyForEU', 'http://proxyfor.eu/geo.php', 1.0, timeout=timeout)) - parsers.append(RebroWeeblyParser('ReBro', 'http://rebro.weebly.com', timeout=timeout)) - parsers.append(SamairProxyParser('Samair', 'https://premproxy.com', timeout=timeout)) + #parsers.append(ProxyForEuParser('ProxyForEU', 'http://proxyfor.eu/geo.php', 1.0, timeout=timeout)) <--doesn't work anymore + #parsers.append(RebroWeeblyParser('ReBro', 'http://rebro.weebly.com', timeout=timeout)) <--doesn't work anymore + parsers.append(PremProxyParser('PremProxy', 
diff --git a/http_request_randomizer/requests/proxy/ProxyObject.py b/http_request_randomizer/requests/proxy/ProxyObject.py
index fe48b7e..707f9ee 100644
--- a/http_request_randomizer/requests/proxy/ProxyObject.py
+++ b/http_request_randomizer/requests/proxy/ProxyObject.py
@@ -82,3 +82,10 @@ def get(cls, name):
             return cls(name)
         except ValueError:
             return cls.UNKNOWN
+
+class Protocol(Enum):
+    UNKNOWN = 0
+    HTTP = 1
+    HTTPS = 2
+    SOCS4 = 3
+    SOCS5 = 4
diff --git a/http_request_randomizer/requests/proxy/requestProxy.py b/http_request_randomizer/requests/proxy/requestProxy.py
index 4ed6723..f349d90 100644
--- a/http_request_randomizer/requests/proxy/requestProxy.py
+++ b/http_request_randomizer/requests/proxy/requestProxy.py
@@ -10,11 +10,12 @@
 from requests.exceptions import ConnectionError
 from requests.exceptions import ReadTimeout
 
+from http_request_randomizer.requests.proxy.ProxyObject import Protocol
 from http_request_randomizer.requests.errors.ProxyListException import ProxyListException
 from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser
 from http_request_randomizer.requests.parsers.ProxyForEuParser import ProxyForEuParser
 from http_request_randomizer.requests.parsers.RebroWeeblyParser import RebroWeeblyParser
-from http_request_randomizer.requests.parsers.SamairProxyParser import SamairProxyParser
+from http_request_randomizer.requests.parsers.PremProxyParser import PremProxyParser
 from http_request_randomizer.requests.useragent.userAgent import UserAgentManager
 
 __author__ = 'pgaref'
@@ -29,7 +30,7 @@
 
 class RequestProxy:
 
-    def __init__(self, web_proxy_list=[], sustain=False, timeout=5):
+    def __init__(self, web_proxy_list=[], sustain=False, timeout=5, protocol=Protocol.HTTP):
         self.userAgent = UserAgentManager()
         self.logger = logging.getLogger()
         self.logger.addHandler(handler)
@@ -40,9 +41,9 @@ def __init__(self, web_proxy_list=[], sustain=False, timeout=5):
         #####
         parsers = list([])
         parsers.append(FreeProxyParser('FreeProxy', 'http://free-proxy-list.net', timeout=timeout))
-        parsers.append(ProxyForEuParser('ProxyForEU', 'http://proxyfor.eu/geo.php', 1.0, timeout=timeout))
-        parsers.append(RebroWeeblyParser('ReBro', 'http://rebro.weebly.com', timeout=timeout))
-        parsers.append(SamairProxyParser('Samair', 'https://premproxy.com', timeout=timeout))
+        #parsers.append(ProxyForEuParser('ProxyForEU', 'http://proxyfor.eu/geo.php', 1.0, timeout=timeout)) <--doesn't work anymore
+        #parsers.append(RebroWeeblyParser('ReBro', 'http://rebro.weebly.com', timeout=timeout)) <--doesn't work anymore
+        parsers.append(PremProxyParser('PremProxy', 'https://premproxy.com', timeout=timeout))
 
         self.logger.debug("=== Initialized Proxy Parsers ===")
         for i in range(len(parsers)):
@@ -52,11 +53,17 @@ def __init__(self, web_proxy_list=[], sustain=False, timeout=5):
         self.sustain = sustain
         self.parsers = parsers
         self.proxy_list = web_proxy_list
-        for i in range(len(parsers)):
+        for parser in parsers:
             try:
-                self.proxy_list += parsers[i].parse_proxyList()
+                size = len(self.proxy_list)
+                self.proxy_list += parser.parse_proxyList()
+                self.logger.debug('Added {} proxies from {}'.format(len(self.proxy_list)-size, parser.id))
             except ReadTimeout:
-                self.logger.warning("Proxy Parser: '{}' TimedOut!".format(parsers[i].url))
+                self.logger.warning("Proxy Parser: '{}' TimedOut!".format(parser.url))
+        self.logger.debug('Total proxies = '+str(len(self.proxy_list)))
+        # filtering the list of available proxies according to user preferences
+        self.proxy_list = [p for p in self.proxy_list if protocol in p.protocols]
+        self.logger.debug('Filtered proxies = '+str(len(self.proxy_list)))
         self.current_proxy = self.randomize_proxy()
 
     def set_logger_level(self, level):
@@ -101,8 +108,8 @@ def generate_proxied_request(self, url, method="GET", params={}, data={}, header
 
         self.logger.debug("Using headers: {0}".format(str(headers)))
         self.logger.debug("Using proxy: {0}".format(str(self.current_proxy)))
-        request = requests.request(method, url, proxies={"http": self.current_proxy.get_address()},
-                                   headers=headers, data=data, params=params, timeout=req_timeout)
+        request = requests.request(method, url, headers=headers, data=data, params=params, timeout=req_timeout,
+                                   proxies={"http": self.current_proxy.get_address(), "https": self.current_proxy.get_address()})
 
         # Avoid HTTP request errors
         if request.status_code == 409:
             raise ConnectionError("HTTP Response [409] - Possible Cloudflare DNS resolution error")
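NOTE: Taken together, the changes in this patch let callers pre-filter the
proxy pool. A minimal usage sketch (example.com is a placeholder target;
the request may still fail if no working HTTPS-tagged proxy is found):

    from http_request_randomizer.requests.proxy.requestProxy import RequestProxy
    from http_request_randomizer.requests.proxy.ProxyObject import Protocol

    req_proxy = RequestProxy(protocol=Protocol.HTTPS)  # keep only HTTPS-capable proxies
    response = req_proxy.generate_proxied_request('https://example.com')
    if response is not None:  # assumed to be None when the proxied request failed
        print(response.status_code)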
From 532164a25dcfc94411c2577c9addf328c7ad9cbe Mon Sep 17 00:00:00 2001
From: la55u
Date: Wed, 4 Jul 2018 16:51:20 +0200
Subject: [PATCH 2/5] fixed travis-ci checks

---
 tests/test_providers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_providers.py b/tests/test_providers.py
index 7fe7998..73ca12c 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -12,7 +12,7 @@
 from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser
 from http_request_randomizer.requests.parsers.ProxyForEuParser import ProxyForEuParser
 from http_request_randomizer.requests.parsers.RebroWeeblyParser import RebroWeeblyParser
-from http_request_randomizer.requests.parsers.SamairProxyParser import SamairProxyParser
+from http_request_randomizer.requests.parsers.PremProxyParser import PremProxyParser
 
 __author__ = 'pgaref'
 
@@ -48,7 +48,7 @@ def test_RebroWeeblyParser(self):
 
     def test_SemairProxyParser(self):
         with HTTMock(samair_mock):
-            proxy_provider = SamairProxyParser('Samair', 'https://www.premproxy.com')
+            proxy_provider = PremProxyParser('Prem', 'https://www.premproxy.com')
             proxy_list = proxy_provider.parse_proxyList()
             proxy_list_addr = []
             for proxy in proxy_list:

From 458c7e638470f61ca06b0e6b9f333f2f5a2cddf9 Mon Sep 17 00:00:00 2001
From: la55u
Date: Wed, 4 Jul 2018 17:04:51 +0200
Subject: [PATCH 3/5] fixed runners

---
 http_request_randomizer/requests/runners/proxyList.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/http_request_randomizer/requests/runners/proxyList.py b/http_request_randomizer/requests/runners/proxyList.py
index 2a22a68..d8b58d2 100644
--- a/http_request_randomizer/requests/runners/proxyList.py
+++ b/http_request_randomizer/requests/runners/proxyList.py
@@ -7,7 +7,7 @@
 from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser
 from http_request_randomizer.requests.parsers.ProxyForEuParser import ProxyForEuParser
 from http_request_randomizer.requests.parsers.RebroWeeblyParser import RebroWeeblyParser
-from http_request_randomizer.requests.parsers.SamairProxyParser import SamairProxyParser
+from http_request_randomizer.requests.parsers.PremProxyParser import PremProxyParser
 
 __author__ = 'pgaref'
 
@@ -23,7 +23,7 @@ def __init__(self, timeout=1.0, bandwidth=10.0):
         # Each of the entries implements a specific URL Parser
         self.parsers = dict()
         self.parsers['rebro'] = RebroWeeblyParser('ReBro', 'http://rebro.weebly.com', timeout=timeout)
-        self.parsers['samair'] = SamairProxyParser('Samair', 'https://premproxy.com', timeout=timeout)
+        self.parsers['prem'] = SamairProxyParser('Prem', 'https://premproxy.com', timeout=timeout)
         self.parsers['freeproxy'] = FreeProxyParser('FreeProxy', 'http://free-proxy-list.net', timeout=timeout)
         self.parsers['proxyforeu'] = ProxyForEuParser('ProxyForEU', 'http://proxyfor.eu/geo.php', bandwidth=bandwidth, timeout=timeout)

From d13c2ece4cb765f0e4a1648175c86c8a5535bdf7 Mon Sep 17 00:00:00 2001
From: la55u
Date: Wed, 4 Jul 2018 17:18:50 +0200
Subject: [PATCH 4/5] fixed runners

---
 http_request_randomizer/requests/runners/proxyList.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/http_request_randomizer/requests/runners/proxyList.py b/http_request_randomizer/requests/runners/proxyList.py
index d8b58d2..fec850e 100644
--- a/http_request_randomizer/requests/runners/proxyList.py
+++ b/http_request_randomizer/requests/runners/proxyList.py
@@ -23,7 +23,7 @@ def __init__(self, timeout=1.0, bandwidth=10.0):
         # Each of the entries implements a specific URL Parser
         self.parsers = dict()
         self.parsers['rebro'] = RebroWeeblyParser('ReBro', 'http://rebro.weebly.com', timeout=timeout)
-        self.parsers['prem'] = SamairProxyParser('Prem', 'https://premproxy.com', timeout=timeout)
+        self.parsers['prem'] = PremProxyParser('Prem', 'https://premproxy.com', timeout=timeout)
         self.parsers['freeproxy'] = FreeProxyParser('FreeProxy', 'http://free-proxy-list.net', timeout=timeout)
         self.parsers['proxyforeu'] = ProxyForEuParser('ProxyForEU', 'http://proxyfor.eu/geo.php', bandwidth=bandwidth, timeout=timeout)
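NOTE: The test fixtures renamed in PATCH 5 below rely on HTTMock: any
request made inside the with-block whose host matches the @urlmatch pattern
is answered by the handler instead of the network, which is how the parser
tests run offline. A minimal sketch (the page body is a stand-in for the
real mock HTML):

    import requests
    from httmock import HTTMock, urlmatch

    @urlmatch(netloc=r'(.*\.)?www\.premproxy\.com')
    def prem_mock(url, request):
        return '<html>stub proxy list page</html>'

    with HTTMock(prem_mock):
        print(requests.get('https://www.premproxy.com/list/').text)  # served by the mock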
From 5ed3ba7ae9fca4a0fd20f93ede0f16ddc51aac8f Mon Sep 17 00:00:00 2001
From: la55u
Date: Wed, 4 Jul 2018 18:39:27 +0200
Subject: [PATCH 5/5] fixed tests (hopefully)

---
 tests/mocks.py          |  4 ++--
 tests/test_providers.py | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/mocks.py b/tests/mocks.py
index d05da3b..feabee9 100644
--- a/tests/mocks.py
+++ b/tests/mocks.py
@@ -4,7 +4,7 @@
 free_proxy_expected = ['138.197.136.46:3128', '177.207.75.227:8080']
 proxy_for_eu_expected = ['107.151.136.222:80', '37.187.253.39:8115']
 rebro_weebly_expected = ['213.149.105.12:8080', '119.188.46.42:8080']
-samair_expected = ['191.252.61.28:80', '167.114.203.141:8080', '152.251.141.93:8080']
+prem_expected = ['191.252.61.28:80', '167.114.203.141:8080', '152.251.141.93:8080']
 
 @urlmatch(netloc=r'(.*\.)?free-proxy-list\.net$')
 def free_proxy_mock(url, request):
@@ -135,7 +135,7 @@ def rebro_weebly_mock(url, request):
 
 
 @urlmatch(netloc=r'(.*\.)?www\.premproxy\.com')
-def samair_mock(url, request):
+def prem_mock(url, request):
     return """
 \n \n IP address
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 73ca12c..ac36fb4 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -7,8 +7,8 @@
 
 sys.path.insert(0, os.path.abspath('.'))
 
-from tests.mocks import free_proxy_mock, proxy_for_eu_mock, rebro_weebly_mock, samair_mock
-from tests.mocks import free_proxy_expected, proxy_for_eu_expected, rebro_weebly_expected, samair_expected
+from tests.mocks import free_proxy_mock, proxy_for_eu_mock, rebro_weebly_mock, prem_mock
+from tests.mocks import free_proxy_expected, proxy_for_eu_expected, rebro_weebly_expected, prem_expected
 from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser
 from http_request_randomizer.requests.parsers.ProxyForEuParser import ProxyForEuParser
 from http_request_randomizer.requests.parsers.RebroWeeblyParser import RebroWeeblyParser
@@ -46,14 +46,14 @@ def test_RebroWeeblyParser(self):
             proxy_list_addr.append(proxy.get_address())
         self.assertEqual(proxy_list_addr, rebro_weebly_expected)
 
-    def test_SemairProxyParser(self):
-        with HTTMock(samair_mock):
+    def test_PremProxyParser(self):
+        with HTTMock(prem_mock):
             proxy_provider = PremProxyParser('Prem', 'https://www.premproxy.com')
             proxy_list = proxy_provider.parse_proxyList()
             proxy_list_addr = []
             for proxy in proxy_list:
                 proxy_list_addr.append(proxy.get_address())
-        for item in samair_expected:
+        for item in prem_expected:
             self.assertTrue(item in proxy_list_addr)
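NOTE: test_PremProxyParser deliberately switched from assertEqual on the
whole list to a per-item membership check, so the test tolerates extra
proxies and ordering differences in the parsed result. The pattern in
isolation, with a hypothetical parse result:

    prem_expected = ['191.252.61.28:80', '167.114.203.141:8080', '152.251.141.93:8080']
    # hypothetical parse result: order differs and an extra proxy appears
    proxy_list_addr = ['152.251.141.93:8080', '191.252.61.28:80', '1.2.3.4:80', '167.114.203.141:8080']
    assert all(item in proxy_list_addr for item in prem_expected)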