diff --git a/http_request_randomizer/requests/parsers/FreeProxyParser.py b/http_request_randomizer/requests/parsers/FreeProxyParser.py
index 1ff496f..112a26d 100644
--- a/http_request_randomizer/requests/parsers/FreeProxyParser.py
+++ b/http_request_randomizer/requests/parsers/FreeProxyParser.py
@@ -4,7 +4,7 @@
 from bs4 import BeautifulSoup
 
 from http_request_randomizer.requests.parsers.UrlParser import UrlParser
-from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel
+from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel, Protocol
 
 logger = logging.getLogger(__name__)
 __author__ = 'pgaref'
@@ -30,7 +30,7 @@ def parse_proxyList(self):
             headings = [th.get_text() for th in table.find("tr").find_all("th")]
 
             datasets = []
-            for row in table.find_all("tr")[1:]:
+            for row in table.find_all("tr")[1:-1]:
                 dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
                 if dataset:
                     datasets.append(dataset)
@@ -57,6 +57,7 @@ def create_proxy_object(self, dataset):
         port = None
         anonymity = AnonymityLevel.UNKNOWN
         country = None
+        protocols = []
        for field in dataset:
             if field[0] == 'IP Address':
                 # Make sure it is a Valid IP
@@ -71,8 +72,11 @@ def create_proxy_object(self, dataset):
                 anonymity = AnonymityLevel.get(field[1].strip())  # String strip()
             elif field[0] == 'Country':
                 country = field[1].strip()  # String strip()
-        return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
+            elif field[0] == 'Https':
+                if field[1].strip().lower() == 'yes': protocols.extend([Protocol.HTTP, Protocol.HTTPS])
+                elif field[1].strip().lower() == 'no': protocols.append(Protocol.HTTP)
+        return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country, protocols=protocols)
 
     def __str__(self):
-        return "FreeProxy Parser of '{0}' with required bandwidth: '{1}' KBs" \
-            .format(self.url, self.minimum_bandwidth_in_KBs)
+        return "{0} parser of '{1}' with required bandwidth: '{2}' KBs" \
+            .format(self.id, self.url, self.minimum_bandwidth_in_KBs)
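The FreeProxy changes above do two things: the row loop now also drops the table's last row (`[1:-1]`), and the new `Https` column is translated into a `protocols` list on each `ProxyObject`. The mapping boils down to the sketch below (illustrative only, not part of the patch; the helper name is made up):

```python
# Illustrative sketch of the 'Https' column mapping used above (helper name is hypothetical).
from http_request_randomizer.requests.proxy.ProxyObject import Protocol

def protocols_from_https_field(https_value):
    value = https_value.strip().lower()
    if value == 'yes':
        return [Protocol.HTTP, Protocol.HTTPS]  # proxy also tunnels HTTPS
    elif value == 'no':
        return [Protocol.HTTP]                  # plain HTTP only
    return []                                   # anything else stays unknown and is filtered out later

assert protocols_from_https_field('Yes ') == [Protocol.HTTP, Protocol.HTTPS]
```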
diff --git a/http_request_randomizer/requests/parsers/SamairProxyParser.py b/http_request_randomizer/requests/parsers/PremProxyParser.py
similarity index 73%
rename from http_request_randomizer/requests/parsers/SamairProxyParser.py
rename to http_request_randomizer/requests/parsers/PremProxyParser.py
index 7b3d8c4..afeb06e 100644
--- a/http_request_randomizer/requests/parsers/SamairProxyParser.py
+++ b/http_request_randomizer/requests/parsers/PremProxyParser.py
@@ -3,16 +3,18 @@
 import requests
 from bs4 import BeautifulSoup
 
+from http_request_randomizer.requests.parsers.jsunpacker import JsUnpacker
 from http_request_randomizer.requests.parsers.UrlParser import UrlParser
-from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel
+from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel, Protocol
 
 logger = logging.getLogger(__name__)
 __author__ = 'pgaref'
 
 
 # Samair Proxy now renamed to: premproxy.com
-class SamairProxyParser(UrlParser):
+class PremProxyParser(UrlParser):
     def __init__(self, id, web_url, timeout=None):
+        self.base_url = web_url
         web_url += "/list/"
         UrlParser.__init__(self, id=id, web_url=web_url, timeout=timeout)
 
@@ -32,33 +34,30 @@ def parse_proxyList(self):
                 # Return proxies parsed so far
                 return curr_proxy_list
             content = response.content
-            soup = BeautifulSoup(content, "html.parser")
-            # css provides the port number so we reverse it
-            # for href in soup.findAll('link'):
-            #     if '/styles/' in href.get('href'):
-            #         style = "http://www.samair.ru" + href.get('href')
-            #         break
-            # css = requests.get(style).content.split('\n')
-            # css.pop()
-            # ports = {}
-            # for l in css:
-            #     p = l.split(' ')
-            #     key = p[0].split(':')[0][1:]
-            #     value = p[1].split('\"')[1]
-            #     ports[key] = value
+            soup = BeautifulSoup(content, "html.parser", from_encoding="iso-8859-1")
+            # js file contains the values for the ports
+            jsUrl = ''
+            for script in soup.findAll('script'):
+                if '/js/' in script.get('src'):
+                    jsUrl = self.base_url + script.get('src')
+                    #logger.debug('Found script url: '+jsUrl)
+                    break
+            jsUnpacker = JsUnpacker(jsUrl)
+            ports = jsUnpacker.get_ports()
 
             table = soup.find("div", attrs={"id": "proxylist"})
             # The first tr contains the field names.
             headings = [th.get_text() for th in table.find("tr").find_all("th")]
-            for row in table.find_all("tr")[1:]:
+            for row in table.find_all("tr")[1:-1]:
                 td_row = row.find("td")
-                # curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
-                proxy_obj = self.create_proxy_object(row)
+                portKey = td_row.find('span', attrs={'class':True}).get('class')[0]
+                port = ports[portKey]
+                proxy_obj = self.create_proxy_object(row, port)
                 # Make sure it is a Valid Proxy Address
-                if proxy_obj is not None and UrlParser.valid_ip_port(td_row.text):
+                if proxy_obj is not None and UrlParser.valid_ip(proxy_obj.ip) and UrlParser.valid_port(port):
                     curr_proxy_list.append(proxy_obj)
                 else:
-                    logger.debug("Proxy Invalid: {}".format(td_row.text))
+                    logger.debug("Proxy Invalid: {}".format(proxy_obj.to_str()))
         except AttributeError as e:
             logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
         except KeyError as e:
@@ -87,7 +86,7 @@ def get_pagination_set(self):
         page_set.add("")
         return page_set
 
-    def create_proxy_object(self, row):
+    def create_proxy_object(self, row, port):
         for td_row in row.findAll("td"):
             if td_row.attrs['data-label'] == 'IP:port ':
                 text = td_row.text.strip()
@@ -96,13 +95,13 @@ def create_proxy_object(self, row):
                 if not UrlParser.valid_ip(ip):
                     logger.debug("IP with Invalid format: {}".format(ip))
                     return None
-                port = text.split(":")[1]
             elif td_row.attrs['data-label'] == 'Anonymity Type: ':
                 anonymity = AnonymityLevel.get(td_row.text.strip())
             elif td_row.attrs['data-label'] == 'Country: ':
                 country = td_row.text.strip()
-        return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
+        protocols = [Protocol.HTTP]
+        return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country, protocols=protocols)
 
     def __str__(self):
-        return "SemairProxy Parser of '{0}' with required bandwidth: '{1}' KBs" \
-            .format(self.url, self.minimum_bandwidth_in_KBs)
\ No newline at end of file
+        return "{0} parser of '{1}' with required bandwidth: '{2}' KBs" \
+            .format(self.id, self.url, self.minimum_bandwidth_in_KBs)
diff --git a/http_request_randomizer/requests/parsers/UrlParser.py b/http_request_randomizer/requests/parsers/UrlParser.py
index a0f0e09..f76f281 100644
--- a/http_request_randomizer/requests/parsers/UrlParser.py
+++ b/http_request_randomizer/requests/parsers/UrlParser.py
@@ -76,3 +76,7 @@ def valid_ip_port(address):
         if not match:
             return False
         return True
+
+    @staticmethod
+    def valid_port(port):
+        return 1 <= int(port) <= 65535
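For context on the PremProxy rewrite: the port is no longer readable from the page markup. Each IP cell carries a `<span>` whose class name acts as a lookup key, and a packed JavaScript file served from `/js/` maps those keys to the real port numbers; that is why `parse_proxyList` now resolves `ports[portKey]` per row and why the new `JsUnpacker` module below exists. Roughly (class names and values are made up for illustration, not the site's actual markup):

```python
# Illustrative only: the shape of the data PremProxyParser now has to decode.
row_html = '<td data-label="IP:port "><span class="x1y2z3">191.252.61.28</span></td>'
ports = {'x1y2z3': '80'}   # built by JsUnpacker.get_ports() from the packed /js/ script
# The parser reads the span's class ('x1y2z3') and looks up ports['x1y2z3'] -> '80'.
```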
diff --git a/http_request_randomizer/requests/parsers/jsunpacker.py b/http_request_randomizer/requests/parsers/jsunpacker.py
new file mode 100644
index 0000000..b8433f4
--- /dev/null
+++ b/http_request_randomizer/requests/parsers/jsunpacker.py
@@ -0,0 +1,39 @@
+import re
+import requests
+import logging
+
+logger = logging.getLogger(__name__)
+
+class JsUnpacker:
+    """
+    It takes the javascript file's url which contains the port numbers for
+    the encrypted strings. The file has to be unpacked to a readable form just like
+    http://matthewfl.com/unPacker.html does. Then we create a dictionary for
+    every key:port pair.
+    """
+    # TODO: it might not be necessary to unpack the js code
+
+    def __init__(self, jsFileUrl):
+        r = requests.get(jsFileUrl)
+        encrypted = r.text.strip()
+        encrypted = '(' + encrypted.split('}(')[1][:-1]
+        unpacked = eval('self.unpack' + encrypted)  # string of the js code in unpacked form
+        matches = re.findall(r".*?\('\.([a-zA-Z0-9]{1,6})'\).*?\((\d+)\)", unpacked)
+        self.ports = dict((key, port) for key, port in matches)
+        #logger.debug('portmap: '+str(self.ports))
+
+    def baseN(self, num, b, numerals="0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"):
+        return ((num == 0) and numerals[0]) or (self.baseN(num // b, b, numerals).lstrip(numerals[0]) + numerals[num % b])
+
+    def unpack(self, p, a, c, k, e=None, d=None):
+        while (c):
+            c -= 1
+            if (k[c]):
+                p = re.sub("\\b" + self.baseN(c, a) + "\\b", k[c], p)
+        return p
+
+    def get_port(self, key):
+        return self.ports[key]
+
+    def get_ports(self):
+        return self.ports
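`JsUnpacker` reverses the common `eval(function(p,a,c,k,e,d){...})` packing: `unpack()` substitutes every base-N token in the packed payload `p` with its keyword from `k`, and the constructor then regex-scans the readable result for `('.<key>') ... (<port>)` pairs. Intended usage mirrors `PremProxyParser` above; the script URL and values below are placeholders, not real premproxy data:

```python
# Sketch only: how PremProxyParser drives JsUnpacker (URL and values are placeholders).
from http_request_randomizer.requests.parsers.jsunpacker import JsUnpacker

js_url = 'https://premproxy.com/js/xxxx.js'   # taken from the page's <script src="/js/..."> tag
ports = JsUnpacker(js_url).get_ports()        # e.g. {'x1y2z3': '8080', ...}

# Worked example of the unpack step itself:
#   p = "0('.1')2(3)",  k = ['document', 'x1y2z3', 'write', '8080'],  a = 62, c = 4
#   substituting token i -> k[i] gives "document('.x1y2z3')write(8080)"
#   and the constructor's regex turns that into {'x1y2z3': '8080'}.
```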
diff --git a/http_request_randomizer/requests/proxy/ProxyObject.py b/http_request_randomizer/requests/proxy/ProxyObject.py
index fe48b7e..707f9ee 100644
--- a/http_request_randomizer/requests/proxy/ProxyObject.py
+++ b/http_request_randomizer/requests/proxy/ProxyObject.py
@@ -82,3 +82,10 @@ def get(cls, name):
             return cls(name)
         except ValueError:
             return cls.UNKNOWN
+
+class Protocol(Enum):
+    UNKNOWN = 0
+    HTTP = 1
+    HTTPS = 2
+    SOCS4 = 3
+    SOCS5 = 4
diff --git a/http_request_randomizer/requests/proxy/requestProxy.py b/http_request_randomizer/requests/proxy/requestProxy.py
index 4ed6723..f349d90 100644
--- a/http_request_randomizer/requests/proxy/requestProxy.py
+++ b/http_request_randomizer/requests/proxy/requestProxy.py
@@ -10,11 +10,12 @@
 from requests.exceptions import ConnectionError
 from requests.exceptions import ReadTimeout
 
+from http_request_randomizer.requests.proxy.ProxyObject import Protocol
 from http_request_randomizer.requests.errors.ProxyListException import ProxyListException
 from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser
 from http_request_randomizer.requests.parsers.ProxyForEuParser import ProxyForEuParser
 from http_request_randomizer.requests.parsers.RebroWeeblyParser import RebroWeeblyParser
-from http_request_randomizer.requests.parsers.SamairProxyParser import SamairProxyParser
+from http_request_randomizer.requests.parsers.PremProxyParser import PremProxyParser
 from http_request_randomizer.requests.useragent.userAgent import UserAgentManager
 
 __author__ = 'pgaref'
@@ -29,7 +30,7 @@
 
 
 class RequestProxy:
-    def __init__(self, web_proxy_list=[], sustain=False, timeout=5):
+    def __init__(self, web_proxy_list=[], sustain=False, timeout=5, protocol=Protocol.HTTP):
         self.userAgent = UserAgentManager()
         self.logger = logging.getLogger()
         self.logger.addHandler(handler)
@@ -40,9 +41,9 @@ def __init__(self, web_proxy_list=[], sustain=False, timeout=5):
         #####
         parsers = list([])
         parsers.append(FreeProxyParser('FreeProxy', 'http://free-proxy-list.net', timeout=timeout))
-        parsers.append(ProxyForEuParser('ProxyForEU', 'http://proxyfor.eu/geo.php', 1.0, timeout=timeout))
-        parsers.append(RebroWeeblyParser('ReBro', 'http://rebro.weebly.com', timeout=timeout))
-        parsers.append(SamairProxyParser('Samair', 'https://premproxy.com', timeout=timeout))
+        #parsers.append(ProxyForEuParser('ProxyForEU', 'http://proxyfor.eu/geo.php', 1.0, timeout=timeout)) <--doesn't work anymore
+        #parsers.append(RebroWeeblyParser('ReBro', 'http://rebro.weebly.com', timeout=timeout)) <--doesn't work anymore
+        parsers.append(PremProxyParser('PremProxy', 'https://premproxy.com', timeout=timeout))
 
         self.logger.debug("=== Initialized Proxy Parsers ===")
         for i in range(len(parsers)):
@@ -52,11 +53,17 @@ def __init__(self, web_proxy_list=[], sustain=False, timeout=5):
         self.sustain = sustain
         self.parsers = parsers
         self.proxy_list = web_proxy_list
-        for i in range(len(parsers)):
+        for parser in parsers:
             try:
-                self.proxy_list += parsers[i].parse_proxyList()
+                size = len(self.proxy_list)
+                self.proxy_list += parser.parse_proxyList()
+                self.logger.debug('Added {} proxies from {}'.format(len(self.proxy_list)-size, parser.id))
             except ReadTimeout:
-                self.logger.warning("Proxy Parser: '{}' TimedOut!".format(parsers[i].url))
+                self.logger.warning("Proxy Parser: '{}' TimedOut!".format(parser.url))
+        self.logger.debug('Total proxies = '+str(len(self.proxy_list)))
+        # filtering the list of available proxies according to user preferences
+        self.proxy_list = [p for p in self.proxy_list if protocol in p.protocols]
+        self.logger.debug('Filtered proxies = '+str(len(self.proxy_list)))
         self.current_proxy = self.randomize_proxy()
 
     def set_logger_level(self, level):
@@ -101,8 +108,8 @@ def generate_proxied_request(self, url, method="GET", params={}, data={}, header
             self.logger.debug("Using headers: {0}".format(str(headers)))
             self.logger.debug("Using proxy: {0}".format(str(self.current_proxy)))
 
-            request = requests.request(method, url, proxies={"http": self.current_proxy.get_address()},
-                                        headers=headers, data=data, params=params, timeout=req_timeout)
+            request = requests.request(method, url, headers=headers, data=data, params=params, timeout=req_timeout,
+                                        proxies={"http": self.current_proxy.get_address(), "https": self.current_proxy.get_address()})
             # Avoid HTTP request errors
             if request.status_code == 409:
                 raise ConnectionError("HTTP Response [409] - Possible Cloudflare DNS resolution error")
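With the new `protocol` argument, `RequestProxy` keeps only proxies whose parser tagged them with that protocol; since `PremProxyParser` tags everything `Protocol.HTTP` in this patch, asking for HTTPS effectively keeps only FreeProxy entries whose `Https` column was 'yes'. A minimal usage sketch follows (the target URL is a placeholder):

```python
# Minimal usage sketch of the new protocol filter (target URL is a placeholder).
from http_request_randomizer.requests.proxy.requestProxy import RequestProxy
from http_request_randomizer.requests.proxy.ProxyObject import Protocol

req_proxy = RequestProxy(protocol=Protocol.HTTPS)   # keep only proxies that advertise HTTPS
response = req_proxy.generate_proxied_request('https://example.com')
if response is not None:                            # None means the proxied request failed
    print(response.status_code, req_proxy.current_proxy.get_address())
```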
diff --git a/http_request_randomizer/requests/runners/proxyList.py b/http_request_randomizer/requests/runners/proxyList.py
index 2a22a68..fec850e 100644
--- a/http_request_randomizer/requests/runners/proxyList.py
+++ b/http_request_randomizer/requests/runners/proxyList.py
@@ -7,7 +7,7 @@
 from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser
 from http_request_randomizer.requests.parsers.ProxyForEuParser import ProxyForEuParser
 from http_request_randomizer.requests.parsers.RebroWeeblyParser import RebroWeeblyParser
-from http_request_randomizer.requests.parsers.SamairProxyParser import SamairProxyParser
+from http_request_randomizer.requests.parsers.PremProxyParser import PremProxyParser
 
 __author__ = 'pgaref'
 
@@ -23,7 +23,7 @@ def __init__(self, timeout=1.0, bandwidth=10.0):
         # Each of the entries implements a specific URL Parser
         self.parsers = dict()
         self.parsers['rebro'] = RebroWeeblyParser('ReBro', 'http://rebro.weebly.com', timeout=timeout)
-        self.parsers['samair'] = SamairProxyParser('Samair', 'https://premproxy.com', timeout=timeout)
+        self.parsers['prem'] = PremProxyParser('Prem', 'https://premproxy.com', timeout=timeout)
         self.parsers['freeproxy'] = FreeProxyParser('FreeProxy', 'http://free-proxy-list.net', timeout=timeout)
         self.parsers['proxyforeu'] = ProxyForEuParser('ProxyForEU', 'http://proxyfor.eu/geo.php', bandwidth=bandwidth,
                                                       timeout=timeout)
diff --git a/tests/mocks.py b/tests/mocks.py
index d05da3b..feabee9 100644
--- a/tests/mocks.py
+++ b/tests/mocks.py
@@ -4,7 +4,7 @@
 free_proxy_expected = ['138.197.136.46:3128', '177.207.75.227:8080']
 proxy_for_eu_expected = ['107.151.136.222:80', '37.187.253.39:8115']
 rebro_weebly_expected = ['213.149.105.12:8080', '119.188.46.42:8080']
-samair_expected = ['191.252.61.28:80', '167.114.203.141:8080', '152.251.141.93:8080']
+prem_expected = ['191.252.61.28:80', '167.114.203.141:8080', '152.251.141.93:8080']
 
 @urlmatch(netloc=r'(.*\.)?free-proxy-list\.net$')
 def free_proxy_mock(url, request):
@@ -135,7 +135,7 @@ def rebro_weebly_mock(url, request):
 
 
 @urlmatch(netloc=r'(.*\.)?www\.premproxy\.com')
-def samair_mock(url, request):
+def prem_mock(url, request):
     return """