14 changes: 9 additions & 5 deletions http_request_randomizer/requests/parsers/FreeProxyParser.py
@@ -4,7 +4,7 @@
from bs4 import BeautifulSoup

from http_request_randomizer.requests.parsers.UrlParser import UrlParser
from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel
from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel, Protocol

logger = logging.getLogger(__name__)
__author__ = 'pgaref'
@@ -30,7 +30,7 @@ def parse_proxyList(self):
headings = [th.get_text() for th in table.find("tr").find_all("th")]

datasets = []
for row in table.find_all("tr")[1:]:
for row in table.find_all("tr")[1:-1]:
dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
if dataset:
datasets.append(dataset)
@@ -57,6 +57,7 @@ def create_proxy_object(self, dataset):
port = None
anonymity = AnonymityLevel.UNKNOWN
country = None
protocols = []
for field in dataset:
if field[0] == 'IP Address':
# Make sure it is a Valid IP
@@ -71,8 +72,11 @@
anonymity = AnonymityLevel.get(field[1].strip()) # String strip()
elif field[0] == 'Country':
country = field[1].strip() # String strip()
return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
elif field[0] == 'Https':
if field[1].strip().lower() == 'yes': protocols.extend([Protocol.HTTP, Protocol.HTTPS])
elif field[1].strip().lower() == 'no': protocols.append(Protocol.HTTP)
return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country, protocols=protocols)

def __str__(self):
return "FreeProxy Parser of '{0}' with required bandwidth: '{1}' KBs" \
.format(self.url, self.minimum_bandwidth_in_KBs)
return "{0} parser of '{1}' with required bandwidth: '{2}' KBs" \
.format(self.id, self.url, self.minimum_bandwidth_in_KBs)
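A minimal offline sketch of the row handling above, using made-up table markup rather than free-proxy-list.net's live page: the [1:-1] slice drops both the heading row and the trailing footer row, and the Https column decides which Protocol values a parsed proxy advertises.

from bs4 import BeautifulSoup
from http_request_randomizer.requests.proxy.ProxyObject import Protocol

html = """<table>
<tr><th>IP Address</th><th>Port</th><th>Https</th></tr>
<tr><td>10.0.0.1</td><td>8080</td><td>yes</td></tr>
<tr><td colspan="3">footer row, skipped by the [1:-1] slice</td></tr>
</table>"""

table = BeautifulSoup(html, "html.parser").find("table")
headings = [th.get_text() for th in table.find("tr").find_all("th")]
for row in table.find_all("tr")[1:-1]:
    dataset = list(zip(headings, (td.get_text() for td in row.find_all("td"))))
    protocols = []
    for field, value in dataset:
        if field == 'Https':
            protocols = [Protocol.HTTP, Protocol.HTTPS] if value.lower() == 'yes' else [Protocol.HTTP]
    print(dataset, protocols)  # one data row, advertising [HTTP, HTTPS]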
http_request_randomizer/requests/parsers/PremProxyParser.py (renamed from SamairProxyParser.py)
@@ -3,16 +3,18 @@
import requests
from bs4 import BeautifulSoup

from http_request_randomizer.requests.parsers.jsunpacker import JsUnpacker
from http_request_randomizer.requests.parsers.UrlParser import UrlParser
from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel
from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel, Protocol

logger = logging.getLogger(__name__)
__author__ = 'pgaref'


# Samair Proxy now renamed to: premproxy.com
class SamairProxyParser(UrlParser):
class PremProxyParser(UrlParser):
def __init__(self, id, web_url, timeout=None):
self.base_url = web_url
web_url += "/list/"
UrlParser.__init__(self, id=id, web_url=web_url, timeout=timeout)

@@ -32,33 +34,30 @@ def parse_proxyList(self):
# Return proxies parsed so far
return curr_proxy_list
content = response.content
soup = BeautifulSoup(content, "html.parser")
# css provides the port number so we reverse it
# for href in soup.findAll('link'):
# if '/styles/' in href.get('href'):
# style = "http://www.samair.ru" + href.get('href')
# break
# css = requests.get(style).content.split('\n')
# css.pop()
# ports = {}
# for l in css:
# p = l.split(' ')
# key = p[0].split(':')[0][1:]
# value = p[1].split('\"')[1]
# ports[key] = value
soup = BeautifulSoup(content, "html.parser", from_encoding="iso-8859-1")
# js file contains the values for the ports
jsUrl = ''
for script in soup.findAll('script'):
if '/js/' in script.get('src'):
jsUrl = self.base_url + script.get('src')
#logger.debug('Found script url: '+jsUrl)
break
jsUnpacker = JsUnpacker(jsUrl)
ports = jsUnpacker.get_ports()

table = soup.find("div", attrs={"id": "proxylist"})
# The first tr contains the field names.
headings = [th.get_text() for th in table.find("tr").find_all("th")]
for row in table.find_all("tr")[1:]:
for row in table.find_all("tr")[1:-1]:
td_row = row.find("td")
# curr_proxy_list.append('http://' + row.text + ports[row['class'][0]])
proxy_obj = self.create_proxy_object(row)
portKey = td_row.find('span', attrs={'class':True}).get('class')[0]
port = ports[portKey]
proxy_obj = self.create_proxy_object(row, port)
# Make sure it is a Valid Proxy Address
if proxy_obj is not None and UrlParser.valid_ip_port(td_row.text):
if proxy_obj is not None and UrlParser.valid_ip(proxy_obj.ip) and UrlParser.valid_port(port):
curr_proxy_list.append(proxy_obj)
else:
logger.debug("Proxy Invalid: {}".format(td_row.text))
logger.debug("Proxy Invalid: {}".format(proxy_obj.to_str()))
except AttributeError as e:
logger.error("Provider {0} failed with Attribute error: {1}".format(self.id, e))
except KeyError as e:
@@ -87,7 +86,7 @@ def get_pagination_set(self):
page_set.add("")
return page_set

def create_proxy_object(self, row):
def create_proxy_object(self, row, port):
for td_row in row.findAll("td"):
if td_row.attrs['data-label'] == 'IP:port ':
text = td_row.text.strip()
@@ -96,13 +95,13 @@ def create_proxy_object(self, row):
if not UrlParser.valid_ip(ip):
logger.debug("IP with Invalid format: {}".format(ip))
return None
port = text.split(":")[1]
elif td_row.attrs['data-label'] == 'Anonymity Type: ':
anonymity = AnonymityLevel.get(td_row.text.strip())
elif td_row.attrs['data-label'] == 'Country: ':
country = td_row.text.strip()
return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country)
protocols = [Protocol.HTTP]
return ProxyObject(source=self.id, ip=ip, port=port, anonymity_level=anonymity, country=country, protocols=protocols)

def __str__(self):
return "SemairProxy Parser of '{0}' with required bandwidth: '{1}' KBs" \
.format(self.url, self.minimum_bandwidth_in_KBs)
return "{0} parser of '{1}' with required bandwidth: '{2}' KBs" \
.format(self.id, self.url, self.minimum_bandwidth_in_KBs)
4 changes: 4 additions & 0 deletions http_request_randomizer/requests/parsers/UrlParser.py
@@ -76,3 +76,7 @@ def valid_ip_port(address):
if not match:
return False
return True

@staticmethod
def valid_port(port):
return 1 <= int(port) <= 65535
39 changes: 39 additions & 0 deletions http_request_randomizer/requests/parsers/jsunpacker.py
@@ -0,0 +1,39 @@
import re
import requests
import logging

logger = logging.getLogger(__name__)

class JsUnpacker:
"""
Takes the URL of the JavaScript file that holds the port numbers for the
obfuscated list entries. The packed file is first unpacked into readable form,
much like http://matthewfl.com/unPacker.html does, and a dictionary is then
built for every key:port pair.
"""
# TODO: it might not be necessary to unpack the js code

def __init__(self, jsFileUrl):
r = requests.get(jsFileUrl)
encrypted = r.text.strip()
encrypted = '(' + encrypted.split('}(')[1][:-1]
unpacked = eval('self.unpack' +encrypted) # string of the js code in unpacked form
matches = re.findall(r".*?\('\.([a-zA-Z0-9]{1,6})'\).*?\((\d+)\)", unpacked)
self.ports = dict((key, port) for key, port in matches)
#logger.debug('portmap: '+str(self.ports))

def baseN(self, num,b,numerals="0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"):
return ((num == 0) and numerals[0]) or (self.baseN(num // b, b, numerals).lstrip(numerals[0]) + numerals[num % b])

def unpack(self, p, a, c, k, e=None, d=None):
while (c):
c-=1
if (k[c]):
p = re.sub("\\b" + self.baseN(c, a) + "\\b", k[c], p)
return p

def get_port(self, key):
return self.ports[key]

def get_ports(self):
return self.ports
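A small offline illustration of what JsUnpacker ends up with, assuming already-unpacked JavaScript of the shape the premproxy pages use ($('.class').html(port)); the class names and ports here are invented for the example.

import re

unpacked_js = "$('.x1').html(8080);$('.y2').html(3128);"
matches = re.findall(r".*?\('\.([a-zA-Z0-9]{1,6})'\).*?\((\d+)\)", unpacked_js)
ports = dict(matches)
print(ports)  # {'x1': '8080', 'y2': '3128'}, keyed by the CSS class used in the table rows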
7 changes: 7 additions & 0 deletions http_request_randomizer/requests/proxy/ProxyObject.py
@@ -82,3 +82,10 @@ def get(cls, name):
return cls(name)
except ValueError:
return cls.UNKNOWN

class Protocol(Enum):
UNKNOWN = 0
HTTP = 1
HTTPS = 2
SOCS4 = 3
SOCS5 = 4
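A short sketch of how the new protocols field is meant to be read, built with the same keyword arguments the parsers above use (the values are illustrative).

from http_request_randomizer.requests.proxy.ProxyObject import ProxyObject, AnonymityLevel, Protocol

proxy = ProxyObject(source='FreeProxy', ip='10.0.0.1', port='8080',
                    anonymity_level=AnonymityLevel.UNKNOWN, country='DE',
                    protocols=[Protocol.HTTP, Protocol.HTTPS])
print(Protocol.HTTPS in proxy.protocols)  # True, so it survives an HTTPS filter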
27 changes: 17 additions & 10 deletions http_request_randomizer/requests/proxy/requestProxy.py
@@ -10,11 +10,12 @@
from requests.exceptions import ConnectionError
from requests.exceptions import ReadTimeout

from http_request_randomizer.requests.proxy.ProxyObject import Protocol
from http_request_randomizer.requests.errors.ProxyListException import ProxyListException
from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser
from http_request_randomizer.requests.parsers.ProxyForEuParser import ProxyForEuParser
from http_request_randomizer.requests.parsers.RebroWeeblyParser import RebroWeeblyParser
from http_request_randomizer.requests.parsers.SamairProxyParser import SamairProxyParser
from http_request_randomizer.requests.parsers.PremProxyParser import PremProxyParser
from http_request_randomizer.requests.useragent.userAgent import UserAgentManager

__author__ = 'pgaref'
@@ -29,7 +30,7 @@


class RequestProxy:
def __init__(self, web_proxy_list=[], sustain=False, timeout=5):
def __init__(self, web_proxy_list=[], sustain=False, timeout=5, protocol=Protocol.HTTP):
self.userAgent = UserAgentManager()
self.logger = logging.getLogger()
self.logger.addHandler(handler)
@@ -40,9 +41,9 @@ def __init__(self, web_proxy_list=[], sustain=False, timeout=5):
#####
parsers = list([])
parsers.append(FreeProxyParser('FreeProxy', 'http://free-proxy-list.net', timeout=timeout))
parsers.append(ProxyForEuParser('ProxyForEU', 'http://proxyfor.eu/geo.php', 1.0, timeout=timeout))
parsers.append(RebroWeeblyParser('ReBro', 'http://rebro.weebly.com', timeout=timeout))
parsers.append(SamairProxyParser('Samair', 'https://premproxy.com', timeout=timeout))
#parsers.append(ProxyForEuParser('ProxyForEU', 'http://proxyfor.eu/geo.php', 1.0, timeout=timeout)) <--doesn't work anymore
#parsers.append(RebroWeeblyParser('ReBro', 'http://rebro.weebly.com', timeout=timeout)) <--doesn't work anymore
parsers.append(PremProxyParser('PremProxy', 'https://premproxy.com', timeout=timeout))

self.logger.debug("=== Initialized Proxy Parsers ===")
for i in range(len(parsers)):
@@ -52,11 +53,17 @@ def __init__(self, web_proxy_list=[], sustain=False, timeout=5):
self.sustain = sustain
self.parsers = parsers
self.proxy_list = web_proxy_list
for i in range(len(parsers)):
for parser in parsers:
try:
self.proxy_list += parsers[i].parse_proxyList()
size = len(self.proxy_list)
self.proxy_list += parser.parse_proxyList()
self.logger.debug('Added {} proxies from {}'.format(len(self.proxy_list)-size, parser.id))
except ReadTimeout:
self.logger.warning("Proxy Parser: '{}' TimedOut!".format(parsers[i].url))
self.logger.warning("Proxy Parser: '{}' TimedOut!".format(parser.url))
self.logger.debug('Total proxies = '+str(len(self.proxy_list)))
# filtering the list of available proxies according to user preferences
self.proxy_list = [p for p in self.proxy_list if protocol in p.protocols]
self.logger.debug('Filtered proxies = '+str(len(self.proxy_list)))
self.current_proxy = self.randomize_proxy()

def set_logger_level(self, level):
@@ -101,8 +108,8 @@ def generate_proxied_request(self, url, method="GET", params={}, data={}, header

self.logger.debug("Using headers: {0}".format(str(headers)))
self.logger.debug("Using proxy: {0}".format(str(self.current_proxy)))
request = requests.request(method, url, proxies={"http": self.current_proxy.get_address()},
headers=headers, data=data, params=params, timeout=req_timeout)
request = requests.request(method, url, headers=headers, data=data, params=params, timeout=req_timeout,
proxies={"http": self.current_proxy.get_address(), "https": self.current_proxy.get_address()})
# Avoid HTTP request errors
if request.status_code == 409:
raise ConnectionError("HTTP Response [409] - Possible Cloudflare DNS resolution error")
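A hedged usage sketch of the new constructor parameter: only proxies advertising the requested protocol survive the filter, and the proxied request now routes both http and https traffic through the chosen proxy (the target URL is a placeholder).

from http_request_randomizer.requests.proxy.requestProxy import RequestProxy
from http_request_randomizer.requests.proxy.ProxyObject import Protocol

req_proxy = RequestProxy(protocol=Protocol.HTTP)  # default value, shown explicitly
response = req_proxy.generate_proxied_request('http://example.com')
if response is not None:
    print(response.status_code)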
4 changes: 2 additions & 2 deletions http_request_randomizer/requests/runners/proxyList.py
@@ -7,7 +7,7 @@
from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser
from http_request_randomizer.requests.parsers.ProxyForEuParser import ProxyForEuParser
from http_request_randomizer.requests.parsers.RebroWeeblyParser import RebroWeeblyParser
from http_request_randomizer.requests.parsers.SamairProxyParser import SamairProxyParser
from http_request_randomizer.requests.parsers.PremProxyParser import PremProxyParser

__author__ = 'pgaref'

@@ -23,7 +23,7 @@ def __init__(self, timeout=1.0, bandwidth=10.0):
# Each of the entries implements a specific URL Parser
self.parsers = dict()
self.parsers['rebro'] = RebroWeeblyParser('ReBro', 'http://rebro.weebly.com', timeout=timeout)
self.parsers['samair'] = SamairProxyParser('Samair', 'https://premproxy.com', timeout=timeout)
self.parsers['prem'] = PremProxyParser('Prem', 'https://premproxy.com', timeout=timeout)
self.parsers['freeproxy'] = FreeProxyParser('FreeProxy', 'http://free-proxy-list.net', timeout=timeout)
self.parsers['proxyforeu'] = ProxyForEuParser('ProxyForEU', 'http://proxyfor.eu/geo.php',
bandwidth=bandwidth, timeout=timeout)
4 changes: 2 additions & 2 deletions tests/mocks.py
@@ -4,7 +4,7 @@
free_proxy_expected = ['138.197.136.46:3128', '177.207.75.227:8080']
proxy_for_eu_expected = ['107.151.136.222:80', '37.187.253.39:8115']
rebro_weebly_expected = ['213.149.105.12:8080', '119.188.46.42:8080']
samair_expected = ['191.252.61.28:80', '167.114.203.141:8080', '152.251.141.93:8080']
prem_expected = ['191.252.61.28:80', '167.114.203.141:8080', '152.251.141.93:8080']

@urlmatch(netloc=r'(.*\.)?free-proxy-list\.net$')
def free_proxy_mock(url, request):
@@ -135,7 +135,7 @@ def rebro_weebly_mock(url, request):


@urlmatch(netloc=r'(.*\.)?www\.premproxy\.com')
def samair_mock(url, request):
def prem_mock(url, request):
return """<div id="proxylist">\n
<tr class="anon">\n
<th><a href="/list/ip-address-01.htm" title="Proxy List sorted by ip address">IP address</a></th>
14 changes: 7 additions & 7 deletions tests/test_providers.py
@@ -7,12 +7,12 @@

sys.path.insert(0, os.path.abspath('.'))

from tests.mocks import free_proxy_mock, proxy_for_eu_mock, rebro_weebly_mock, samair_mock
from tests.mocks import free_proxy_expected, proxy_for_eu_expected, rebro_weebly_expected, samair_expected
from tests.mocks import free_proxy_mock, proxy_for_eu_mock, rebro_weebly_mock, prem_mock
from tests.mocks import free_proxy_expected, proxy_for_eu_expected, rebro_weebly_expected, prem_expected
from http_request_randomizer.requests.parsers.FreeProxyParser import FreeProxyParser
from http_request_randomizer.requests.parsers.ProxyForEuParser import ProxyForEuParser
from http_request_randomizer.requests.parsers.RebroWeeblyParser import RebroWeeblyParser
from http_request_randomizer.requests.parsers.SamairProxyParser import SamairProxyParser
from http_request_randomizer.requests.parsers.PremProxyParser import PremProxyParser

__author__ = 'pgaref'

@@ -46,14 +46,14 @@ def test_RebroWeeblyParser(self):
proxy_list_addr.append(proxy.get_address())
self.assertEqual(proxy_list_addr, rebro_weebly_expected)

def test_SemairProxyParser(self):
with HTTMock(samair_mock):
proxy_provider = SamairProxyParser('Samair', 'https://www.premproxy.com')
def test_PremProxyParser(self):
with HTTMock(prem_mock):
proxy_provider = PremProxyParser('Prem', 'https://www.premproxy.com')
proxy_list = proxy_provider.parse_proxyList()
proxy_list_addr = []
for proxy in proxy_list:
proxy_list_addr.append(proxy.get_address())
for item in samair_expected:
for item in prem_expected:
self.assertTrue(item in proxy_list_addr)

