diff --git a/.travis.yml b/.travis.yml index d0bf2415..f908c350 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,6 +7,8 @@ env: - TOXENV=py33 - TOXENV=py34 - TOXENV=py35 + - TOXENV=mypy2 + - TOXENV=mypy3 install: - pip install -U tox twine wheel codecov diff --git a/setup.py b/setup.py index 0bab41d8..cad76f72 100644 --- a/setup.py +++ b/setup.py @@ -27,5 +27,5 @@ 'Programming Language :: Python :: Implementation :: PyPy', 'Topic :: Internet :: WWW/HTTP', ], - install_requires=['six >= 1.4.1'], + install_requires=['six >= 1.4.1', 'typing'], ) diff --git a/tests/test_encoding.py b/tests/test_encoding.py index df2e5ce4..b1bcecbb 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -3,6 +3,7 @@ from w3lib.encoding import (html_body_declared_encoding, read_bom, to_unicode, http_content_type_encoding, resolve_encoding, html_to_unicode) + class RequestEncodingTests(unittest.TestCase): utf8_fragments = [ # Content-Type as meta http-equiv @@ -51,6 +52,7 @@ def test_html_body_declared_encoding(self): for fragment in self.utf8_fragments: encoding = html_body_declared_encoding(fragment) self.assertEqual(encoding, 'utf-8', fragment) + self.assertEqual(None, html_body_declared_encoding(b"something else")) self.assertEqual(None, html_body_declared_encoding(b"""
@@ -76,6 +78,11 @@ def test_html_body_declared_encoding_unicode(self): self.assertEqual(None, html_body_declared_encoding( u"""""")) + def test_html_body_declared_encoding_aliases(self): + fragment = b"""""" + self.assertEqual("cp1251", html_body_declared_encoding(fragment)) + self.assertEqual("cp1251", html_body_declared_encoding(fragment.decode('utf8'))) + class CodecsEncodingTestCase(unittest.TestCase): def test_resolve_encoding(self): @@ -97,9 +104,11 @@ def test_invalid_utf8(self): def ct(charset): return "Content-Type: text/html; charset=" + charset if charset else None + def norm_encoding(enc): return codecs.lookup(enc).name + class HtmlConversionTests(unittest.TestCase): def test_unicode_body(self): diff --git a/tox.ini b/tox.ini index 5fb4f780..4a0318ae 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py27, pypy, py33, py34, py35 +envlist = py27, pypy, py33, py34, py35, mypy2, mypy3 [testenv] deps = @@ -12,3 +12,21 @@ deps = pytest-cov commands = py.test --cov=w3lib --cov-report= {posargs:w3lib tests} + + +[testenv:mypy2] +basepython = python3.5 +deps = + mypy-lang + typing +commands = + mypy --py2 w3lib tests + + +[testenv:mypy3] +basepython = python3.5 +deps = + mypy-lang + typing +commands = + mypy w3lib tests diff --git a/w3lib/_types.py b/w3lib/_types.py new file mode 100644 index 00000000..324a3e49 --- /dev/null +++ b/w3lib/_types.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +""" +Which string type to use? +========================= + +1. Variable is an URL ==> use ``str`` +2. Variable is binary; unicode is not accepted ==> use ``bytes`` +3. Variable is text, and it can be only unicode in Python 2 ==> use + ``six.text_type`` (or typing.Text??) +4. Variable is text, but it can be ascii or utf8-encoded str + in Python 2 ==> use w3lib._types.String +5. Variable can be either bytes or unicode both in Python 2 + and Python 3 ==> use typing.AnyStr +6. Variable should be str (==bytes) in Python 2 + and str (==unicode) in Python 3 ==> use ``str``. + +""" +from __future__ import absolute_import +from typing import Union +import six + +if six.PY2: + String = Union[bytes, unicode] +else: + String = str diff --git a/w3lib/encoding.py b/w3lib/encoding.py index b7370367..31ec04ae 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -2,11 +2,20 @@ """ Functions for handling encoding of web pages """ -import re, codecs, encodings +import re +import codecs +import encodings # type: ignore +from typing import Optional, AnyStr, Tuple, Callable, Union +import six + +from .util import to_native_str + _HEADER_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I) + def http_content_type_encoding(content_type): + # type: (str) -> Optional[str] """Extract the encoding in the content-type header >>> import w3lib.encoding @@ -20,6 +29,7 @@ def http_content_type_encoding(content_type): if match: return resolve_encoding(match.group(1)) + # regexp for parsing HTTP meta tags _TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?''' _SKIP_ATTRS = '''(?x)(?:\\s+ @@ -39,12 +49,14 @@ def http_content_type_encoding(content_type): _XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P["\'])(?P(\d*\.)?\d+)\s*;\s*url=\s*(?P .*?)(?P=quote)'), re.DOTALL | re.IGNORECASE) _cdata_re = re.compile(r'((?P .*?)(?P \]\]>))', re.DOTALL) + def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'): r""" @@ -35,7 +38,9 @@ def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'): return replace_entities(text, keep, remove_illegal, encoding) + def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'): + # type: (AnyStr, Sequence[String], bool, String) -> six.text_type u"""Remove entities from the given `text` by converting them to their corresponding unicode character. @@ -93,14 +98,19 @@ def convert_entity(m): return _ent_re.sub(convert_entity, to_unicode(text, encoding)) + def has_entities(text, encoding=None): + # type: (AnyStr, Optional[String]) -> bool return bool(_ent_re.search(to_unicode(text, encoding))) + def replace_tags(text, token='', encoding=None): + # type: (AnyStr, String, Optional[String]) -> six.text_type + """Replace all markup tags found in the given `text` by the given token. By default `token` is an empty string so it just removes all tags. - `text` can be a unicode string or a regular string encoded as `encoding` + `text` can be a unicode string or a byte string encoded as `encoding` (or ``'utf-8'`` if `encoding` is not given.) Always returns a unicode string. @@ -121,6 +131,7 @@ def replace_tags(text, token='', encoding=None): _REMOVECOMMENTS_RE = re.compile(u'', re.DOTALL) def remove_comments(text, encoding=None): + # type: (AnyStr, Optional[String]) -> six.text_type """ Remove HTML Comments. >>> import w3lib.html @@ -130,10 +141,13 @@ def remove_comments(text, encoding=None): """ - text = to_unicode(text, encoding) - return _REMOVECOMMENTS_RE.sub(u'', text) + text_unicode = to_unicode(text, encoding) + return _REMOVECOMMENTS_RE.sub(u'', text_unicode) + def remove_tags(text, which_ones=(), keep=(), encoding=None): + # type: (AnyStr, Sequence[String], Sequence[String], Optional[String]) -> six.text_type + """ Remove HTML Tags only. `which_ones` and `keep` are both tuples, there are four cases: @@ -182,26 +196,28 @@ def remove_tags(text, which_ones=(), keep=(), encoding=None): assert not (which_ones and keep), 'which_ones and keep can not be given at the same time' - which_ones = {tag.lower() for tag in which_ones} - keep = {tag.lower() for tag in keep} + which_ones_ = {tag.lower() for tag in which_ones} + keep_ = {tag.lower() for tag in keep} def will_remove(tag): tag = tag.lower() - if which_ones: - return tag in which_ones + if which_ones_: + return tag in which_ones_ else: - return tag not in keep + return tag not in keep_ def remove_tag(m): tag = m.group(1) return u'' if will_remove(tag) else m.group(0) - regex = '?([^ >/]+).*?>' + regex = u'?([^ >/]+).*?>' retags = re.compile(regex, re.DOTALL | re.IGNORECASE) return retags.sub(remove_tag, to_unicode(text, encoding)) + def remove_tags_with_content(text, which_ones=(), encoding=None): + # type: (AnyStr, Sequence[String], Optional[String]) -> six.text_type """Remove tags and their content. `which_ones` is a tuple of which tags to remove including their content. @@ -215,16 +231,18 @@ def remove_tags_with_content(text, which_ones=(), encoding=None): """ - text = to_unicode(text, encoding) + text_unicode = to_unicode(text, encoding) if which_ones: - tags = '|'.join([r'<%s.*?%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones]) + tags = u'|'.join([r'<%s.*?%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones]) retags = re.compile(tags, re.DOTALL | re.IGNORECASE) - text = retags.sub(u'', text) - return text + text_unicode = retags.sub(u'', text_unicode) + return text_unicode -def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'', \ - encoding=None): +def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by='', + encoding=None): + # type: (AnyStr, Sequence[String], String, Optional[String]) -> six.text_type + """Remove escape characters. `which_ones` is a tuple of which escape characters we want to remove. @@ -235,12 +253,15 @@ def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'', \ """ - text = to_unicode(text, encoding) + text_unicode = to_unicode(text, encoding) for ec in which_ones: - text = text.replace(ec, to_unicode(replace_by, encoding)) - return text + text_unicode = text_unicode.replace(ec, to_unicode(replace_by, encoding)) + return text_unicode + def unquote_markup(text, keep=(), remove_illegal=True, encoding=None): + # type: (AnyStr, Sequence[String], bool, Optional[String]) -> six.text_type + """ This function receives markup as a text (always a unicode string or a UTF-8 encoded string) and does the following: @@ -261,27 +282,29 @@ def _get_fragments(txt, pattern): offset = match_e yield txt[offset:] - text = to_unicode(text, encoding) + text_unicode = to_unicode(text, encoding) ret_text = u'' - for fragment in _get_fragments(text, _cdata_re): + for fragment in _get_fragments(text_unicode, _cdata_re): if isinstance(fragment, six.string_types): # it's not a CDATA (so we try to remove its entities) - ret_text += replace_entities(fragment, keep=keep, remove_illegal=remove_illegal) + # XXX: mypy has problems with six.string_types, + # had to ignore this type check + ret_text += replace_entities(fragment, keep=keep, remove_illegal=remove_illegal) # type: ignore else: # it's a CDATA (so we just extract its content) ret_text += fragment.group('cdata_d') return ret_text + def get_base_url(text, baseurl='', encoding='utf-8'): + # type: (AnyStr, str, String) -> str """Return the base url if declared in the given HTML `text`, relative to the given base url. If no base url is found, the given `baseurl` is returned. - """ - - text = to_unicode(text, encoding) - m = _baseurl_re.search(text) + text_unicode = to_unicode(text, encoding) + m = _baseurl_re.search(text_unicode) if m: return moves.urllib.parse.urljoin( safe_url_string(baseurl), @@ -290,30 +313,30 @@ def get_base_url(text, baseurl='', encoding='utf-8'): else: return safe_url_string(baseurl) + def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')): + # type: (AnyStr, str, String, Sequence[String]) -> Union[Tuple[float, str], Tuple[None, None]] """Return the http-equiv parameter of the HTML meta element from the given - HTML text and return a tuple ``(interval, url)`` where interval is an integer + HTML text and return a tuple ``(interval, url)`` where interval is a number containing the delay in seconds (or zero if not present) and url is a string with the absolute url to redirect. If no meta redirect is found, ``(None, None)`` is returned. """ - - if six.PY2: - baseurl = to_bytes(baseurl, encoding) + baseurl_str = to_native_str(baseurl) try: - text = to_unicode(text, encoding) + text_unicode = to_unicode(text, encoding) except UnicodeDecodeError: print(text) raise - text = remove_tags_with_content(text, ignore_tags) - text = remove_comments(replace_entities(text)) - m = _meta_refresh_re.search(text) + text_unicode = remove_tags_with_content(text_unicode, ignore_tags) + text_unicode = remove_comments(replace_entities(text_unicode)) + m = _meta_refresh_re.search(text_unicode) if m: interval = float(m.group('int')) url = safe_url_string(m.group('url').strip(' "\''), encoding) - url = moves.urllib.parse.urljoin(baseurl, url) + url = moves.urllib.parse.urljoin(baseurl_str, url) return interval, url else: return None, None diff --git a/w3lib/http.py b/w3lib/http.py index 8c5dfedd..9ef20ac7 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -1,7 +1,12 @@ +from __future__ import absolute_import from base64 import urlsafe_b64encode +from typing import Dict, List, Optional, Union, Any + +from ._types import String def headers_raw_to_dict(headers_raw): + # type: (Optional[bytes]) -> Optional[Dict[bytes, List[bytes]]] r""" Convert raw headers (single multi-line bytestring) to a dictionary. @@ -10,7 +15,7 @@ def headers_raw_to_dict(headers_raw): >>> import w3lib.http >>> w3lib.http.headers_raw_to_dict(b"Content-type: text/html\n\rAccept: gzip\n\n") # doctest: +SKIP - {'Content-type': ['text/html'], 'Accept': ['gzip']} + {b'Content-type': [b'text/html'], b'Accept': [b'gzip']} Incorrect input: @@ -37,6 +42,7 @@ def headers_raw_to_dict(headers_raw): def headers_dict_to_raw(headers_dict): + # type: (Optional[Dict[bytes, Union[bytes, List[bytes]]]]) -> Optional[bytes] r""" Returns a raw HTTP headers representation of headers @@ -69,6 +75,7 @@ def headers_dict_to_raw(headers_dict): def basic_auth_header(username, password): + # type: (String, String) -> bytes """ Return an `Authorization` header field value for `HTTP Basic Access Authentication (RFC 2617)`_ @@ -79,8 +86,7 @@ def basic_auth_header(username, password): .. _HTTP Basic Access Authentication (RFC 2617): http://www.ietf.org/rfc/rfc2617.txt """ - - auth = "%s:%s" % (username, password) + auth = "%s:%s" % (username, password) # type: Any if not isinstance(auth, bytes): # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1 # seems to be the most widely used encoding here. See also: diff --git a/w3lib/url.py b/w3lib/url.py index 2d0bc106..cfe973a4 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -11,13 +11,18 @@ from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit, urldefrag, urlencode, urlparse, quote, parse_qs, parse_qsl) -from six.moves.urllib.request import pathname2url, url2pathname +from six.moves.urllib.request import pathname2url, url2pathname # type: ignore +from typing import AnyStr, Union, Set, Sequence, TypeVar + from w3lib.util import to_bytes, to_native_str, to_unicode +from w3lib._types import String + +T = TypeVar('T') # error handling function for bytes-to-Unicode decoding errors with URLs def _quote_byte(error): - return (to_unicode(quote(error.object[error.start:error.end])), error.end) + return to_unicode(quote(error.object[error.start:error.end])), error.end codecs.register_error('percentencode', _quote_byte) @@ -61,11 +66,13 @@ def urljoin_rfc(base, ref, encoding='utf-8'): str_ref = to_bytes(ref, encoding) return urljoin(str_base, str_ref) -_reserved = b';/?:@&=+$|,#' # RFC 3986 (Generic Syntax) -_unreserved_marks = b"-_.!~*'()" # RFC 3986 sec 2.3 +_reserved = b';/?:@&=+$|,#' # RFC 3986 (Generic Syntax) +_unreserved_marks = b"-_.!~*'()" # RFC 3986 sec 2.3 _safe_chars = _ALWAYS_SAFE_BYTES + b'%' + _reserved + _unreserved_marks + def safe_url_string(url, encoding='utf8', path_encoding='utf8'): + # type: (AnyStr, String, String) -> str """Convert the given URL into a legal URL by escaping unsafe characters according to RFC-3986. @@ -107,7 +114,9 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'): _parent_dirs = re.compile(r'/?(\.\./)+') + def safe_download_url(url): + # type: (str) -> str """ Make a url for download. This will call safe_url_string and then strip the fragment, if one exists. The path will be normalised. @@ -125,10 +134,14 @@ def safe_download_url(url): path = '/' return urlunsplit((scheme, netloc, path, query, '')) + def is_url(text): + # type: (String) -> bool return text.partition("://")[0] in ('file', 'http', 'https') -def url_query_parameter(url, parameter, default=None, keep_blank_values=0): + +def url_query_parameter(url, parameter, default=None, keep_blank_values=False): + # type: (str, String, T, bool) -> Union[str, T] """Return the value of a url parameter, given the url and parameter name General case: @@ -144,14 +157,14 @@ def url_query_parameter(url, parameter, default=None, keep_blank_values=0): 'mydefault' >>> - Returns None if `keep_blank_values` not set or 0 (default): + Returns None if `keep_blank_values` not set or False (default): >>> w3lib.url.url_query_parameter("product.html?id=", "id") >>> - Returns an empty string if `keep_blank_values` set to 1: + Returns an empty string if `keep_blank_values` set to True: - >>> w3lib.url.url_query_parameter("product.html?id=", "id", keep_blank_values=1) + >>> w3lib.url.url_query_parameter("product.html?id=", "id", keep_blank_values=True) '' >>> @@ -161,9 +174,13 @@ def url_query_parameter(url, parameter, default=None, keep_blank_values=0): urlsplit(str(url))[3], keep_blank_values=keep_blank_values ) - return queryparams.get(parameter, [default])[0] + # mypy shows 'List item 0 has incompatible type "_T"' error + values = queryparams.get(parameter, [default]) # type: ignore + return values[0] + def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True): + # type: (str, Union[str, Sequence[str]], str, str, bool, bool) -> str """Clean URL arguments leaving only those passed in the parameterlist keeping order >>> import w3lib.url @@ -193,7 +210,7 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u parameterlist = [parameterlist] url = urldefrag(url)[0] base, _, query = url.partition('?') - seen = set() + seen = set() # type: Set[str] querylist = [] for ksv in query.split(sep): k, _, _ = ksv.partition(kvsep) @@ -208,7 +225,9 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u seen.add(k) return '?'.join([base, sep.join(querylist)]) if querylist else base + def add_or_replace_parameter(url, name, new_value): + # type: (str, str, str) -> str """Add or remove a parameter to a given url >>> import w3lib.url @@ -237,10 +256,13 @@ def add_or_replace_parameter(url, name, new_value): new_args.append((name, new_value)) query = urlencode(new_args) - return urlunsplit(parsed._replace(query=query)) + + # "SplitResult" has no attribute "_replace" - looks like a bug in typeshed + return urlunsplit(parsed._replace(query=query)) # type: ignore def path_to_file_uri(path): + # type: (str) -> str """Convert local filesystem path to legal File URIs as described in: http://en.wikipedia.org/wiki/File_URI_scheme """ @@ -249,14 +271,18 @@ def path_to_file_uri(path): x = x.replace('|', ':') # http://bugs.python.org/issue5861 return 'file:///%s' % x.lstrip('/') + def file_uri_to_path(uri): + # type: (str) -> str """Convert File URI to local filesystem path according to: http://en.wikipedia.org/wiki/File_URI_scheme """ uri_path = urlparse(uri).path return url2pathname(uri_path) + def any_to_uri(uri_or_path): + # type: (str) -> str """If given a path name, return its File URI, otherwise return it unmodified """ diff --git a/w3lib/util.py b/w3lib/util.py index d8513eef..3015525d 100644 --- a/w3lib/util.py +++ b/w3lib/util.py @@ -1,4 +1,8 @@ import six +from typing import AnyStr, Optional + +from ._types import String + def str_to_unicode(text, encoding=None, errors='strict'): if encoding is None: @@ -7,6 +11,7 @@ def str_to_unicode(text, encoding=None, errors='strict'): return text.decode(encoding, errors) return text + def unicode_to_str(text, encoding=None, errors='strict'): if encoding is None: encoding = 'utf-8' @@ -14,19 +19,23 @@ def unicode_to_str(text, encoding=None, errors='strict'): return text.encode(encoding, errors) return text + def to_unicode(text, encoding=None, errors='strict'): + # type: (AnyStr, Optional[String], String) -> six.text_type """Return the unicode representation of a bytes object `text`. If `text` is already an unicode object, return it as-is.""" if isinstance(text, six.text_type): return text - if not isinstance(text, (bytes, six.text_type)): + if not isinstance(text, bytes): raise TypeError('to_unicode must receive a bytes, str or unicode ' 'object, got %s' % type(text).__name__) if encoding is None: encoding = 'utf-8' return text.decode(encoding, errors) + def to_bytes(text, encoding=None, errors='strict'): + # type: (AnyStr, Optional[String], String) -> bytes """Return the binary representation of `text`. If `text` is already a bytes object, return it as-is.""" if isinstance(text, bytes): @@ -38,7 +47,9 @@ def to_bytes(text, encoding=None, errors='strict'): encoding = 'utf-8' return text.encode(encoding, errors) + def to_native_str(text, encoding=None, errors='strict'): + # type: (AnyStr, Optional[String], String) -> str """ Return str representation of `text` (bytes in Python 2.x and unicode in Python 3.x). """ if six.PY2: