From c38f5da9ba010399df7f59395c0ad164aa311735 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 8 Jul 2016 02:12:55 +0500 Subject: [PATCH 1/8] type hints --- .travis.yml | 4 ++ setup.py | 2 +- tests/test_encoding.py | 9 ++++ tox.ini | 20 ++++++++- w3lib/_types.py | 25 +++++++++++ w3lib/encoding.py | 38 +++++++++++++---- w3lib/html.py | 96 ++++++++++++++++++++++++++---------------- w3lib/http.py | 12 ++++-- w3lib/url.py | 34 +++++++++++---- w3lib/util.py | 13 +++++- 10 files changed, 194 insertions(+), 59 deletions(-) create mode 100644 w3lib/_types.py diff --git a/.travis.yml b/.travis.yml index dfeee83b..9b6642a9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,6 +18,10 @@ matrix: sudo: true - python: 3.5 env: TOXENV=pypy3 + - python: 3.6 + env: TOXENV=mypy2 + - python: 3.6 + env: TOXENV=mypy3 install: - | diff --git a/setup.py b/setup.py index 2ae088c9..44fdf0e2 100644 --- a/setup.py +++ b/setup.py @@ -29,5 +29,5 @@ 'Programming Language :: Python :: Implementation :: PyPy', 'Topic :: Internet :: WWW/HTTP', ], - install_requires=['six >= 1.4.1'], + install_requires=['six >= 1.4.1', 'typing'], ) diff --git a/tests/test_encoding.py b/tests/test_encoding.py index 649c189a..5c2ff0f4 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -3,6 +3,7 @@ from w3lib.encoding import (html_body_declared_encoding, read_bom, to_unicode, http_content_type_encoding, resolve_encoding, html_to_unicode) + class RequestEncodingTests(unittest.TestCase): utf8_fragments = [ # Content-Type as meta http-equiv @@ -51,6 +52,7 @@ def test_html_body_declared_encoding(self): for fragment in self.utf8_fragments: encoding = html_body_declared_encoding(fragment) self.assertEqual(encoding, 'utf-8', fragment) + self.assertEqual(None, html_body_declared_encoding(b"something else")) self.assertEqual(None, html_body_declared_encoding(b""" @@ -76,6 +78,11 @@ def test_html_body_declared_encoding_unicode(self): self.assertEqual(None, html_body_declared_encoding( u"""""")) + def test_html_body_declared_encoding_aliases(self): + fragment = b"""""" + self.assertEqual("cp1251", html_body_declared_encoding(fragment)) + self.assertEqual("cp1251", html_body_declared_encoding(fragment.decode('utf8'))) + class CodecsEncodingTestCase(unittest.TestCase): def test_resolve_encoding(self): @@ -97,9 +104,11 @@ def test_invalid_utf8(self): def ct(charset): return "Content-Type: text/html; charset=" + charset if charset else None + def norm_encoding(enc): return codecs.lookup(enc).name + class HtmlConversionTests(unittest.TestCase): def test_unicode_body(self): diff --git a/tox.ini b/tox.ini index 0e5a0b39..0b213434 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py27, pypy, py34, py35, py36, py37, pypy3 +envlist = py27, pypy, py34, py35, py36, py37, pypy3, mypy2, mypy3 [testenv] deps = @@ -15,3 +15,21 @@ commands = --doctest-modules \ --cov=w3lib --cov-report=term \ {posargs:w3lib tests} + + +[testenv:mypy2] +basepython = python3.6 +deps = + mypy-lang + typing +commands = + mypy --py2 w3lib tests + + +[testenv:mypy3] +basepython = python3.6 +deps = + mypy-lang + typing +commands = + mypy w3lib tests diff --git a/w3lib/_types.py b/w3lib/_types.py new file mode 100644 index 00000000..e8e7117d --- /dev/null +++ b/w3lib/_types.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +""" +Which string type to use? +========================= + +1. Variable is an URL ==> use ``str`` +2. Variable is binary; unicode is not accepted ==> use ``bytes`` +3. Variable is text, and it can be only unicode in Python 2 ==> use + ``six.text_type`` (or typing.Text??) +4. Variable is text, but it can be str in Python 2 ==> use w3lib._types.String +5. Variable can be either bytes or unicode both in Python 2 + and Python 3 ==> use typing.AnyStr +6. Variable should be str (==bytes) in Python 2 + and str (==unicode) in Python 3 ==> use ``str``. + +""" + +from __future__ import absolute_import +from typing import Union +import six + +if six.PY2: + String = Union[bytes, unicode] +else: + String = str diff --git a/w3lib/encoding.py b/w3lib/encoding.py index c7ac567f..b965baeb 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -2,12 +2,21 @@ """ Functions for handling encoding of web pages """ -import re, codecs, encodings +import re +import codecs +import encodings # type: ignore from sys import version_info +from typing import Optional, Union, AnyStr, Tuple, Callable +import six + +from .util import to_native_str +from ._types import String _HEADER_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I) + def http_content_type_encoding(content_type): + # type: (str) -> Optional[str] """Extract the encoding in the content-type header >>> import w3lib.encoding @@ -21,6 +30,7 @@ def http_content_type_encoding(content_type): if match: return resolve_encoding(match.group(1)) + # regexp for parsing HTTP meta tags _TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?''' _SKIP_ATTRS = '''(?:\\s+ @@ -40,13 +50,15 @@ def http_content_type_encoding(content_type): _XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P[\w-]+)') # check for meta tags, or xml decl. and stop search if a body tag is encountered -_BODY_ENCODING_PATTERN = r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % ( +_BODY_ENCODING_PATTERN = six.u(r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)') % ( _SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE) _BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I | re.VERBOSE) _BODY_ENCODING_BYTES_RE = re.compile(_BODY_ENCODING_PATTERN.encode('ascii'), re.I | re.VERBOSE) + def html_body_declared_encoding(html_body_str): + # type: (AnyStr) -> Optional[str] '''Return the encoding specified in meta tags in the html body, or ``None`` if no suitable encoding was found @@ -79,7 +91,8 @@ def html_body_declared_encoding(html_body_str): encoding = match.group('charset') or match.group('charset2') \ or match.group('xmlcharset') if encoding: - return resolve_encoding(encoding) + return resolve_encoding(to_native_str(encoding)) + # Default encoding translation # this maps cannonicalized encodings to target encodings @@ -109,6 +122,7 @@ def html_body_declared_encoding(html_body_str): } def _c18n_encoding(encoding): + # type: (AnyStr) -> str """Cannonicalize an encoding name This performs normalization and translates aliases using python's @@ -117,7 +131,9 @@ def _c18n_encoding(encoding): normed = encodings.normalize_encoding(encoding).lower() return encodings.aliases.aliases.get(normed, normed) + def resolve_encoding(encoding_alias): + # type: (AnyStr) -> Optional[str] """Return the encoding that `encoding_alias` maps to, or ``None`` if the encoding cannot be interpreted @@ -136,6 +152,7 @@ def resolve_encoding(encoding_alias): except LookupError: return None + _BOM_TABLE = [ (codecs.BOM_UTF32_BE, 'utf-32-be'), (codecs.BOM_UTF32_LE, 'utf-32-le'), @@ -145,7 +162,9 @@ def resolve_encoding(encoding_alias): ] _FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE) + def read_bom(data): + # type: (bytes) -> Tuple[str, bytes] r"""Read the byte order mark in the text, if present, and return the encoding represented by the BOM and the BOM. @@ -153,13 +172,13 @@ def read_bom(data): >>> import w3lib.encoding >>> w3lib.encoding.read_bom(b'\xfe\xff\x6c\x34') - ('utf-16-be', '\xfe\xff') + ('utf-16-be', b'\xfe\xff') >>> w3lib.encoding.read_bom(b'\xff\xfe\x34\x6c') - ('utf-16-le', '\xff\xfe') + ('utf-16-le', b'\xff\xfe') >>> w3lib.encoding.read_bom(b'\x00\x00\xfe\xff\x00\x00\x6c\x34') - ('utf-32-be', '\x00\x00\xfe\xff') + ('utf-32-be', b'\x00\x00\xfe\xff') >>> w3lib.encoding.read_bom(b'\xff\xfe\x00\x00\x34\x6c\x00\x00') - ('utf-32-le', '\xff\xfe\x00\x00') + ('utf-32-le', b'\xff\xfe\x00\x00') >>> w3lib.encoding.read_bom(b'\x01\x02\x03\x04') (None, None) >>> @@ -175,9 +194,10 @@ def read_bom(data): # Python decoder doesn't follow unicode standard when handling # bad utf-8 encoded strings. see http://bugs.python.org/issue8271 -codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.end)) +codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.end)) # type: ignore def to_unicode(data_str, encoding): + # type: (bytes, str) -> six.text_type """Convert a str object to unicode using the encoding given Characters that cannot be converted will be converted to ``\\ufffd`` (the @@ -185,8 +205,10 @@ def to_unicode(data_str, encoding): """ return data_str.decode(encoding, 'replace' if version_info[0:2] >= (3, 3) else 'w3lib_replace') + def html_to_unicode(content_type_header, html_body_str, default_encoding='utf8', auto_detect_fun=None): + # type: (Optional[str], bytes, str, Optional[Callable[[bytes], str]]) -> Tuple[str, six.text_type] r'''Convert raw html bytes to unicode This attempts to make a reasonable guess at the content encoding of the diff --git a/w3lib/html.py b/w3lib/html.py index 9990a351..bbaf439a 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -7,12 +7,14 @@ import re import six from six import moves +from typing import AnyStr, Optional, Iterable, Tuple, Union, Sequence -from w3lib.util import to_bytes, to_unicode +from w3lib.util import to_bytes, to_unicode, to_native_str from w3lib.url import safe_url_string +from w3lib._types import String -_ent_re = re.compile(r'&((?P[a-z\d]+)|#(?P\d+)|#x(?P[a-f\d]+))(?P;?)', re.IGNORECASE) -_tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL) +_ent_re = re.compile(six.u(r'&((?P[a-z\d]+)|#(?P\d+)|#x(?P[a-f\d]+))(?P;?)'), re.IGNORECASE) +_tag_re = re.compile(six.u(r'<[a-zA-Z\/!].*?>'), re.DOTALL) _baseurl_re = re.compile(six.u(r']*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']'), re.I) _meta_refresh_re = re.compile(six.u(r']*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P["\'])(?P(\d*\.)?\d+)\s*;\s*url=\s*(?P.*?)(?P=quote)'), re.DOTALL | re.IGNORECASE) _cdata_re = re.compile(r'((?P.*?)(?P\]\]>))', re.DOTALL) @@ -38,7 +40,9 @@ def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'): return replace_entities(text, keep, remove_illegal, encoding) + def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'): + # type: (AnyStr, Sequence[String], bool, String) -> six.text_type u"""Remove entities from the given `text` by converting them to their corresponding unicode character. @@ -96,14 +100,19 @@ def convert_entity(m): return _ent_re.sub(convert_entity, to_unicode(text, encoding)) + def has_entities(text, encoding=None): + # type: (AnyStr, Optional[String]) -> bool return bool(_ent_re.search(to_unicode(text, encoding))) + def replace_tags(text, token='', encoding=None): + # type: (AnyStr, String, Optional[String]) -> six.text_type + """Replace all markup tags found in the given `text` by the given token. By default `token` is an empty string so it just removes all tags. - `text` can be a unicode string or a regular string encoded as `encoding` + `text` can be a unicode string or a byte string encoded as `encoding` (or ``'utf-8'`` if `encoding` is not given.) Always returns a unicode string. @@ -124,6 +133,7 @@ def replace_tags(text, token='', encoding=None): _REMOVECOMMENTS_RE = re.compile(u'', re.DOTALL) def remove_comments(text, encoding=None): + # type: (AnyStr, Optional[String]) -> six.text_type """ Remove HTML Comments. >>> import w3lib.html @@ -133,10 +143,13 @@ def remove_comments(text, encoding=None): """ - text = to_unicode(text, encoding) - return _REMOVECOMMENTS_RE.sub(u'', text) + text_unicode = to_unicode(text, encoding) + return _REMOVECOMMENTS_RE.sub(u'', text_unicode) + def remove_tags(text, which_ones=(), keep=(), encoding=None): + # type: (AnyStr, Sequence[String], Sequence[String], Optional[String]) -> six.text_type + """ Remove HTML Tags only. `which_ones` and `keep` are both tuples, there are four cases: @@ -185,26 +198,28 @@ def remove_tags(text, which_ones=(), keep=(), encoding=None): assert not (which_ones and keep), 'which_ones and keep can not be given at the same time' - which_ones = {tag.lower() for tag in which_ones} - keep = {tag.lower() for tag in keep} + which_ones_ = {tag.lower() for tag in which_ones} + keep_ = {tag.lower() for tag in keep} def will_remove(tag): tag = tag.lower() - if which_ones: - return tag in which_ones + if which_ones_: + return tag in which_ones_ else: - return tag not in keep + return tag not in keep_ def remove_tag(m): tag = m.group(1) return u'' if will_remove(tag) else m.group(0) - regex = '/]+).*?>' + regex = u'/]+).*?>' retags = re.compile(regex, re.DOTALL | re.IGNORECASE) return retags.sub(remove_tag, to_unicode(text, encoding)) + def remove_tags_with_content(text, which_ones=(), encoding=None): + # type: (AnyStr, Sequence[String], Optional[String]) -> six.text_type """Remove tags and their content. `which_ones` is a tuple of which tags to remove including their content. @@ -218,16 +233,18 @@ def remove_tags_with_content(text, which_ones=(), encoding=None): """ - text = to_unicode(text, encoding) + text_unicode = to_unicode(text, encoding) if which_ones: - tags = '|'.join([r'<%s.*?|<%s\s*/>' % (tag, tag, tag) for tag in which_ones]) + tags = u'|'.join([r'<%s.*?|<%s\s*/>' % (tag, tag, tag) for tag in which_ones]) retags = re.compile(tags, re.DOTALL | re.IGNORECASE) - text = retags.sub(u'', text) - return text + text_unicode = retags.sub(u'', text_unicode) + return text_unicode + +def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by='', + encoding=None): + # type: (AnyStr, Sequence[String], String, Optional[String]) -> six.text_type -def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'', \ - encoding=None): """Remove escape characters. `which_ones` is a tuple of which escape characters we want to remove. @@ -238,12 +255,15 @@ def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'', \ """ - text = to_unicode(text, encoding) + text_unicode = to_unicode(text, encoding) for ec in which_ones: - text = text.replace(ec, to_unicode(replace_by, encoding)) - return text + text_unicode = text_unicode.replace(ec, to_unicode(replace_by, encoding)) + return text_unicode + def unquote_markup(text, keep=(), remove_illegal=True, encoding=None): + # type: (AnyStr, Sequence[String], bool, Optional[String]) -> six.text_type + """ This function receives markup as a text (always a unicode string or a UTF-8 encoded string) and does the following: @@ -264,27 +284,29 @@ def _get_fragments(txt, pattern): offset = match_e yield txt[offset:] - text = to_unicode(text, encoding) + text_unicode = to_unicode(text, encoding) ret_text = u'' - for fragment in _get_fragments(text, _cdata_re): + for fragment in _get_fragments(text_unicode, _cdata_re): if isinstance(fragment, six.string_types): # it's not a CDATA (so we try to remove its entities) - ret_text += replace_entities(fragment, keep=keep, remove_illegal=remove_illegal) + # XXX: mypy has problems with six.string_types, + # had to ignore this type check + ret_text += replace_entities(fragment, keep=keep, remove_illegal=remove_illegal) # type: ignore else: # it's a CDATA (so we just extract its content) ret_text += fragment.group('cdata_d') return ret_text + def get_base_url(text, baseurl='', encoding='utf-8'): + # type: (AnyStr, str, String) -> str """Return the base url if declared in the given HTML `text`, relative to the given base url. If no base url is found, the given `baseurl` is returned. - """ - - text = to_unicode(text, encoding) - m = _baseurl_re.search(text) + text_unicode = to_unicode(text, encoding) + m = _baseurl_re.search(text_unicode) if m: return moves.urllib.parse.urljoin( safe_url_string(baseurl), @@ -293,30 +315,30 @@ def get_base_url(text, baseurl='', encoding='utf-8'): else: return safe_url_string(baseurl) + def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')): + # type: (AnyStr, str, String, Sequence[String]) -> Tuple[Optional[float], Optional[str]] """Return the http-equiv parameter of the HTML meta element from the given - HTML text and return a tuple ``(interval, url)`` where interval is an integer + HTML text and return a tuple ``(interval, url)`` where interval is a number containing the delay in seconds (or zero if not present) and url is a string with the absolute url to redirect. If no meta redirect is found, ``(None, None)`` is returned. """ - - if six.PY2: - baseurl = to_bytes(baseurl, encoding) + baseurl_str = to_native_str(baseurl) try: - text = to_unicode(text, encoding) + text_unicode = to_unicode(text, encoding) except UnicodeDecodeError: print(text) raise - text = remove_tags_with_content(text, ignore_tags) - text = remove_comments(replace_entities(text)) - m = _meta_refresh_re.search(text) + text_unicode = remove_tags_with_content(text_unicode, ignore_tags) + text_unicode = remove_comments(replace_entities(text_unicode)) + m = _meta_refresh_re.search(text_unicode) if m: interval = float(m.group('int')) url = safe_url_string(m.group('url').strip(' "\''), encoding) - url = moves.urllib.parse.urljoin(baseurl, url) + url = moves.urllib.parse.urljoin(baseurl_str, url) return interval, url else: return None, None diff --git a/w3lib/http.py b/w3lib/http.py index c7b94a23..599bdac7 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -1,7 +1,12 @@ +from __future__ import absolute_import from base64 import urlsafe_b64encode +from typing import Dict, List, Tuple, Optional, Union, AnyStr, Any + +from ._types import String def headers_raw_to_dict(headers_raw): + # type: (Optional[bytes]) -> Optional[Dict[bytes, List[bytes]]] r""" Convert raw headers (single multi-line bytestring) to a dictionary. @@ -10,7 +15,7 @@ def headers_raw_to_dict(headers_raw): >>> import w3lib.http >>> w3lib.http.headers_raw_to_dict(b"Content-type: text/html\n\rAccept: gzip\n\n") # doctest: +SKIP - {'Content-type': ['text/html'], 'Accept': ['gzip']} + {b'Content-type': [b'text/html'], b'Accept': [b'gzip']} Incorrect input: @@ -47,6 +52,7 @@ def headers_raw_to_dict(headers_raw): def headers_dict_to_raw(headers_dict): + # type: (Optional[Dict[bytes, Union[bytes, List[bytes]]]]) -> Optional[bytes] r""" Returns a raw HTTP headers representation of headers @@ -79,6 +85,7 @@ def headers_dict_to_raw(headers_dict): def basic_auth_header(username, password, encoding='ISO-8859-1'): + # type: (String, String, String) -> bytes """ Return an `Authorization` header field value for `HTTP Basic Access Authentication (RFC 2617)`_ @@ -89,8 +96,7 @@ def basic_auth_header(username, password, encoding='ISO-8859-1'): .. _HTTP Basic Access Authentication (RFC 2617): http://www.ietf.org/rfc/rfc2617.txt """ - - auth = "%s:%s" % (username, password) + auth = "%s:%s" % (username, password) # type: Any if not isinstance(auth, bytes): # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1 # seems to be the most widely used encoding here. See also: diff --git a/w3lib/url.py b/w3lib/url.py index f55d057d..c73c1970 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -16,12 +16,16 @@ quote, parse_qs, parse_qsl, ParseResult, unquote, urlunparse) from six.moves.urllib.request import pathname2url, url2pathname +from typing import AnyStr, Tuple, Union, Set, Sequence, TypeVar from w3lib.util import to_bytes, to_native_str, to_unicode +from w3lib._types import String + +T = TypeVar('T') # error handling function for bytes-to-Unicode decoding errors with URLs def _quote_byte(error): - return (to_unicode(quote(error.object[error.start:error.end])), error.end) + return to_unicode(quote(error.object[error.start:error.end])), error.end codecs.register_error('percentencode', _quote_byte) @@ -34,7 +38,9 @@ def _quote_byte(error): _safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b'%' + def safe_url_string(url, encoding='utf8', path_encoding='utf8'): + # type: (AnyStr, String, String) -> str """Convert the given URL into a legal URL by escaping unsafe characters according to RFC-3986. @@ -84,7 +90,9 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'): _parent_dirs = re.compile(r'/?(\.\./)+') + def safe_download_url(url): + # type: (str) -> str """ Make a url for download. This will call safe_url_string and then strip the fragment, if one exists. The path will be normalised. @@ -104,10 +112,12 @@ def safe_download_url(url): def is_url(text): + # type: (String) -> bool return text.partition("://")[0] in ('file', 'http', 'https') def url_query_parameter(url, parameter, default=None, keep_blank_values=0): + # type: (str, String, T, bool) -> Union[str, T] """Return the value of a url parameter, given the url and parameter name General case: @@ -123,14 +133,14 @@ def url_query_parameter(url, parameter, default=None, keep_blank_values=0): 'mydefault' >>> - Returns None if `keep_blank_values` not set or 0 (default): + Returns None if `keep_blank_values` not set or False (default): >>> w3lib.url.url_query_parameter("product.html?id=", "id") >>> - Returns an empty string if `keep_blank_values` set to 1: + Returns an empty string if `keep_blank_values` set to True: - >>> w3lib.url.url_query_parameter("product.html?id=", "id", keep_blank_values=1) + >>> w3lib.url.url_query_parameter("product.html?id=", "id", keep_blank_values=True) '' >>> @@ -140,10 +150,14 @@ def url_query_parameter(url, parameter, default=None, keep_blank_values=0): urlsplit(str(url))[3], keep_blank_values=keep_blank_values ) - return queryparams.get(parameter, [default])[0] + # mypy shows 'List item 0 has incompatible type "_T"' error + values = queryparams.get(parameter, [default]) # type: ignore + return values[0] + def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True, keep_fragments=False): + # type: (str, Union[str, Sequence[str]], str, str, bool, bool, bool) -> str """Clean URL arguments leaving only those passed in the parameterlist keeping order >>> import w3lib.url @@ -179,7 +193,7 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u parameterlist = [parameterlist] url, fragment = urldefrag(url) base, _, query = url.partition('?') - seen = set() + seen = set() # type: Set[str] querylist = [] for ksv in query.split(sep): if not ksv: @@ -208,10 +222,11 @@ def _add_or_replace_parameters(url, params): new_args.update(params) query = urlencode(new_args) - return urlunsplit(parsed._replace(query=query)) - + # "SplitResult" has no attribute "_replace" - looks like a bug in typeshed + return urlunsplit(parsed._replace(query=query)) # type: ignore def add_or_replace_parameter(url, name, new_value): + # type: (str, str, str) -> str """Add or remove a parameter to a given url >>> import w3lib.url @@ -243,6 +258,7 @@ def add_or_replace_parameters(url, new_parameters): def path_to_file_uri(path): + # type: (str) -> str """Convert local filesystem path to legal File URIs as described in: http://en.wikipedia.org/wiki/File_URI_scheme """ @@ -253,6 +269,7 @@ def path_to_file_uri(path): def file_uri_to_path(uri): + # type: (str) -> str """Convert File URI to local filesystem path according to: http://en.wikipedia.org/wiki/File_URI_scheme """ @@ -261,6 +278,7 @@ def file_uri_to_path(uri): def any_to_uri(uri_or_path): + # type: (str) -> str """If given a path name, return its File URI, otherwise return it unmodified """ diff --git a/w3lib/util.py b/w3lib/util.py index d8513eef..3015525d 100644 --- a/w3lib/util.py +++ b/w3lib/util.py @@ -1,4 +1,8 @@ import six +from typing import AnyStr, Optional + +from ._types import String + def str_to_unicode(text, encoding=None, errors='strict'): if encoding is None: @@ -7,6 +11,7 @@ def str_to_unicode(text, encoding=None, errors='strict'): return text.decode(encoding, errors) return text + def unicode_to_str(text, encoding=None, errors='strict'): if encoding is None: encoding = 'utf-8' @@ -14,19 +19,23 @@ def unicode_to_str(text, encoding=None, errors='strict'): return text.encode(encoding, errors) return text + def to_unicode(text, encoding=None, errors='strict'): + # type: (AnyStr, Optional[String], String) -> six.text_type """Return the unicode representation of a bytes object `text`. If `text` is already an unicode object, return it as-is.""" if isinstance(text, six.text_type): return text - if not isinstance(text, (bytes, six.text_type)): + if not isinstance(text, bytes): raise TypeError('to_unicode must receive a bytes, str or unicode ' 'object, got %s' % type(text).__name__) if encoding is None: encoding = 'utf-8' return text.decode(encoding, errors) + def to_bytes(text, encoding=None, errors='strict'): + # type: (AnyStr, Optional[String], String) -> bytes """Return the binary representation of `text`. If `text` is already a bytes object, return it as-is.""" if isinstance(text, bytes): @@ -38,7 +47,9 @@ def to_bytes(text, encoding=None, errors='strict'): encoding = 'utf-8' return text.encode(encoding, errors) + def to_native_str(text, encoding=None, errors='strict'): + # type: (AnyStr, Optional[String], String) -> str """ Return str representation of `text` (bytes in Python 2.x and unicode in Python 3.x). """ if six.PY2: From 528158fc708d04b8c2c4a17878876a977754b5ba Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 8 Jul 2016 22:46:47 +0500 Subject: [PATCH 2/8] cleanup: clarify String uses, remove unused imports --- w3lib/_types.py | 4 ++-- w3lib/encoding.py | 3 +-- w3lib/html.py | 4 ++-- w3lib/http.py | 2 +- w3lib/url.py | 2 +- 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/w3lib/_types.py b/w3lib/_types.py index e8e7117d..324a3e49 100644 --- a/w3lib/_types.py +++ b/w3lib/_types.py @@ -7,14 +7,14 @@ 2. Variable is binary; unicode is not accepted ==> use ``bytes`` 3. Variable is text, and it can be only unicode in Python 2 ==> use ``six.text_type`` (or typing.Text??) -4. Variable is text, but it can be str in Python 2 ==> use w3lib._types.String +4. Variable is text, but it can be ascii or utf8-encoded str + in Python 2 ==> use w3lib._types.String 5. Variable can be either bytes or unicode both in Python 2 and Python 3 ==> use typing.AnyStr 6. Variable should be str (==bytes) in Python 2 and str (==unicode) in Python 3 ==> use ``str``. """ - from __future__ import absolute_import from typing import Union import six diff --git a/w3lib/encoding.py b/w3lib/encoding.py index b965baeb..e4060113 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -6,11 +6,10 @@ import codecs import encodings # type: ignore from sys import version_info -from typing import Optional, Union, AnyStr, Tuple, Callable +from typing import Optional, AnyStr, Tuple, Callable import six from .util import to_native_str -from ._types import String _HEADER_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I) diff --git a/w3lib/html.py b/w3lib/html.py index bbaf439a..68bca505 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -7,9 +7,9 @@ import re import six from six import moves -from typing import AnyStr, Optional, Iterable, Tuple, Union, Sequence +from typing import AnyStr, Optional, Tuple, Sequence -from w3lib.util import to_bytes, to_unicode, to_native_str +from w3lib.util import to_unicode, to_native_str from w3lib.url import safe_url_string from w3lib._types import String diff --git a/w3lib/http.py b/w3lib/http.py index 599bdac7..e4fae9b5 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from base64 import urlsafe_b64encode -from typing import Dict, List, Tuple, Optional, Union, AnyStr, Any +from typing import Dict, List, Optional, Union, Any from ._types import String diff --git a/w3lib/url.py b/w3lib/url.py index c73c1970..33377be7 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -16,7 +16,7 @@ quote, parse_qs, parse_qsl, ParseResult, unquote, urlunparse) from six.moves.urllib.request import pathname2url, url2pathname -from typing import AnyStr, Tuple, Union, Set, Sequence, TypeVar +from typing import AnyStr, Union, Set, Sequence, TypeVar from w3lib.util import to_bytes, to_native_str, to_unicode from w3lib._types import String From b69e9757129849bd850a86f143feaccfc3b0887b Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 8 Jul 2016 23:57:05 +0500 Subject: [PATCH 3/8] more precise result value types thanks @lopuhin for the catch --- w3lib/encoding.py | 7 ++++--- w3lib/html.py | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/w3lib/encoding.py b/w3lib/encoding.py index e4060113..5a04528a 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -6,7 +6,7 @@ import codecs import encodings # type: ignore from sys import version_info -from typing import Optional, AnyStr, Tuple, Callable +from typing import Optional, AnyStr, Tuple, Callable, Union import six from .util import to_native_str @@ -163,7 +163,7 @@ def resolve_encoding(encoding_alias): def read_bom(data): - # type: (bytes) -> Tuple[str, bytes] + # type: (bytes) -> Union[Tuple[str, bytes], Tuple[None, None]] r"""Read the byte order mark in the text, if present, and return the encoding represented by the BOM and the BOM. @@ -271,7 +271,8 @@ def html_to_unicode(content_type_header, html_body_str, ''' enc = http_content_type_encoding(content_type_header) - bom_enc, bom = read_bom(html_body_str) + # FIXME: remove type: ignore when mypy bug is fixed + bom_enc, bom = read_bom(html_body_str) # type: ignore if enc is not None: # remove BOM if it agrees with the encoding if enc == bom_enc: diff --git a/w3lib/html.py b/w3lib/html.py index 68bca505..0b0a4b5e 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -7,7 +7,7 @@ import re import six from six import moves -from typing import AnyStr, Optional, Tuple, Sequence +from typing import AnyStr, Optional, Tuple, Sequence, Union from w3lib.util import to_unicode, to_native_str from w3lib.url import safe_url_string @@ -317,7 +317,7 @@ def get_base_url(text, baseurl='', encoding='utf-8'): def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')): - # type: (AnyStr, str, String, Sequence[String]) -> Tuple[Optional[float], Optional[str]] + # type: (AnyStr, str, String, Sequence[String]) -> Union[Tuple[float, str], Tuple[None, None]] """Return the http-equiv parameter of the HTML meta element from the given HTML text and return a tuple ``(interval, url)`` where interval is a number containing the delay in seconds (or zero if not present) and url is a From ed46d4689a636372c144340895aa2e2d5aeba6a9 Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Fri, 2 Nov 2018 14:21:05 +0800 Subject: [PATCH 4/8] use py37 for mypy --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9b6642a9..ad47a3ac 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,9 +18,9 @@ matrix: sudo: true - python: 3.5 env: TOXENV=pypy3 - - python: 3.6 + - python: 3.7 env: TOXENV=mypy2 - - python: 3.6 + - python: 3.7 env: TOXENV=mypy3 install: From 04aedad266862ad19adac27b664bf502f5d251f7 Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Fri, 2 Nov 2018 14:30:51 +0800 Subject: [PATCH 5/8] revert to py36 for mypy --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index ad47a3ac..9b6642a9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,9 +18,9 @@ matrix: sudo: true - python: 3.5 env: TOXENV=pypy3 - - python: 3.7 + - python: 3.6 env: TOXENV=mypy2 - - python: 3.7 + - python: 3.6 env: TOXENV=mypy3 install: From d2d6e6870196f9db1cfa3caa731e590bd0a06767 Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Fri, 2 Nov 2018 18:51:03 +0800 Subject: [PATCH 6/8] mypy-lang is now mypy --- tox.ini | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tox.ini b/tox.ini index 0b213434..a43b8b3a 100644 --- a/tox.ini +++ b/tox.ini @@ -20,7 +20,7 @@ commands = [testenv:mypy2] basepython = python3.6 deps = - mypy-lang + mypy typing commands = mypy --py2 w3lib tests @@ -29,7 +29,6 @@ commands = [testenv:mypy3] basepython = python3.6 deps = - mypy-lang - typing + mypy commands = mypy w3lib tests From fe2213f522b502c7dcff610d0878e0e8dc3afd9d Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Thu, 24 Jan 2019 11:25:40 +0800 Subject: [PATCH 7/8] fix mypy errors --- .gitignore | 3 ++- w3lib/encoding.py | 11 +++++++---- w3lib/http.py | 2 +- w3lib/url.py | 6 +++--- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 3fe67fd1..e967c85a 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ dist docs/_build _trial_temp .coverage -.cache \ No newline at end of file +.cache +.mypy_cache/ \ No newline at end of file diff --git a/w3lib/encoding.py b/w3lib/encoding.py index 5a04528a..3462934f 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -6,7 +6,7 @@ import codecs import encodings # type: ignore from sys import version_info -from typing import Optional, AnyStr, Tuple, Callable, Union +from typing import Optional, AnyStr, Tuple, Callable, Union, cast import six from .util import to_native_str @@ -15,7 +15,7 @@ def http_content_type_encoding(content_type): - # type: (str) -> Optional[str] + # type: (Optional[str]) -> Optional[str] """Extract the encoding in the content-type header >>> import w3lib.encoding @@ -28,6 +28,7 @@ def http_content_type_encoding(content_type): match = _HEADER_ENCODING_RE.search(content_type) if match: return resolve_encoding(match.group(1)) + return None # regexp for parsing HTTP meta tags @@ -91,6 +92,7 @@ def html_body_declared_encoding(html_body_str): or match.group('xmlcharset') if encoding: return resolve_encoding(to_native_str(encoding)) + return None # Default encoding translation @@ -127,8 +129,8 @@ def _c18n_encoding(encoding): This performs normalization and translates aliases using python's encoding aliases """ - normed = encodings.normalize_encoding(encoding).lower() - return encodings.aliases.aliases.get(normed, normed) + normed = encodings.normalize_encoding(encoding).lower() # type: ignore + return encodings.aliases.aliases.get(normed, normed) # type: ignore def resolve_encoding(encoding_alias): @@ -273,6 +275,7 @@ def html_to_unicode(content_type_header, html_body_str, enc = http_content_type_encoding(content_type_header) # FIXME: remove type: ignore when mypy bug is fixed bom_enc, bom = read_bom(html_body_str) # type: ignore + bom = cast(bytes, bom) if enc is not None: # remove BOM if it agrees with the encoding if enc == bom_enc: diff --git a/w3lib/http.py b/w3lib/http.py index e4fae9b5..848fac7a 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -35,7 +35,7 @@ def headers_raw_to_dict(headers_raw): headers = headers_raw.splitlines() headers_tuples = [header.split(b':', 1) for header in headers] - result_dict = {} + result_dict = {} # type: Dict[bytes, List[bytes]] for header_item in headers_tuples: if not len(header_item) == 2: continue diff --git a/w3lib/url.py b/w3lib/url.py index 33377be7..e12b0a61 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -70,7 +70,7 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'): try: netloc = parts.netloc.encode('idna') except UnicodeError: - netloc = parts.netloc + netloc = parts.netloc # type: ignore # quote() in Python2 return type follows input type; # quote() in Python3 always returns Unicode (native str) @@ -116,7 +116,7 @@ def is_url(text): return text.partition("://")[0] in ('file', 'http', 'https') -def url_query_parameter(url, parameter, default=None, keep_blank_values=0): +def url_query_parameter(url, parameter, default=None, keep_blank_values=False): # type: (str, String, T, bool) -> Union[str, T] """Return the value of a url parameter, given the url and parameter name @@ -543,7 +543,7 @@ def parse_url(url, encoding=None): if not six.PY2: - from urllib.parse import _coerce_args, unquote_to_bytes + from urllib.parse import _coerce_args, unquote_to_bytes # type: ignore def parse_qsl_to_bytes(qs, keep_blank_values=False): """Parse a query given as a string argument. From 8887f4d8c0ea14decbeff8c552c679aaf26bc354 Mon Sep 17 00:00:00 2001 From: Lucy Wang Date: Wed, 30 Jan 2019 13:18:24 +0800 Subject: [PATCH 8/8] more fix --- w3lib/encoding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/w3lib/encoding.py b/w3lib/encoding.py index 3462934f..3d613fc1 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -85,7 +85,7 @@ def html_body_declared_encoding(html_body_str): if isinstance(chunk, bytes): match = _BODY_ENCODING_BYTES_RE.search(chunk) else: - match = _BODY_ENCODING_STR_RE.search(chunk) + match = _BODY_ENCODING_STR_RE.search(chunk) # type: ignore if match: encoding = match.group('charset') or match.group('charset2') \