From c38f5da9ba010399df7f59395c0ad164aa311735 Mon Sep 17 00:00:00 2001
From: Mikhail Korobov <kmike84@gmail.com>
Date: Fri, 8 Jul 2016 02:12:55 +0500
Subject: [PATCH 1/8] type hints

---
 .travis.yml            |  4 ++
 setup.py               |  2 +-
 tests/test_encoding.py |  9 ++++
 tox.ini                | 20 ++++++++-
 w3lib/_types.py        | 25 +++++++++++
 w3lib/encoding.py      | 38 +++++++++++++----
 w3lib/html.py          | 96 ++++++++++++++++++++++++++----------------
 w3lib/http.py          | 12 ++++--
 w3lib/url.py           | 34 +++++++++++----
 w3lib/util.py          | 13 +++++-
 10 files changed, 194 insertions(+), 59 deletions(-)
 create mode 100644 w3lib/_types.py
diff --git a/.travis.yml b/.travis.yml
index dfeee83b..9b6642a9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -18,6 +18,10 @@ matrix:
       sudo: true
     - python: 3.5
       env: TOXENV=pypy3
+    - python: 3.6
+      env: TOXENV=mypy2
+    - python: 3.6
+      env: TOXENV=mypy3
 
 install:
   - |
diff --git a/setup.py b/setup.py
index 2ae088c9..44fdf0e2 100644
--- a/setup.py
+++ b/setup.py
@@ -29,5 +29,5 @@
         'Programming Language :: Python :: Implementation :: PyPy',
         'Topic :: Internet :: WWW/HTTP',
     ],
-    install_requires=['six >= 1.4.1'],
+    install_requires=['six >= 1.4.1', 'typing'],
 )
diff --git a/tests/test_encoding.py b/tests/test_encoding.py
index 649c189a..5c2ff0f4 100644
--- a/tests/test_encoding.py
+++ b/tests/test_encoding.py
@@ -3,6 +3,7 @@
 from w3lib.encoding import (html_body_declared_encoding, read_bom, to_unicode,
         http_content_type_encoding, resolve_encoding, html_to_unicode)
 
+
 class RequestEncodingTests(unittest.TestCase):
     utf8_fragments = [
         # Content-Type as meta http-equiv
@@ -51,6 +52,7 @@ def test_html_body_declared_encoding(self):
         for fragment in self.utf8_fragments:
             encoding = html_body_declared_encoding(fragment)
             self.assertEqual(encoding, 'utf-8', fragment)
+
         self.assertEqual(None, html_body_declared_encoding(b"something else"))
         self.assertEqual(None, html_body_declared_encoding(b"""
             <head></head><body>
@@ -76,6 +78,11 @@ def test_html_body_declared_encoding_unicode(self):
         self.assertEqual(None, html_body_declared_encoding(
             u"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""))
 
+    def test_html_body_declared_encoding_aliases(self):
+        fragment = b"""<meta http-equiv="content-type" content="text/html;charset=win-1251"/>"""
+        self.assertEqual("cp1251", html_body_declared_encoding(fragment))
+        self.assertEqual("cp1251", html_body_declared_encoding(fragment.decode('utf8')))
+
 
 class CodecsEncodingTestCase(unittest.TestCase):
     def test_resolve_encoding(self):
@@ -97,9 +104,11 @@ def test_invalid_utf8(self):
 def ct(charset):
     return "Content-Type: text/html; charset=" + charset if charset else None
 
+
 def norm_encoding(enc):
     return codecs.lookup(enc).name
 
+
 class HtmlConversionTests(unittest.TestCase):
 
     def test_unicode_body(self):
diff --git a/tox.ini b/tox.ini
index 0e5a0b39..0b213434 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,7 +4,7 @@
 # and then run "tox" from this directory.
 
 [tox]
-envlist = py27, pypy, py34, py35, py36, py37, pypy3
+envlist = py27, pypy, py34, py35, py36, py37, pypy3, mypy2, mypy3
 
 [testenv]
 deps =
@@ -15,3 +15,21 @@ commands =
         --doctest-modules \
         --cov=w3lib --cov-report=term \
         {posargs:w3lib tests}
+
+
+[testenv:mypy2]
+basepython = python3.6
+deps =
+    mypy-lang
+    typing
+commands =
+    mypy --py2 w3lib tests
+
+
+[testenv:mypy3]
+basepython = python3.6
+deps =
+    mypy-lang
+    typing
+commands =
+    mypy w3lib tests
diff --git a/w3lib/_types.py b/w3lib/_types.py
new file mode 100644
index 00000000..e8e7117d
--- /dev/null
+++ b/w3lib/_types.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+"""
+Which string type to use?
+=========================
+
+1. Variable is a URL ==> use ``str``
+2. Variable is binary; unicode is not accepted ==> use ``bytes``
+3. Variable is text, and it can be only unicode in Python 2 ==> use
+   ``six.text_type``  (or typing.Text??)
+4. Variable is text, but it can be str in Python 2 ==> use w3lib._types.String
+5. Variable can be either bytes or unicode both in Python 2
+   and Python 3 ==> use typing.AnyStr
+6. Variable should be str (==bytes) in Python 2
+   and str (==unicode) in Python 3 ==> use ``str``.
+
+"""
+
+from __future__ import absolute_import
+from typing import Union
+import six
+
+if six.PY2:
+    String = Union[bytes, unicode]
+else:
+    String = str
diff --git a/w3lib/encoding.py b/w3lib/encoding.py
index c7ac567f..b965baeb 100644
--- a/w3lib/encoding.py
+++ b/w3lib/encoding.py
@@ -2,12 +2,21 @@
 """
 Functions for handling encoding of web pages
 """
-import re, codecs, encodings
+import re
+import codecs
+import encodings  # type: ignore
 from sys import version_info
+from typing import Optional, Union, AnyStr, Tuple, Callable
+import six
+
+from .util import to_native_str
+from ._types import String
 
 _HEADER_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)
 
+
 def http_content_type_encoding(content_type):
+    # type: (str) -> Optional[str]
     """Extract the encoding in the content-type header
 
     >>> import w3lib.encoding
@@ -21,6 +30,7 @@ def http_content_type_encoding(content_type):
         if match:
             return resolve_encoding(match.group(1))
 
+
 # regexp for parsing HTTP meta tags
 _TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?'''
 _SKIP_ATTRS = '''(?:\\s+
@@ -40,13 +50,15 @@ def http_content_type_encoding(content_type):
 _XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)')
 
 # check for meta tags, or xml decl. and stop search if a body tag is encountered
-_BODY_ENCODING_PATTERN = r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % (
+_BODY_ENCODING_PATTERN = six.u(r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)') % (
     _SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
 _BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I | re.VERBOSE)
 _BODY_ENCODING_BYTES_RE = re.compile(_BODY_ENCODING_PATTERN.encode('ascii'),
                                      re.I | re.VERBOSE)
 
+
 def html_body_declared_encoding(html_body_str):
+    # type: (AnyStr) -> Optional[str]
     '''Return the encoding specified in meta tags in the html body,
     or ``None`` if no suitable encoding was found
 
@@ -79,7 +91,8 @@ def html_body_declared_encoding(html_body_str):
         encoding = match.group('charset') or match.group('charset2') \
                 or match.group('xmlcharset')
         if encoding:
-            return resolve_encoding(encoding)
+            return resolve_encoding(to_native_str(encoding))
+
 
 # Default encoding translation
 # this maps cannonicalized encodings to target encodings
@@ -109,6 +122,7 @@ def html_body_declared_encoding(html_body_str):
 }
 
 def _c18n_encoding(encoding):
+    # type: (AnyStr) -> str
     """Cannonicalize an encoding name
 
     This performs normalization and translates aliases using python's
@@ -117,7 +131,9 @@ def _c18n_encoding(encoding):
     normed = encodings.normalize_encoding(encoding).lower()
     return encodings.aliases.aliases.get(normed, normed)
 
+
 def resolve_encoding(encoding_alias):
+    # type: (AnyStr) -> Optional[str]
     """Return the encoding that `encoding_alias` maps to, or ``None``
     if the encoding cannot be interpreted
 
@@ -136,6 +152,7 @@ def resolve_encoding(encoding_alias):
     except LookupError:
         return None
 
+
 _BOM_TABLE = [
     (codecs.BOM_UTF32_BE, 'utf-32-be'),
     (codecs.BOM_UTF32_LE, 'utf-32-le'),
@@ -145,7 +162,9 @@ def resolve_encoding(encoding_alias):
 ]
 _FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE)
 
+
 def read_bom(data):
+    # type: (bytes) -> Tuple[str, bytes]
     r"""Read the byte order mark in the text, if present, and
     return the encoding represented by the BOM and the BOM.
 
@@ -153,13 +172,13 @@ def read_bom(data):
 
     >>> import w3lib.encoding
     >>> w3lib.encoding.read_bom(b'\xfe\xff\x6c\x34')
-    ('utf-16-be', '\xfe\xff')
+    ('utf-16-be', b'\xfe\xff')
     >>> w3lib.encoding.read_bom(b'\xff\xfe\x34\x6c')
-    ('utf-16-le', '\xff\xfe')
+    ('utf-16-le', b'\xff\xfe')
     >>> w3lib.encoding.read_bom(b'\x00\x00\xfe\xff\x00\x00\x6c\x34')
-    ('utf-32-be', '\x00\x00\xfe\xff')
+    ('utf-32-be', b'\x00\x00\xfe\xff')
     >>> w3lib.encoding.read_bom(b'\xff\xfe\x00\x00\x34\x6c\x00\x00')
-    ('utf-32-le', '\xff\xfe\x00\x00')
+    ('utf-32-le', b'\xff\xfe\x00\x00')
     >>> w3lib.encoding.read_bom(b'\x01\x02\x03\x04')
     (None, None)
     >>>
@@ -175,9 +194,10 @@ def read_bom(data):
 
 # Python decoder doesn't follow unicode standard when handling
 # bad utf-8 encoded strings. see http://bugs.python.org/issue8271
-codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.end))
+codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.end))  # type: ignore
 
 def to_unicode(data_str, encoding):
+    # type: (bytes, str) -> six.text_type
     """Convert a str object to unicode using the encoding given
 
     Characters that cannot be converted will be converted to ``\\ufffd`` (the
@@ -185,8 +205,10 @@ def to_unicode(data_str, encoding):
     """
     return data_str.decode(encoding, 'replace' if version_info[0:2] >= (3, 3) else 'w3lib_replace')
 
+
 def html_to_unicode(content_type_header, html_body_str,
         default_encoding='utf8', auto_detect_fun=None):
+    # type: (Optional[str], bytes, str, Optional[Callable[[bytes], str]]) -> Tuple[str, six.text_type]
     r'''Convert raw html bytes to unicode
 
     This attempts to make a reasonable guess at the content encoding of the
diff --git a/w3lib/html.py b/w3lib/html.py
index 9990a351..bbaf439a 100644
--- a/w3lib/html.py
+++ b/w3lib/html.py
@@ -7,12 +7,14 @@
 import re
 import six
 from six import moves
+from typing import AnyStr, Optional, Iterable, Tuple, Union, Sequence
 
-from w3lib.util import to_bytes, to_unicode
+from w3lib.util import to_bytes, to_unicode, to_native_str
 from w3lib.url import safe_url_string
+from w3lib._types import String
 
-_ent_re = re.compile(r'&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)', re.IGNORECASE)
-_tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL)
+_ent_re = re.compile(six.u(r'&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)'), re.IGNORECASE)
+_tag_re = re.compile(six.u(r'<[a-zA-Z\/!].*?>'), re.DOTALL)
 _baseurl_re = re.compile(six.u(r'<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']'), re.I)
 _meta_refresh_re = re.compile(six.u(r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)'), re.DOTALL | re.IGNORECASE)
 _cdata_re = re.compile(r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))', re.DOTALL)
@@ -38,7 +40,9 @@ def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
 
     return replace_entities(text, keep, remove_illegal, encoding)
 
+
 def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
+    # type: (AnyStr, Sequence[String], bool, String) -> six.text_type
     u"""Remove entities from the given `text` by converting them to their
     corresponding unicode character.
 
@@ -96,14 +100,19 @@ def convert_entity(m):
 
     return _ent_re.sub(convert_entity, to_unicode(text, encoding))
 
+
 def has_entities(text, encoding=None):
+    # type: (AnyStr, Optional[String]) -> bool
     return bool(_ent_re.search(to_unicode(text, encoding)))
 
+
 def replace_tags(text, token='', encoding=None):
+    # type: (AnyStr, String, Optional[String]) -> six.text_type
+
     """Replace all markup tags found in the given `text` by the given token.
     By default `token` is an empty string so it just removes all tags.
 
-    `text` can be a unicode string or a regular string encoded as `encoding`
+    `text` can be a unicode string or a byte string encoded as `encoding`
     (or ``'utf-8'`` if `encoding` is not given.)
 
     Always returns a unicode string.
@@ -124,6 +133,7 @@ def replace_tags(text, token='', encoding=None):
 
 _REMOVECOMMENTS_RE = re.compile(u'<!--.*?-->', re.DOTALL)
 def remove_comments(text, encoding=None):
+    # type: (AnyStr, Optional[String]) -> six.text_type
     """ Remove HTML Comments.
 
     >>> import w3lib.html
@@ -133,10 +143,13 @@ def remove_comments(text, encoding=None):
 
     """
 
-    text = to_unicode(text, encoding)
-    return _REMOVECOMMENTS_RE.sub(u'', text)
+    text_unicode = to_unicode(text, encoding)
+    return _REMOVECOMMENTS_RE.sub(u'', text_unicode)
+
 
 def remove_tags(text, which_ones=(), keep=(), encoding=None):
+    # type: (AnyStr, Sequence[String], Sequence[String], Optional[String]) -> six.text_type
+
     """ Remove HTML Tags only.
 
     `which_ones` and `keep` are both tuples, there are four cases:
@@ -185,26 +198,28 @@ def remove_tags(text, which_ones=(), keep=(), encoding=None):
 
     assert not (which_ones and keep), 'which_ones and keep can not be given at the same time'
 
-    which_ones = {tag.lower() for tag in which_ones}
-    keep = {tag.lower() for tag in keep}
+    which_ones_ = {tag.lower() for tag in which_ones}
+    keep_ = {tag.lower() for tag in keep}
 
     def will_remove(tag):
         tag = tag.lower()
-        if which_ones:
-            return tag in which_ones
+        if which_ones_:
+            return tag in which_ones_
         else:
-            return tag not in keep
+            return tag not in keep_
 
     def remove_tag(m):
         tag = m.group(1)
         return u'' if will_remove(tag) else m.group(0)
 
-    regex = '</?([^ >/]+).*?>'
+    regex = u'</?([^ >/]+).*?>'
     retags = re.compile(regex, re.DOTALL | re.IGNORECASE)
 
     return retags.sub(remove_tag, to_unicode(text, encoding))
 
+
 def remove_tags_with_content(text, which_ones=(), encoding=None):
+    # type: (AnyStr, Sequence[String], Optional[String]) -> six.text_type
     """Remove tags and their content.
 
     `which_ones` is a tuple of which tags to remove including their content.
@@ -218,16 +233,18 @@ def remove_tags_with_content(text, which_ones=(), encoding=None):
 
     """
 
-    text = to_unicode(text, encoding)
+    text_unicode = to_unicode(text, encoding)
     if which_ones:
-        tags = '|'.join([r'<%s.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
+        tags = u'|'.join([r'<%s.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
         retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
-        text = retags.sub(u'', text)
-    return text
+        text_unicode = retags.sub(u'', text_unicode)
+    return text_unicode
+
 
+def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by='',
+                         encoding=None):
+    # type: (AnyStr, Sequence[String], String, Optional[String]) -> six.text_type
 
-def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'', \
-        encoding=None):
     """Remove escape characters.
 
     `which_ones` is a tuple of which escape characters we want to remove.
@@ -238,12 +255,15 @@ def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'', \
 
     """
 
-    text = to_unicode(text, encoding)
+    text_unicode = to_unicode(text, encoding)
     for ec in which_ones:
-        text = text.replace(ec, to_unicode(replace_by, encoding))
-    return text
+        text_unicode = text_unicode.replace(ec, to_unicode(replace_by, encoding))
+    return text_unicode
+
 
 def unquote_markup(text, keep=(), remove_illegal=True, encoding=None):
+    # type: (AnyStr, Sequence[String], bool, Optional[String]) -> six.text_type
+
     """
     This function receives markup as a text (always a unicode string or
     a UTF-8 encoded string) and does the following:
@@ -264,27 +284,29 @@ def _get_fragments(txt, pattern):
             offset = match_e
         yield txt[offset:]
 
-    text = to_unicode(text, encoding)
+    text_unicode = to_unicode(text, encoding)
     ret_text = u''
-    for fragment in _get_fragments(text, _cdata_re):
+    for fragment in _get_fragments(text_unicode, _cdata_re):
         if isinstance(fragment, six.string_types):
             # it's not a CDATA (so we try to remove its entities)
-            ret_text += replace_entities(fragment, keep=keep, remove_illegal=remove_illegal)
+            # XXX: mypy has problems with six.string_types,
+            # had to ignore this type check
+            ret_text += replace_entities(fragment, keep=keep, remove_illegal=remove_illegal)  # type: ignore
         else:
             # it's a CDATA (so we just extract its content)
             ret_text += fragment.group('cdata_d')
     return ret_text
 
+
 def get_base_url(text, baseurl='', encoding='utf-8'):
+    # type: (AnyStr, str, String) -> str
     """Return the base url if declared in the given HTML `text`,
     relative to the given base url.
 
     If no base url is found, the given `baseurl` is returned.
-
     """
-
-    text = to_unicode(text, encoding)
-    m = _baseurl_re.search(text)
+    text_unicode = to_unicode(text, encoding)
+    m = _baseurl_re.search(text_unicode)
     if m:
         return moves.urllib.parse.urljoin(
             safe_url_string(baseurl),
@@ -293,30 +315,30 @@ def get_base_url(text, baseurl='', encoding='utf-8'):
     else:
         return safe_url_string(baseurl)
 
+
 def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')):
+    # type: (AnyStr, str, String, Sequence[String]) -> Tuple[Optional[float], Optional[str]]
     """Return  the http-equiv parameter of the HTML meta element from the given
-    HTML text and return a tuple ``(interval, url)`` where interval is an integer
+    HTML text and return a tuple ``(interval, url)`` where interval is a number
     containing the delay in seconds (or zero if not present) and url is a
     string with the absolute url to redirect.
 
     If no meta redirect is found, ``(None, None)`` is returned.
 
     """
-
-    if six.PY2:
-        baseurl = to_bytes(baseurl, encoding)
+    baseurl_str = to_native_str(baseurl)
     try:
-        text = to_unicode(text, encoding)
+        text_unicode = to_unicode(text, encoding)
     except UnicodeDecodeError:
         print(text)
         raise
-    text = remove_tags_with_content(text, ignore_tags)
-    text = remove_comments(replace_entities(text))
-    m = _meta_refresh_re.search(text)
+    text_unicode = remove_tags_with_content(text_unicode, ignore_tags)
+    text_unicode = remove_comments(replace_entities(text_unicode))
+    m = _meta_refresh_re.search(text_unicode)
     if m:
         interval = float(m.group('int'))
         url = safe_url_string(m.group('url').strip(' "\''), encoding)
-        url = moves.urllib.parse.urljoin(baseurl, url)
+        url = moves.urllib.parse.urljoin(baseurl_str, url)
         return interval, url
     else:
         return None, None
diff --git a/w3lib/http.py b/w3lib/http.py
index c7b94a23..599bdac7 100644
--- a/w3lib/http.py
+++ b/w3lib/http.py
@@ -1,7 +1,12 @@
+from __future__ import absolute_import
 from base64 import urlsafe_b64encode
+from typing import Dict, List, Tuple, Optional, Union, AnyStr, Any
+
+from ._types import String
 
 
 def headers_raw_to_dict(headers_raw):
+    # type: (Optional[bytes]) -> Optional[Dict[bytes, List[bytes]]]
     r"""
     Convert raw headers (single multi-line bytestring)
     to a dictionary.
@@ -10,7 +15,7 @@ def headers_raw_to_dict(headers_raw):
 
     >>> import w3lib.http
     >>> w3lib.http.headers_raw_to_dict(b"Content-type: text/html\n\rAccept: gzip\n\n")   # doctest: +SKIP
-    {'Content-type': ['text/html'], 'Accept': ['gzip']}
+    {b'Content-type': [b'text/html'], b'Accept': [b'gzip']}
 
     Incorrect input:
 
@@ -47,6 +52,7 @@ def headers_raw_to_dict(headers_raw):
 
 
 def headers_dict_to_raw(headers_dict):
+    # type: (Optional[Dict[bytes, Union[bytes, List[bytes]]]]) -> Optional[bytes]
     r"""
     Returns a raw HTTP headers representation of headers
 
@@ -79,6 +85,7 @@ def headers_dict_to_raw(headers_dict):
 
 
 def basic_auth_header(username, password, encoding='ISO-8859-1'):
+    # type: (String, String, String) -> bytes
     """
     Return an `Authorization` header field value for `HTTP Basic Access Authentication (RFC 2617)`_
 
@@ -89,8 +96,7 @@ def basic_auth_header(username, password, encoding='ISO-8859-1'):
     .. _HTTP Basic Access Authentication (RFC 2617): http://www.ietf.org/rfc/rfc2617.txt
 
     """
-
-    auth = "%s:%s" % (username, password)
+    auth = "%s:%s" % (username, password)  # type: Any
     if not isinstance(auth, bytes):
         # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1
         # seems to be the most widely used encoding here. See also:
diff --git a/w3lib/url.py b/w3lib/url.py
index f55d057d..c73c1970 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -16,12 +16,16 @@
                                     quote, parse_qs, parse_qsl,
                                     ParseResult, unquote, urlunparse)
 from six.moves.urllib.request import pathname2url, url2pathname
+from typing import AnyStr, Tuple, Union, Set, Sequence, TypeVar
 from w3lib.util import to_bytes, to_native_str, to_unicode
+from w3lib._types import String
+
+T = TypeVar('T')
 
 
 # error handling function for bytes-to-Unicode decoding errors with URLs
 def _quote_byte(error):
-    return (to_unicode(quote(error.object[error.start:error.end])), error.end)
+    return to_unicode(quote(error.object[error.start:error.end])), error.end
 
 codecs.register_error('percentencode', _quote_byte)
 
@@ -34,7 +38,9 @@ def _quote_byte(error):
 
 _safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b'%'
 
+
 def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
+    # type: (AnyStr, String, String) -> str
     """Convert the given URL into a legal URL by escaping unsafe characters
     according to RFC-3986.
 
@@ -84,7 +90,9 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
 
 _parent_dirs = re.compile(r'/?(\.\./)+')
 
+
 def safe_download_url(url):
+    # type: (str) -> str
     """ Make a url for download. This will call safe_url_string
     and then strip the fragment, if one exists. The path will
     be normalised.
@@ -104,10 +112,12 @@ def safe_download_url(url):
 
 
 def is_url(text):
+    # type: (String) -> bool
     return text.partition("://")[0] in ('file', 'http', 'https')
 
 
 def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
+    # type: (str, String, T, bool) -> Union[str, T]
     """Return the value of a url parameter, given the url and parameter name
 
     General case:
@@ -123,14 +133,14 @@ def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
     'mydefault'
     >>>
 
-    Returns None if `keep_blank_values` not set or 0 (default):
+    Returns None if `keep_blank_values` not set or False (default):
 
     >>> w3lib.url.url_query_parameter("product.html?id=", "id")
     >>>
 
-    Returns an empty string if `keep_blank_values` set to 1:
+    Returns an empty string if `keep_blank_values` set to True:
 
-    >>> w3lib.url.url_query_parameter("product.html?id=", "id", keep_blank_values=1)
+    >>> w3lib.url.url_query_parameter("product.html?id=", "id", keep_blank_values=True)
     ''
     >>>
 
@@ -140,10 +150,14 @@ def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
         urlsplit(str(url))[3],
         keep_blank_values=keep_blank_values
     )
-    return queryparams.get(parameter, [default])[0]
+    # mypy shows 'List item 0 has incompatible type "_T"' error
+    values = queryparams.get(parameter, [default])  # type: ignore
+    return values[0]
+
 
 
 def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True, keep_fragments=False):
+    # type: (str, Union[str, Sequence[str]], str, str, bool, bool, bool) -> str
     """Clean URL arguments leaving only those passed in the parameterlist keeping order
 
     >>> import w3lib.url
@@ -179,7 +193,7 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u
         parameterlist = [parameterlist]
     url, fragment = urldefrag(url)
     base, _, query = url.partition('?')
-    seen = set()
+    seen = set()  # type: Set[str]
     querylist = []
     for ksv in query.split(sep):
         if not ksv:
@@ -208,10 +222,11 @@ def _add_or_replace_parameters(url, params):
     new_args.update(params)
 
     query = urlencode(new_args)
-    return urlunsplit(parsed._replace(query=query))
-
+    # "SplitResult" has no attribute "_replace" - looks like a bug in typeshed
+    return urlunsplit(parsed._replace(query=query))  # type: ignore
 
 def add_or_replace_parameter(url, name, new_value):
+    # type: (str, str, str) -> str
     """Add or remove a parameter to a given url
 
     >>> import w3lib.url
@@ -243,6 +258,7 @@ def add_or_replace_parameters(url, new_parameters):
 
 
 def path_to_file_uri(path):
+    # type: (str) -> str
     """Convert local filesystem path to legal File URIs as described in:
     http://en.wikipedia.org/wiki/File_URI_scheme
     """
@@ -253,6 +269,7 @@ def path_to_file_uri(path):
 
 
 def file_uri_to_path(uri):
+    # type: (str) -> str
     """Convert File URI to local filesystem path according to:
     http://en.wikipedia.org/wiki/File_URI_scheme
     """
@@ -261,6 +278,7 @@ def file_uri_to_path(uri):
 
 
 def any_to_uri(uri_or_path):
+    # type: (str) -> str
     """If given a path name, return its File URI, otherwise return it
     unmodified
     """
diff --git a/w3lib/util.py b/w3lib/util.py
index d8513eef..3015525d 100644
--- a/w3lib/util.py
+++ b/w3lib/util.py
@@ -1,4 +1,8 @@
 import six
+from typing import AnyStr, Optional
+
+from ._types import String
+
 
 def str_to_unicode(text, encoding=None, errors='strict'):
     if encoding is None:
@@ -7,6 +11,7 @@ def str_to_unicode(text, encoding=None, errors='strict'):
         return text.decode(encoding, errors)
     return text
 
+
 def unicode_to_str(text, encoding=None, errors='strict'):
     if encoding is None:
         encoding = 'utf-8'
@@ -14,19 +19,23 @@ def unicode_to_str(text, encoding=None, errors='strict'):
         return text.encode(encoding, errors)
     return text
 
+
 def to_unicode(text, encoding=None, errors='strict'):
+    # type: (AnyStr, Optional[String], String) -> six.text_type
     """Return the unicode representation of a bytes object `text`. If `text`
     is already an unicode object, return it as-is."""
     if isinstance(text, six.text_type):
         return text
-    if not isinstance(text, (bytes, six.text_type)):
+    if not isinstance(text, bytes):
         raise TypeError('to_unicode must receive a bytes, str or unicode '
                         'object, got %s' % type(text).__name__)
     if encoding is None:
         encoding = 'utf-8'
     return text.decode(encoding, errors)
 
+
 def to_bytes(text, encoding=None, errors='strict'):
+    # type: (AnyStr, Optional[String], String) -> bytes
     """Return the binary representation of `text`. If `text`
     is already a bytes object, return it as-is."""
     if isinstance(text, bytes):
@@ -38,7 +47,9 @@ def to_bytes(text, encoding=None, errors='strict'):
         encoding = 'utf-8'
     return text.encode(encoding, errors)
 
+
 def to_native_str(text, encoding=None, errors='strict'):
+    # type: (AnyStr, Optional[String], String) -> str
     """ Return str representation of `text`
     (bytes in Python 2.x and unicode in Python 3.x). """
     if six.PY2:

From 528158fc708d04b8c2c4a17878876a977754b5ba Mon Sep 17 00:00:00 2001
From: Mikhail Korobov <kmike84@gmail.com>
Date: Fri, 8 Jul 2016 22:46:47 +0500
Subject: [PATCH 2/8] cleanup: clarify String uses, remove unused imports

---
 w3lib/_types.py   | 4 ++--
 w3lib/encoding.py | 3 +--
 w3lib/html.py     | 4 ++--
 w3lib/http.py     | 2 +-
 w3lib/url.py      | 2 +-
 5 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/w3lib/_types.py b/w3lib/_types.py
index e8e7117d..324a3e49 100644
--- a/w3lib/_types.py
+++ b/w3lib/_types.py
@@ -7,14 +7,14 @@
 2. Variable is binary; unicode is not accepted ==> use ``bytes``
 3. Variable is text, and it can be only unicode in Python 2 ==> use
    ``six.text_type``  (or typing.Text??)
-4. Variable is text, but it can be str in Python 2 ==> use w3lib._types.String
+4. Variable is text, but it can be an ASCII- or UTF-8-encoded str
+   in Python 2 ==> use w3lib._types.String
 5. Variable can be either bytes or unicode both in Python 2
    and Python 3 ==> use typing.AnyStr
 6. Variable should be str (==bytes) in Python 2
    and str (==unicode) in Python 3 ==> use ``str``.
 
 """
-
 from __future__ import absolute_import
 from typing import Union
 import six
diff --git a/w3lib/encoding.py b/w3lib/encoding.py
index b965baeb..e4060113 100644
--- a/w3lib/encoding.py
+++ b/w3lib/encoding.py
@@ -6,11 +6,10 @@
 import codecs
 import encodings  # type: ignore
 from sys import version_info
-from typing import Optional, Union, AnyStr, Tuple, Callable
+from typing import Optional, AnyStr, Tuple, Callable
 import six
 
 from .util import to_native_str
-from ._types import String
 
 _HEADER_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)
 
diff --git a/w3lib/html.py b/w3lib/html.py
index bbaf439a..68bca505 100644
--- a/w3lib/html.py
+++ b/w3lib/html.py
@@ -7,9 +7,9 @@
 import re
 import six
 from six import moves
-from typing import AnyStr, Optional, Iterable, Tuple, Union, Sequence
+from typing import AnyStr, Optional, Tuple, Sequence
 
-from w3lib.util import to_bytes, to_unicode, to_native_str
+from w3lib.util import to_unicode, to_native_str
 from w3lib.url import safe_url_string
 from w3lib._types import String
 
diff --git a/w3lib/http.py b/w3lib/http.py
index 599bdac7..e4fae9b5 100644
--- a/w3lib/http.py
+++ b/w3lib/http.py
@@ -1,6 +1,6 @@
 from __future__ import absolute_import
 from base64 import urlsafe_b64encode
-from typing import Dict, List, Tuple, Optional, Union, AnyStr, Any
+from typing import Dict, List, Optional, Union, Any
 
 from ._types import String
 
diff --git a/w3lib/url.py b/w3lib/url.py
index c73c1970..33377be7 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -16,7 +16,7 @@
                                     quote, parse_qs, parse_qsl,
                                     ParseResult, unquote, urlunparse)
 from six.moves.urllib.request import pathname2url, url2pathname
-from typing import AnyStr, Tuple, Union, Set, Sequence, TypeVar
+from typing import AnyStr, Union, Set, Sequence, TypeVar
 from w3lib.util import to_bytes, to_native_str, to_unicode
 from w3lib._types import String
 

From b69e9757129849bd850a86f143feaccfc3b0887b Mon Sep 17 00:00:00 2001
From: Mikhail Korobov <kmike84@gmail.com>
Date: Fri, 8 Jul 2016 23:57:05 +0500
Subject: [PATCH 3/8] more precise result value types

thanks @lopuhin for the catch
---
 w3lib/encoding.py | 7 ++++---
 w3lib/html.py     | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/w3lib/encoding.py b/w3lib/encoding.py
index e4060113..5a04528a 100644
--- a/w3lib/encoding.py
+++ b/w3lib/encoding.py
@@ -6,7 +6,7 @@
 import codecs
 import encodings  # type: ignore
 from sys import version_info
-from typing import Optional, AnyStr, Tuple, Callable
+from typing import Optional, AnyStr, Tuple, Callable, Union
 import six
 
 from .util import to_native_str
@@ -163,7 +163,7 @@ def resolve_encoding(encoding_alias):
 
 
 def read_bom(data):
-    # type: (bytes) -> Tuple[str, bytes]
+    # type: (bytes) -> Union[Tuple[str, bytes], Tuple[None, None]]
     r"""Read the byte order mark in the text, if present, and
     return the encoding represented by the BOM and the BOM.
 
@@ -271,7 +271,8 @@ def html_to_unicode(content_type_header, html_body_str,
     '''
 
     enc = http_content_type_encoding(content_type_header)
-    bom_enc, bom = read_bom(html_body_str)
+    # FIXME: remove type: ignore when mypy bug is fixed
+    bom_enc, bom = read_bom(html_body_str)  # type: ignore
     if enc is not None:
         # remove BOM if it agrees with the encoding
         if enc == bom_enc:
diff --git a/w3lib/html.py b/w3lib/html.py
index 68bca505..0b0a4b5e 100644
--- a/w3lib/html.py
+++ b/w3lib/html.py
@@ -7,7 +7,7 @@
 import re
 import six
 from six import moves
-from typing import AnyStr, Optional, Tuple, Sequence
+from typing import AnyStr, Optional, Tuple, Sequence, Union
 
 from w3lib.util import to_unicode, to_native_str
 from w3lib.url import safe_url_string
@@ -317,7 +317,7 @@ def get_base_url(text, baseurl='', encoding='utf-8'):
 
 
 def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')):
-    # type: (AnyStr, str, String, Sequence[String]) -> Tuple[Optional[float], Optional[str]]
+    # type: (AnyStr, str, String, Sequence[String]) -> Union[Tuple[float, str], Tuple[None, None]]
     """Return  the http-equiv parameter of the HTML meta element from the given
     HTML text and return a tuple ``(interval, url)`` where interval is a number
     containing the delay in seconds (or zero if not present) and url is a

From ed46d4689a636372c144340895aa2e2d5aeba6a9 Mon Sep 17 00:00:00 2001
From: Lucy Wang <wxitb2017@gmail.com>
Date: Fri, 2 Nov 2018 14:21:05 +0800
Subject: [PATCH 4/8] use py37 for mypy

---
 .travis.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 9b6642a9..ad47a3ac 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -18,9 +18,9 @@ matrix:
       sudo: true
     - python: 3.5
       env: TOXENV=pypy3
-    - python: 3.6
+    - python: 3.7
       env: TOXENV=mypy2
-    - python: 3.6
+    - python: 3.7
       env: TOXENV=mypy3
 
 install:

From 04aedad266862ad19adac27b664bf502f5d251f7 Mon Sep 17 00:00:00 2001
From: Lucy Wang <wxitb2017@gmail.com>
Date: Fri, 2 Nov 2018 14:30:51 +0800
Subject: [PATCH 5/8] revert to py36 for mypy

---
 .travis.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index ad47a3ac..9b6642a9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -18,9 +18,9 @@ matrix:
       sudo: true
     - python: 3.5
       env: TOXENV=pypy3
-    - python: 3.7
+    - python: 3.6
       env: TOXENV=mypy2
-    - python: 3.7
+    - python: 3.6
       env: TOXENV=mypy3
 
 install:

From d2d6e6870196f9db1cfa3caa731e590bd0a06767 Mon Sep 17 00:00:00 2001
From: Lucy Wang <wxitb2017@gmail.com>
Date: Fri, 2 Nov 2018 18:51:03 +0800
Subject: [PATCH 6/8] mypy-lang is now mypy

---
 tox.ini | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tox.ini b/tox.ini
index 0b213434..a43b8b3a 100644
--- a/tox.ini
+++ b/tox.ini
@@ -20,7 +20,7 @@ commands =
 [testenv:mypy2]
 basepython = python3.6
 deps =
-    mypy-lang
+    mypy
     typing
 commands =
     mypy --py2 w3lib tests
@@ -29,7 +29,6 @@ commands =
 [testenv:mypy3]
 basepython = python3.6
 deps =
-    mypy-lang
-    typing
+    mypy
 commands =
     mypy w3lib tests

From fe2213f522b502c7dcff610d0878e0e8dc3afd9d Mon Sep 17 00:00:00 2001
From: Lucy Wang <wxitb2017@gmail.com>
Date: Thu, 24 Jan 2019 11:25:40 +0800
Subject: [PATCH 7/8] fix mypy errors

---
 .gitignore        |  3 ++-
 w3lib/encoding.py | 11 +++++++----
 w3lib/http.py     |  2 +-
 w3lib/url.py      |  6 +++---
 4 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3fe67fd1..e967c85a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,5 @@ dist
 docs/_build
 _trial_temp
 .coverage
-.cache
\ No newline at end of file
+.cache
+.mypy_cache/
\ No newline at end of file
diff --git a/w3lib/encoding.py b/w3lib/encoding.py
index 5a04528a..3462934f 100644
--- a/w3lib/encoding.py
+++ b/w3lib/encoding.py
@@ -6,7 +6,7 @@
 import codecs
 import encodings  # type: ignore
 from sys import version_info
-from typing import Optional, AnyStr, Tuple, Callable, Union
+from typing import Optional, AnyStr, Tuple, Callable, Union, cast
 import six
 
 from .util import to_native_str
@@ -15,7 +15,7 @@
 
 
 def http_content_type_encoding(content_type):
-    # type: (str) -> Optional[str]
+    # type: (Optional[str]) -> Optional[str]
     """Extract the encoding in the content-type header
 
     >>> import w3lib.encoding
@@ -28,6 +28,7 @@ def http_content_type_encoding(content_type):
         match = _HEADER_ENCODING_RE.search(content_type)
         if match:
             return resolve_encoding(match.group(1))
+    return None
 
 
 # regexp for parsing HTTP meta tags
@@ -91,6 +92,7 @@ def html_body_declared_encoding(html_body_str):
                 or match.group('xmlcharset')
         if encoding:
             return resolve_encoding(to_native_str(encoding))
+    return None
 
 
 # Default encoding translation
@@ -127,8 +129,8 @@ def _c18n_encoding(encoding):
     This performs normalization and translates aliases using python's
     encoding aliases
     """
-    normed = encodings.normalize_encoding(encoding).lower()
-    return encodings.aliases.aliases.get(normed, normed)
+    normed = encodings.normalize_encoding(encoding).lower()  # type: ignore
+    return encodings.aliases.aliases.get(normed, normed)  # type: ignore
 
 
 def resolve_encoding(encoding_alias):
@@ -273,6 +275,7 @@ def html_to_unicode(content_type_header, html_body_str,
     enc = http_content_type_encoding(content_type_header)
     # FIXME: remove type: ignore when mypy bug is fixed
     bom_enc, bom = read_bom(html_body_str)  # type: ignore
+    bom = cast(bytes, bom)
     if enc is not None:
         # remove BOM if it agrees with the encoding
         if enc == bom_enc:
diff --git a/w3lib/http.py b/w3lib/http.py
index e4fae9b5..848fac7a 100644
--- a/w3lib/http.py
+++ b/w3lib/http.py
@@ -35,7 +35,7 @@ def headers_raw_to_dict(headers_raw):
     headers = headers_raw.splitlines()
     headers_tuples = [header.split(b':', 1) for header in headers]
 
-    result_dict = {}
+    result_dict = {} # type: Dict[bytes, List[bytes]]
     for header_item in headers_tuples:
         if not len(header_item) == 2:
             continue
diff --git a/w3lib/url.py b/w3lib/url.py
index 33377be7..e12b0a61 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -70,7 +70,7 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
     try:
         netloc = parts.netloc.encode('idna')
     except UnicodeError:
-        netloc = parts.netloc
+        netloc = parts.netloc  # type: ignore
 
     # quote() in Python2 return type follows input type;
     # quote() in Python3 always returns Unicode (native str)
@@ -116,7 +116,7 @@ def is_url(text):
     return text.partition("://")[0] in ('file', 'http', 'https')
 
 
-def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
+def url_query_parameter(url, parameter, default=None, keep_blank_values=False):
     # type: (str, String, T, bool) -> Union[str, T]
     """Return the value of a url parameter, given the url and parameter name
 
@@ -543,7 +543,7 @@ def parse_url(url, encoding=None):
 
 
 if not six.PY2:
-    from urllib.parse import _coerce_args, unquote_to_bytes
+    from urllib.parse import _coerce_args, unquote_to_bytes  # type: ignore
 
     def parse_qsl_to_bytes(qs, keep_blank_values=False):
         """Parse a query given as a string argument.

From 8887f4d8c0ea14decbeff8c552c679aaf26bc354 Mon Sep 17 00:00:00 2001
From: Lucy Wang <wxitb2017@gmail.com>
Date: Wed, 30 Jan 2019 13:18:24 +0800
Subject: [PATCH 8/8] more mypy fixes

---
 w3lib/encoding.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/w3lib/encoding.py b/w3lib/encoding.py
index 3462934f..3d613fc1 100644
--- a/w3lib/encoding.py
+++ b/w3lib/encoding.py
@@ -85,7 +85,7 @@ def html_body_declared_encoding(html_body_str):
     if isinstance(chunk, bytes):
         match = _BODY_ENCODING_BYTES_RE.search(chunk)
     else:
-        match = _BODY_ENCODING_STR_RE.search(chunk)
+        match = _BODY_ENCODING_STR_RE.search(chunk)  # type: ignore
 
     if match:
         encoding = match.group('charset') or match.group('charset2') \