scrapy · lucywang000 · Jul 7, 2016 · Jul 8, 2016 · Jul 8, 2016 · Nov 2, 2018
diff --git a/.gitignore b/.gitignore
@@ -8,4 +8,5 @@ dist
 docs/_build
 _trial_temp
 .coverage
-.cache
+.cache
+.mypy_cache/
diff --git a/.travis.yml b/.travis.yml
@@ -18,6 +18,10 @@ matrix:
       sudo: true
     - python: 3.5
       env: TOXENV=pypy3
+    - python: 3.6
+      env: TOXENV=mypy2
+    - python: 3.6
+      env: TOXENV=mypy3
 
 install:
   - |

diff --git a/setup.py b/setup.py
@@ -29,5 +29,5 @@
         'Programming Language :: Python :: Implementation :: PyPy',
         'Topic :: Internet :: WWW/HTTP',
     ],
-    install_requires=['six >= 1.4.1'],
+    install_requires=['six >= 1.4.1', 'typing; python_version < "3.5"'],
 )
diff --git a/tests/test_encoding.py b/tests/test_encoding.py
@@ -3,6 +3,7 @@
 from w3lib.encoding import (html_body_declared_encoding, read_bom, to_unicode,
         http_content_type_encoding, resolve_encoding, html_to_unicode)
 
+
 class RequestEncodingTests(unittest.TestCase):
     utf8_fragments = [
         # Content-Type as meta http-equiv
@@ -51,6 +52,7 @@ def test_html_body_declared_encoding(self):
         for fragment in self.utf8_fragments:
             encoding = html_body_declared_encoding(fragment)
             self.assertEqual(encoding, 'utf-8', fragment)
+
         self.assertEqual(None, html_body_declared_encoding(b"something else"))
         self.assertEqual(None, html_body_declared_encoding(b"""
             <head></head><body>
@@ -76,6 +78,11 @@ def test_html_body_declared_encoding_unicode(self):
         self.assertEqual(None, html_body_declared_encoding(
             u"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""))
 
+    def test_html_body_declared_encoding_aliases(self):
+        fragment = b"""<meta http-equiv="content-type" content="text/html;charset=win-1251"/>"""
+        self.assertEqual("cp1251", html_body_declared_encoding(fragment))
+        self.assertEqual("cp1251", html_body_declared_encoding(fragment.decode('utf8')))
+
 
 class CodecsEncodingTestCase(unittest.TestCase):
     def test_resolve_encoding(self):
@@ -97,9 +104,11 @@ def test_invalid_utf8(self):
 def ct(charset):
     return "Content-Type: text/html; charset=" + charset if charset else None
 
+
 def norm_encoding(enc):
     return codecs.lookup(enc).name
 
+
 class HtmlConversionTests(unittest.TestCase):
 
     def test_unicode_body(self):

diff --git a/tox.ini b/tox.ini
@@ -4,7 +4,7 @@
 # and then run "tox" from this directory.
 
 [tox]
-envlist = py27, pypy, py34, py35, py36, py37, pypy3
+envlist = py27, pypy, py34, py35, py36, py37, pypy3, mypy2, mypy3
 
 [testenv]
 deps =
@@ -15,3 +15,20 @@ commands =
         --doctest-modules \
         --cov=w3lib --cov-report=term \
         {posargs:w3lib tests}
+
+
+[testenv:mypy2]
+basepython = python3.6
+deps =
+    mypy
+    typing
+commands =
+    mypy --py2 w3lib tests
+
+
+[testenv:mypy3]
+basepython = python3.6
+deps =
+    mypy
+commands =
+    mypy w3lib tests
diff --git a/w3lib/_types.py b/w3lib/_types.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+"""
+Which string type to use?
+=========================
+
+1. Variable is a URL ==> use ``str``
+2. Variable is binary; unicode is not accepted ==> use ``bytes``
+3. Variable is text, and it can only be unicode in Python 2 ==> use
+   ``six.text_type`` (equivalent to ``typing.Text``)
+4. Variable is text, but it can be ascii or utf8-encoded str
+   in Python 2 ==> use w3lib._types.String
+5. Variable can be either bytes or unicode both in Python 2
+   and Python 3 ==> use typing.AnyStr
+6. Variable should be str (==bytes) in Python 2
+   and str (==unicode) in Python 3 ==> use ``str``.
+
+"""
+from __future__ import absolute_import
+from typing import Union
+import six
+
+if six.PY2:
+    String = Union[bytes, unicode]
+else:
+    String = str
diff --git a/w3lib/encoding.py b/w3lib/encoding.py
@@ -2,12 +2,20 @@
 """
 Functions for handling encoding of web pages
 """
-import re, codecs, encodings
+import re
+import codecs
+import encodings  # type: ignore
 from sys import version_info
+from typing import Optional, AnyStr, Tuple, Callable, Union, cast
+import six
+
+from .util import to_native_str
 
 _HEADER_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)
 
+
 def http_content_type_encoding(content_type):
+    # type: (Optional[str]) -> Optional[str]
     """Extract the encoding in the content-type header
 
     >>> import w3lib.encoding
@@ -20,6 +28,8 @@ def http_content_type_encoding(content_type):
         match = _HEADER_ENCODING_RE.search(content_type)
         if match:
             return resolve_encoding(match.group(1))
+    return None
+
 
 # regexp for parsing HTTP meta tags
 _TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?'''
@@ -40,13 +50,15 @@ def http_content_type_encoding(content_type):
 _XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)')
 
 # check for meta tags, or xml decl. and stop search if a body tag is encountered
-_BODY_ENCODING_PATTERN = r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % (
+_BODY_ENCODING_PATTERN = six.u(r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)') % (
     _SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
 _BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I | re.VERBOSE)
 _BODY_ENCODING_BYTES_RE = re.compile(_BODY_ENCODING_PATTERN.encode('ascii'),
                                      re.I | re.VERBOSE)
 
+
 def html_body_declared_encoding(html_body_str):
+    # type: (AnyStr) -> Optional[str]
     '''Return the encoding specified in meta tags in the html body,
     or ``None`` if no suitable encoding was found
 
@@ -73,13 +85,15 @@ def html_body_declared_encoding(html_body_str):
     if isinstance(chunk, bytes):
         match = _BODY_ENCODING_BYTES_RE.search(chunk)
     else:
-        match = _BODY_ENCODING_STR_RE.search(chunk)
+        match = _BODY_ENCODING_STR_RE.search(chunk)  # type: ignore
 
     if match:
         encoding = match.group('charset') or match.group('charset2') \
                 or match.group('xmlcharset')
         if encoding:
-            return resolve_encoding(encoding)
+            return resolve_encoding(to_native_str(encoding))
+    return None
+
 
 # Default encoding translation
 # this maps cannonicalized encodings to target encodings
@@ -109,15 +123,18 @@ def html_body_declared_encoding(html_body_str):
 }
 
 def _c18n_encoding(encoding):
+    # type: (AnyStr) -> str
     """Cannonicalize an encoding name
 
     This performs normalization and translates aliases using python's
     encoding aliases
     """
-    normed = encodings.normalize_encoding(encoding).lower()
-    return encodings.aliases.aliases.get(normed, normed)
+    normed = encodings.normalize_encoding(encoding).lower()  # type: ignore
+    return encodings.aliases.aliases.get(normed, normed)  # type: ignore
+
 
 def resolve_encoding(encoding_alias):
+    # type: (AnyStr) -> Optional[str]
     """Return the encoding that `encoding_alias` maps to, or ``None``
     if the encoding cannot be interpreted
 
@@ -136,6 +153,7 @@ def resolve_encoding(encoding_alias):
     except LookupError:
         return None
 
+
 _BOM_TABLE = [
     (codecs.BOM_UTF32_BE, 'utf-32-be'),
     (codecs.BOM_UTF32_LE, 'utf-32-le'),
@@ -145,21 +163,23 @@ def resolve_encoding(encoding_alias):
 ]
 _FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE)
 
+
 def read_bom(data):
+    # type: (bytes) -> Union[Tuple[str, bytes], Tuple[None, None]]
     r"""Read the byte order mark in the text, if present, and
     return the encoding represented by the BOM and the BOM.
 
     If no BOM can be detected, ``(None, None)`` is returned.
 
     >>> import w3lib.encoding
     >>> w3lib.encoding.read_bom(b'\xfe\xff\x6c\x34')
-    ('utf-16-be', '\xfe\xff')
+    ('utf-16-be', b'\xfe\xff')
     >>> w3lib.encoding.read_bom(b'\xff\xfe\x34\x6c')
-    ('utf-16-le', '\xff\xfe')
+    ('utf-16-le', b'\xff\xfe')
     >>> w3lib.encoding.read_bom(b'\x00\x00\xfe\xff\x00\x00\x6c\x34')
-    ('utf-32-be', '\x00\x00\xfe\xff')
+    ('utf-32-be', b'\x00\x00\xfe\xff')
     >>> w3lib.encoding.read_bom(b'\xff\xfe\x00\x00\x34\x6c\x00\x00')
-    ('utf-32-le', '\xff\xfe\x00\x00')
+    ('utf-32-le', b'\xff\xfe\x00\x00')
     >>> w3lib.encoding.read_bom(b'\x01\x02\x03\x04')
     (None, None)
     >>>
@@ -175,18 +195,21 @@ def read_bom(data):
 
 # Python decoder doesn't follow unicode standard when handling
 # bad utf-8 encoded strings. see http://bugs.python.org/issue8271
-codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.end))
+codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.end))  # type: ignore
 
 def to_unicode(data_str, encoding):
+    # type: (bytes, str) -> six.text_type
     """Convert a str object to unicode using the encoding given
 
     Characters that cannot be converted will be converted to ``\\ufffd`` (the
     unicode replacement character).
     """
     return data_str.decode(encoding, 'replace' if version_info[0:2] >= (3, 3) else 'w3lib_replace')
 
+
 def html_to_unicode(content_type_header, html_body_str,
         default_encoding='utf8', auto_detect_fun=None):
+    # type: (Optional[str], bytes, str, Optional[Callable[[bytes], str]]) -> Tuple[str, six.text_type]
     r'''Convert raw html bytes to unicode
 
     This attempts to make a reasonable guess at the content encoding of the
@@ -250,7 +273,9 @@ def html_to_unicode(content_type_header, html_body_str,
     '''
 
     enc = http_content_type_encoding(content_type_header)
-    bom_enc, bom = read_bom(html_body_str)
+    # FIXME: remove type: ignore when mypy bug is fixed
+    bom_enc, bom = read_bom(html_body_str)  # type: ignore
+    bom = cast(bytes, bom)
     if enc is not None:
         # remove BOM if it agrees with the encoding
         if enc == bom_enc: