scrapy · kmike · Jul 7, 2016 · Jul 8, 2016 · Jul 8, 2016 · eliasdorneles
diff --git a/.travis.yml b/.travis.yml
@@ -7,6 +7,8 @@ env:
   - TOXENV=py33
   - TOXENV=py34
   - TOXENV=py35
+  - TOXENV=mypy2
+  - TOXENV=mypy3
 
 install:
   - pip install -U tox twine wheel codecov

diff --git a/setup.py b/setup.py
@@ -27,5 +27,5 @@
         'Programming Language :: Python :: Implementation :: PyPy',
         'Topic :: Internet :: WWW/HTTP',
     ],
-    install_requires=['six >= 1.4.1'],
+    install_requires=['six >= 1.4.1', 'typing'],
 )
diff --git a/tests/test_encoding.py b/tests/test_encoding.py
@@ -3,6 +3,7 @@
 from w3lib.encoding import (html_body_declared_encoding, read_bom, to_unicode,
         http_content_type_encoding, resolve_encoding, html_to_unicode)
 
+
 class RequestEncodingTests(unittest.TestCase):
     utf8_fragments = [
         # Content-Type as meta http-equiv
@@ -51,6 +52,7 @@ def test_html_body_declared_encoding(self):
         for fragment in self.utf8_fragments:
             encoding = html_body_declared_encoding(fragment)
             self.assertEqual(encoding, 'utf-8', fragment)
+
         self.assertEqual(None, html_body_declared_encoding(b"something else"))
         self.assertEqual(None, html_body_declared_encoding(b"""
             <head></head><body>
@@ -76,6 +78,11 @@ def test_html_body_declared_encoding_unicode(self):
         self.assertEqual(None, html_body_declared_encoding(
             u"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""))
 
+    def test_html_body_declared_encoding_aliases(self):
+        fragment = b"""<meta http-equiv="content-type" content="text/html;charset=win-1251"/>"""
+        self.assertEqual("cp1251", html_body_declared_encoding(fragment))
+        self.assertEqual("cp1251", html_body_declared_encoding(fragment.decode('utf8')))
+
 
 class CodecsEncodingTestCase(unittest.TestCase):
     def test_resolve_encoding(self):
@@ -97,9 +104,11 @@ def test_invalid_utf8(self):
 def ct(charset):
     return "Content-Type: text/html; charset=" + charset if charset else None
 
+
 def norm_encoding(enc):
     return codecs.lookup(enc).name
 
+
 class HtmlConversionTests(unittest.TestCase):
 
     def test_unicode_body(self):

diff --git a/tox.ini b/tox.ini
@@ -4,11 +4,29 @@
 # and then run "tox" from this directory.
 
 [tox]
-envlist = py27, pypy, py33, py34, py35
+envlist = py27, pypy, py33, py34, py35, mypy2, mypy3
 
 [testenv]
 deps =
     pytest
     pytest-cov
 commands =
     py.test --cov=w3lib --cov-report= {posargs:w3lib tests}
+
+
+[testenv:mypy2]
+basepython = python3.5
+deps =
+    mypy-lang
+    typing
+commands =
+    mypy --py2 w3lib tests
+
+
+[testenv:mypy3]
+basepython = python3.5
+deps =
+    mypy-lang
+    typing
+commands =
+    mypy w3lib tests
diff --git a/w3lib/_types.py b/w3lib/_types.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+"""
+Which string type to use?
+=========================
+
+1. Variable is a URL ==> use ``str``
+2. Variable is binary; unicode is not accepted ==> use ``bytes``
+3. Variable is text, and it can only be unicode in Python 2 ==> use
+   ``six.text_type`` (TODO: decide whether to use ``typing.Text`` instead)
+4. Variable is text, but it can be an ascii or utf8-encoded str
+   in Python 2 ==> use ``w3lib._types.String``
+5. Variable can be either bytes or unicode in both Python 2
+   and Python 3 ==> use ``typing.AnyStr``
+6. Variable should be str (==bytes) in Python 2
+   and str (==unicode) in Python 3 ==> use ``str``
+
+"""
+from __future__ import absolute_import
+from typing import Union
+import six
+
+if six.PY2:
+    String = Union[bytes, unicode]
+else:
+    String = str
diff --git a/w3lib/encoding.py b/w3lib/encoding.py
@@ -2,11 +2,20 @@
 """
 Functions for handling encoding of web pages
 """
-import re, codecs, encodings
+import re
+import codecs
+import encodings  # type: ignore
+from typing import Optional, AnyStr, Tuple, Callable, Union
+import six
+
+from .util import to_native_str
+
 
 _HEADER_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)
 
+
 def http_content_type_encoding(content_type):
+    # type: (str) -> Optional[str]
     """Extract the encoding in the content-type header
 
     >>> import w3lib.encoding
@@ -20,6 +29,7 @@ def http_content_type_encoding(content_type):
         if match:
             return resolve_encoding(match.group(1))
 
+
 # regexp for parsing HTTP meta tags
 _TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?'''
 _SKIP_ATTRS = '''(?x)(?:\\s+
@@ -39,12 +49,14 @@ def http_content_type_encoding(content_type):
 _XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)')
 
 # check for meta tags, or xml decl. and stop search if a body tag is encountered
-_BODY_ENCODING_PATTERN = r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % (
+_BODY_ENCODING_PATTERN = six.u(r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)') % (
     _SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
 _BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I)
 _BODY_ENCODING_BYTES_RE = re.compile(_BODY_ENCODING_PATTERN.encode('ascii'), re.I)
 
+
 def html_body_declared_encoding(html_body_str):
+    # type: (AnyStr) -> Optional[str]
     '''Return the encoding specified in meta tags in the html body,
     or ``None`` if no suitable encoding was found
 
@@ -77,7 +89,8 @@ def html_body_declared_encoding(html_body_str):
         encoding = match.group('charset') or match.group('charset2') \
                 or match.group('xmlcharset')
         if encoding:
-            return resolve_encoding(encoding)
+            return resolve_encoding(to_native_str(encoding))
+
 
 # Default encoding translation
 # this maps cannonicalized encodings to target encodings
@@ -107,6 +120,7 @@ def html_body_declared_encoding(html_body_str):
 }
 
 def _c18n_encoding(encoding):
+    # type: (AnyStr) -> str
     """Cannonicalize an encoding name
 
     This performs normalization and translates aliases using python's
@@ -115,7 +129,9 @@ def _c18n_encoding(encoding):
     normed = encodings.normalize_encoding(encoding).lower()
     return encodings.aliases.aliases.get(normed, normed)
 
+
 def resolve_encoding(encoding_alias):
+    # type: (AnyStr) -> Optional[str]
     """Return the encoding that `encoding_alias` maps to, or ``None``
     if the encoding cannot be interpreted
 
@@ -134,6 +150,7 @@ def resolve_encoding(encoding_alias):
     except LookupError:
         return None
 
+
 _BOM_TABLE = [
     (codecs.BOM_UTF32_BE, 'utf-32-be'),
     (codecs.BOM_UTF32_LE, 'utf-32-le'),
@@ -143,21 +160,23 @@ def resolve_encoding(encoding_alias):
 ]
 _FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE)
 
+
 def read_bom(data):
+    # type: (bytes) -> Union[Tuple[str, bytes], Tuple[None, None]]
     r"""Read the byte order mark in the text, if present, and
     return the encoding represented by the BOM and the BOM.
 
     If no BOM can be detected, ``(None, None)`` is returned.
 
     >>> import w3lib.encoding
     >>> w3lib.encoding.read_bom(b'\xfe\xff\x6c\x34')
-    ('utf-16-be', '\xfe\xff')
+    ('utf-16-be', b'\xfe\xff')
     >>> w3lib.encoding.read_bom(b'\xff\xfe\x34\x6c')
-    ('utf-16-le', '\xff\xfe')
+    ('utf-16-le', b'\xff\xfe')
     >>> w3lib.encoding.read_bom(b'\x00\x00\xfe\xff\x00\x00\x6c\x34')
-    ('utf-32-be', '\x00\x00\xfe\xff')
+    ('utf-32-be', b'\x00\x00\xfe\xff')
     >>> w3lib.encoding.read_bom(b'\xff\xfe\x00\x00\x34\x6c\x00\x00')
-    ('utf-32-le', '\xff\xfe\x00\x00')
+    ('utf-32-le', b'\xff\xfe\x00\x00')
     >>> w3lib.encoding.read_bom(b'\x01\x02\x03\x04')
     (None, None)
     >>>
@@ -173,18 +192,22 @@ def read_bom(data):
 
 # Python decoder doesn't follow unicode standard when handling
 # bad utf-8 encoded strings. see http://bugs.python.org/issue8271
-codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.start+1))
+codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.start+1))  # type: ignore
+
 
 def to_unicode(data_str, encoding):
+    # type: (bytes, str) -> six.text_type
     """Convert a str object to unicode using the encoding given
 
     Characters that cannot be converted will be converted to ``\\ufffd`` (the
     unicode replacement character).
     """
     return data_str.decode(encoding, 'w3lib_replace')
 
+
 def html_to_unicode(content_type_header, html_body_str,
         default_encoding='utf8', auto_detect_fun=None):
+    # type: (Optional[str], bytes, str, Optional[Callable[[bytes], str]]) -> Tuple[str, six.text_type]
     r'''Convert raw html bytes to unicode
 
     This attempts to make a reasonable guess at the content encoding of the
@@ -230,7 +253,7 @@ def html_to_unicode(content_type_header, html_body_str,
 
     >>> import w3lib.encoding
     >>> w3lib.encoding.html_to_unicode(None,
-    ... """<!DOCTYPE html>
+    ... b"""<!DOCTYPE html>
     ... <head>
     ... <meta charset="UTF-8" />
     ... <meta name="viewport" content="width=device-width" />
@@ -248,7 +271,8 @@ def html_to_unicode(content_type_header, html_body_str,
     '''
 
     enc = http_content_type_encoding(content_type_header)
-    bom_enc, bom = read_bom(html_body_str)
+    # FIXME: drop "type: ignore" once mypy is fixed (presumably unpacking read_bom's Union of tuples - confirm)
+    bom_enc, bom = read_bom(html_body_str)  # type: ignore
     if enc is not None:
         # remove BOM if it agrees with the encoding
         if enc == bom_enc: