Skip to content

Type hints #61

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ env:
- TOXENV=py33
- TOXENV=py34
- TOXENV=py35
- TOXENV=mypy2
- TOXENV=mypy3

install:
- pip install -U tox twine wheel codecov
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,5 @@
'Programming Language :: Python :: Implementation :: PyPy',
'Topic :: Internet :: WWW/HTTP',
],
install_requires=['six >= 1.4.1'],
install_requires=['six >= 1.4.1', 'typing'],
)
9 changes: 9 additions & 0 deletions tests/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from w3lib.encoding import (html_body_declared_encoding, read_bom, to_unicode,
http_content_type_encoding, resolve_encoding, html_to_unicode)


class RequestEncodingTests(unittest.TestCase):
utf8_fragments = [
# Content-Type as meta http-equiv
Expand Down Expand Up @@ -51,6 +52,7 @@ def test_html_body_declared_encoding(self):
for fragment in self.utf8_fragments:
encoding = html_body_declared_encoding(fragment)
self.assertEqual(encoding, 'utf-8', fragment)

self.assertEqual(None, html_body_declared_encoding(b"something else"))
self.assertEqual(None, html_body_declared_encoding(b"""
<head></head><body>
Expand All @@ -76,6 +78,11 @@ def test_html_body_declared_encoding_unicode(self):
self.assertEqual(None, html_body_declared_encoding(
u"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""))

def test_html_body_declared_encoding_aliases(self):
    """A ``win-1251`` charset alias resolves to ``cp1251`` for both
    bytes and unicode html bodies."""
    raw = b"""<meta http-equiv="content-type" content="text/html;charset=win-1251"/>"""
    # Check the bytes body first, then the same markup decoded to text.
    for body in (raw, raw.decode('utf8')):
        self.assertEqual("cp1251", html_body_declared_encoding(body))


class CodecsEncodingTestCase(unittest.TestCase):
def test_resolve_encoding(self):
Expand All @@ -97,9 +104,11 @@ def test_invalid_utf8(self):
def ct(charset):
    """Build a ``Content-Type`` header line declaring ``charset``.

    Returns ``None`` when ``charset`` is empty or ``None``.
    """
    if not charset:
        return None
    return "Content-Type: text/html; charset=" + charset


def norm_encoding(enc):
    """Return the name of the codec that ``codecs.lookup`` finds for ``enc``."""
    codec_info = codecs.lookup(enc)
    return codec_info.name


class HtmlConversionTests(unittest.TestCase):

def test_unicode_body(self):
Expand Down
20 changes: 19 additions & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,29 @@
# and then run "tox" from this directory.

[tox]
envlist = py27, pypy, py33, py34, py35
envlist = py27, pypy, py33, py34, py35, mypy2, mypy3

[testenv]
deps =
pytest
pytest-cov
commands =
py.test --cov=w3lib --cov-report= {posargs:w3lib tests}


[testenv:mypy2]
basepython = python3.5
deps =
mypy-lang
typing
commands =
mypy --py2 w3lib tests


[testenv:mypy3]
basepython = python3.5
deps =
mypy-lang
typing
commands =
mypy w3lib tests
25 changes: 25 additions & 0 deletions w3lib/_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
"""
Which string type to use?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

❤️

=========================

1. Variable is a URL ==> use ``str``
2. Variable is binary; unicode is not accepted ==> use ``bytes``
3. Variable is text, and it can be only unicode in Python 2 ==> use
``six.text_type`` (or typing.Text??)
4. Variable is text, but it can be ascii or utf8-encoded str
in Python 2 ==> use w3lib._types.String
5. Variable can be either bytes or unicode both in Python 2
and Python 3 ==> use typing.AnyStr
6. Variable should be str (==bytes) in Python 2
and str (==unicode) in Python 3 ==> use ``str``.

"""
from __future__ import absolute_import
from typing import Union
import six

if six.PY2:
String = Union[bytes, unicode]
else:
String = str
44 changes: 34 additions & 10 deletions w3lib/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,20 @@
"""
Functions for handling encoding of web pages
"""
import re, codecs, encodings
import re
import codecs
import encodings # type: ignore
from typing import Optional, AnyStr, Tuple, Callable, Union
import six

from .util import to_native_str


_HEADER_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)


def http_content_type_encoding(content_type):
# type: (str) -> Optional[str]
"""Extract the encoding in the content-type header

>>> import w3lib.encoding
Expand All @@ -20,6 +29,7 @@ def http_content_type_encoding(content_type):
if match:
return resolve_encoding(match.group(1))


# regexp for parsing HTTP meta tags
_TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?'''
_SKIP_ATTRS = '''(?x)(?:\\s+
Expand All @@ -39,12 +49,14 @@ def http_content_type_encoding(content_type):
_XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)')

# check for meta tags, or xml decl. and stop search if a body tag is encountered
_BODY_ENCODING_PATTERN = r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % (
_BODY_ENCODING_PATTERN = six.u(r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)') % (
_SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
_BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I)
_BODY_ENCODING_BYTES_RE = re.compile(_BODY_ENCODING_PATTERN.encode('ascii'), re.I)


def html_body_declared_encoding(html_body_str):
# type: (AnyStr) -> Optional[str]
'''Return the encoding specified in meta tags in the html body,
or ``None`` if no suitable encoding was found

Expand Down Expand Up @@ -77,7 +89,8 @@ def html_body_declared_encoding(html_body_str):
encoding = match.group('charset') or match.group('charset2') \
or match.group('xmlcharset')
if encoding:
return resolve_encoding(encoding)
return resolve_encoding(to_native_str(encoding))


# Default encoding translation
# this maps canonicalized encodings to target encodings
Expand Down Expand Up @@ -107,6 +120,7 @@ def html_body_declared_encoding(html_body_str):
}

def _c18n_encoding(encoding):
# type: (AnyStr) -> str
"""Canonicalize an encoding name

This performs normalization and translates aliases using python's
Expand All @@ -115,7 +129,9 @@ def _c18n_encoding(encoding):
normed = encodings.normalize_encoding(encoding).lower()
return encodings.aliases.aliases.get(normed, normed)


def resolve_encoding(encoding_alias):
# type: (AnyStr) -> Optional[str]
"""Return the encoding that `encoding_alias` maps to, or ``None``
if the encoding cannot be interpreted

Expand All @@ -134,6 +150,7 @@ def resolve_encoding(encoding_alias):
except LookupError:
return None


_BOM_TABLE = [
(codecs.BOM_UTF32_BE, 'utf-32-be'),
(codecs.BOM_UTF32_LE, 'utf-32-le'),
Expand All @@ -143,21 +160,23 @@ def resolve_encoding(encoding_alias):
]
_FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE)


def read_bom(data):
# type: (bytes) -> Union[Tuple[str, bytes], Tuple[None, None]]
r"""Read the byte order mark in the text, if present, and
return the encoding represented by the BOM and the BOM.

If no BOM can be detected, ``(None, None)`` is returned.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A nitpick - this is not reflected by the type

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah; currently mypy assumes that None is a valid value for any type, so Tuple[str, bytes] allows None, None. I recall using Union[Tuple[str, bytes], Tuple[None, None]] in some other place, but I'm not sure it works.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I didn't know that, thanks! mypy's behavior makes sense for tuples, although it's not what I would expect in general :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They have docs for this: http://mypy.readthedocs.io/en/latest/planned_features.html. Let me try Union again :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It returns a weird error for this line if the annotation is changed to Union:

bom_enc, bom = read_bom(html_body_str)

w3lib/encoding.py: note: In function "html_to_unicode":
w3lib/encoding.py:274: error: 'Union[Tuple[builtins.str, builtins.bytes], Tuple[void, void]]' object is not iterable

It looks related to Union handling, not to Tuple[None, None] handling, because it also fails if Tuple[str, str] is used as the second option for Union.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So mypy does not see that all union "kinds" here are iterable, that's a pity.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Relevant mypy issue: python/mypy#1575


>>> import w3lib.encoding
>>> w3lib.encoding.read_bom(b'\xfe\xff\x6c\x34')
('utf-16-be', '\xfe\xff')
('utf-16-be', b'\xfe\xff')
>>> w3lib.encoding.read_bom(b'\xff\xfe\x34\x6c')
('utf-16-le', '\xff\xfe')
('utf-16-le', b'\xff\xfe')
>>> w3lib.encoding.read_bom(b'\x00\x00\xfe\xff\x00\x00\x6c\x34')
('utf-32-be', '\x00\x00\xfe\xff')
('utf-32-be', b'\x00\x00\xfe\xff')
>>> w3lib.encoding.read_bom(b'\xff\xfe\x00\x00\x34\x6c\x00\x00')
('utf-32-le', '\xff\xfe\x00\x00')
('utf-32-le', b'\xff\xfe\x00\x00')
>>> w3lib.encoding.read_bom(b'\x01\x02\x03\x04')
(None, None)
>>>
Expand All @@ -173,18 +192,22 @@ def read_bom(data):

# Python decoder doesn't follow unicode standard when handling
# bad utf-8 encoded strings. see http://bugs.python.org/issue8271
codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.start+1))
codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.start+1)) # type: ignore


def to_unicode(data_str, encoding):
    # type: (bytes, str) -> six.text_type
    """Decode ``data_str`` into unicode text with the given ``encoding``.

    Decoding uses the ``w3lib_replace`` codec error handler, so input that
    cannot be decoded is replaced with ``\\ufffd`` (the unicode replacement
    character).
    """
    decoded = data_str.decode(encoding, 'w3lib_replace')
    return decoded


def html_to_unicode(content_type_header, html_body_str,
default_encoding='utf8', auto_detect_fun=None):
# type: (Optional[str], bytes, str, Optional[Callable[[bytes], str]]) -> Tuple[str, six.text_type]
r'''Convert raw html bytes to unicode

This attempts to make a reasonable guess at the content encoding of the
Expand Down Expand Up @@ -230,7 +253,7 @@ def html_to_unicode(content_type_header, html_body_str,

>>> import w3lib.encoding
>>> w3lib.encoding.html_to_unicode(None,
... """<!DOCTYPE html>
... b"""<!DOCTYPE html>
... <head>
... <meta charset="UTF-8" />
... <meta name="viewport" content="width=device-width" />
Expand All @@ -248,7 +271,8 @@ def html_to_unicode(content_type_header, html_body_str,
'''

enc = http_content_type_encoding(content_type_header)
bom_enc, bom = read_bom(html_body_str)
# FIXME: remove type: ignore when mypy bug is fixed
bom_enc, bom = read_bom(html_body_str) # type: ignore
if enc is not None:
# remove BOM if it agrees with the encoding
if enc == bom_enc:
Expand Down
Loading