Skip to content

Type hints based on #61 #123

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ dist
docs/_build
_trial_temp
.coverage
.cache
.cache
.mypy_cache/
4 changes: 4 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ matrix:
sudo: true
- python: 3.5
env: TOXENV=pypy3
- python: 3.6
env: TOXENV=mypy2
- python: 3.6
env: TOXENV=mypy3

install:
- |
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,5 @@
'Programming Language :: Python :: Implementation :: PyPy',
'Topic :: Internet :: WWW/HTTP',
],
install_requires=['six >= 1.4.1'],
install_requires=['six >= 1.4.1', 'typing'],
)
9 changes: 9 additions & 0 deletions tests/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from w3lib.encoding import (html_body_declared_encoding, read_bom, to_unicode,
http_content_type_encoding, resolve_encoding, html_to_unicode)


class RequestEncodingTests(unittest.TestCase):
utf8_fragments = [
# Content-Type as meta http-equiv
Expand Down Expand Up @@ -51,6 +52,7 @@ def test_html_body_declared_encoding(self):
for fragment in self.utf8_fragments:
encoding = html_body_declared_encoding(fragment)
self.assertEqual(encoding, 'utf-8', fragment)

self.assertEqual(None, html_body_declared_encoding(b"something else"))
self.assertEqual(None, html_body_declared_encoding(b"""
<head></head><body>
Expand All @@ -76,6 +78,11 @@ def test_html_body_declared_encoding_unicode(self):
self.assertEqual(None, html_body_declared_encoding(
u"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""))

def test_html_body_declared_encoding_aliases(self):
fragment = b"""<meta http-equiv="content-type" content="text/html;charset=win-1251"/>"""
self.assertEqual("cp1251", html_body_declared_encoding(fragment))
self.assertEqual("cp1251", html_body_declared_encoding(fragment.decode('utf8')))


class CodecsEncodingTestCase(unittest.TestCase):
def test_resolve_encoding(self):
Expand All @@ -97,9 +104,11 @@ def test_invalid_utf8(self):
def ct(charset):
    """Build a ``Content-Type`` header line declaring ``charset``.

    Returns ``None`` when ``charset`` is falsy (``None`` or an empty
    string), so callers can pass the result where a missing header is
    expected.
    """
    # Guard first: the original one-liner relied on the conditional
    # expression binding looser than ``+``, which is easy to misread.
    if not charset:
        return None
    return "Content-Type: text/html; charset=" + charset


def norm_encoding(enc):
    """Return the name Python's codec registry reports for ``enc``.

    ``codecs.lookup`` raises ``LookupError`` if ``enc`` is not a known
    encoding; that exception is not caught here.
    """
    codec_info = codecs.lookup(enc)
    return codec_info.name


class HtmlConversionTests(unittest.TestCase):

def test_unicode_body(self):
Expand Down
19 changes: 18 additions & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# and then run "tox" from this directory.

[tox]
envlist = py27, pypy, py34, py35, py36, py37, pypy3
envlist = py27, pypy, py34, py35, py36, py37, pypy3, mypy2, mypy3

[testenv]
deps =
Expand All @@ -15,3 +15,20 @@ commands =
--doctest-modules \
--cov=w3lib --cov-report=term \
{posargs:w3lib tests}


[testenv:mypy2]
basepython = python3.6
deps =
mypy
typing
commands =
mypy --py2 w3lib tests


[testenv:mypy3]
basepython = python3.6
deps =
mypy
commands =
mypy w3lib tests
25 changes: 25 additions & 0 deletions w3lib/_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
"""
Which string type to use?
=========================

1. Variable is a URL ==> use ``str``
2. Variable is binary; unicode is not accepted ==> use ``bytes``
3. Variable is text, and it can be only unicode in Python 2 ==> use
``six.text_type`` (or typing.Text??)
4. Variable is text, but it can be ascii or utf8-encoded str
in Python 2 ==> use w3lib._types.String
5. Variable can be either bytes or unicode both in Python 2
and Python 3 ==> use typing.AnyStr
6. Variable should be str (==bytes) in Python 2
and str (==unicode) in Python 3 ==> use ``str``.

"""
from __future__ import absolute_import
from typing import Union
import six

if six.PY2:
String = Union[bytes, unicode]
else:
String = str
49 changes: 37 additions & 12 deletions w3lib/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,20 @@
"""
Functions for handling encoding of web pages
"""
import re, codecs, encodings
import re
import codecs
import encodings # type: ignore
from sys import version_info
from typing import Optional, AnyStr, Tuple, Callable, Union, cast
import six

from .util import to_native_str

_HEADER_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)


def http_content_type_encoding(content_type):
# type: (Optional[str]) -> Optional[str]
"""Extract the encoding in the content-type header

>>> import w3lib.encoding
Expand All @@ -20,6 +28,8 @@ def http_content_type_encoding(content_type):
match = _HEADER_ENCODING_RE.search(content_type)
if match:
return resolve_encoding(match.group(1))
return None


# regexp for parsing HTTP meta tags
_TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?'''
Expand All @@ -40,13 +50,15 @@ def http_content_type_encoding(content_type):
_XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)')

# check for meta tags, or xml decl. and stop search if a body tag is encountered
_BODY_ENCODING_PATTERN = r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % (
_BODY_ENCODING_PATTERN = six.u(r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)') % (
_SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
_BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I | re.VERBOSE)
_BODY_ENCODING_BYTES_RE = re.compile(_BODY_ENCODING_PATTERN.encode('ascii'),
re.I | re.VERBOSE)


def html_body_declared_encoding(html_body_str):
# type: (AnyStr) -> Optional[str]
'''Return the encoding specified in meta tags in the html body,
or ``None`` if no suitable encoding was found

Expand All @@ -73,13 +85,15 @@ def html_body_declared_encoding(html_body_str):
if isinstance(chunk, bytes):
match = _BODY_ENCODING_BYTES_RE.search(chunk)
else:
match = _BODY_ENCODING_STR_RE.search(chunk)
match = _BODY_ENCODING_STR_RE.search(chunk) # type: ignore

if match:
encoding = match.group('charset') or match.group('charset2') \
or match.group('xmlcharset')
if encoding:
return resolve_encoding(encoding)
return resolve_encoding(to_native_str(encoding))
return None


# Default encoding translation
# this maps canonicalized encodings to target encodings
Expand Down Expand Up @@ -109,15 +123,18 @@ def html_body_declared_encoding(html_body_str):
}

def _c18n_encoding(encoding):
# type: (AnyStr) -> str
"""Canonicalize an encoding name

This performs normalization and translates aliases using python's
encoding aliases
"""
normed = encodings.normalize_encoding(encoding).lower()
return encodings.aliases.aliases.get(normed, normed)
normed = encodings.normalize_encoding(encoding).lower() # type: ignore
return encodings.aliases.aliases.get(normed, normed) # type: ignore


def resolve_encoding(encoding_alias):
# type: (AnyStr) -> Optional[str]
"""Return the encoding that `encoding_alias` maps to, or ``None``
if the encoding cannot be interpreted

Expand All @@ -136,6 +153,7 @@ def resolve_encoding(encoding_alias):
except LookupError:
return None


_BOM_TABLE = [
(codecs.BOM_UTF32_BE, 'utf-32-be'),
(codecs.BOM_UTF32_LE, 'utf-32-le'),
Expand All @@ -145,21 +163,23 @@ def resolve_encoding(encoding_alias):
]
_FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE)


def read_bom(data):
# type: (bytes) -> Union[Tuple[str, bytes], Tuple[None, None]]
r"""Read the byte order mark in the text, if present, and
return the encoding represented by the BOM and the BOM.

If no BOM can be detected, ``(None, None)`` is returned.

>>> import w3lib.encoding
>>> w3lib.encoding.read_bom(b'\xfe\xff\x6c\x34')
('utf-16-be', '\xfe\xff')
('utf-16-be', b'\xfe\xff')
>>> w3lib.encoding.read_bom(b'\xff\xfe\x34\x6c')
('utf-16-le', '\xff\xfe')
('utf-16-le', b'\xff\xfe')
>>> w3lib.encoding.read_bom(b'\x00\x00\xfe\xff\x00\x00\x6c\x34')
('utf-32-be', '\x00\x00\xfe\xff')
('utf-32-be', b'\x00\x00\xfe\xff')
>>> w3lib.encoding.read_bom(b'\xff\xfe\x00\x00\x34\x6c\x00\x00')
('utf-32-le', '\xff\xfe\x00\x00')
('utf-32-le', b'\xff\xfe\x00\x00')
>>> w3lib.encoding.read_bom(b'\x01\x02\x03\x04')
(None, None)
>>>
Expand All @@ -175,18 +195,21 @@ def read_bom(data):

# Python decoder doesn't follow unicode standard when handling
# bad utf-8 encoded strings. see http://bugs.python.org/issue8271
codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.end))
codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.end)) # type: ignore

def to_unicode(data_str, encoding):
    # type: (bytes, str) -> six.text_type
    """Decode the byte string ``data_str`` to unicode using ``encoding``.

    Characters that cannot be converted will be converted to ``\\ufffd`` (the
    unicode replacement character).
    """
    # From Python 3.3 on the built-in 'replace' handler is used; older
    # interpreters use the 'w3lib_replace' handler registered with
    # ``codecs.register_error`` at module level.
    if version_info[0:2] >= (3, 3):
        error_handler = 'replace'
    else:
        error_handler = 'w3lib_replace'
    return data_str.decode(encoding, error_handler)


def html_to_unicode(content_type_header, html_body_str,
default_encoding='utf8', auto_detect_fun=None):
# type: (Optional[str], bytes, str, Optional[Callable[[bytes], str]]) -> Tuple[str, six.text_type]
r'''Convert raw html bytes to unicode

This attempts to make a reasonable guess at the content encoding of the
Expand Down Expand Up @@ -250,7 +273,9 @@ def html_to_unicode(content_type_header, html_body_str,
'''

enc = http_content_type_encoding(content_type_header)
bom_enc, bom = read_bom(html_body_str)
# FIXME: remove type: ignore when mypy bug is fixed
bom_enc, bom = read_bom(html_body_str) # type: ignore
bom = cast(bytes, bom)
if enc is not None:
# remove BOM if it agrees with the encoding
if enc == bom_enc:
Expand Down
Loading