-
Notifications
You must be signed in to change notification settings - Fork 107
Type hints #61
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Type hints #61
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Which string type to use? | ||
========================= | ||
|
||
1. Variable is a URL ==> use ``str`` | ||
2. Variable is binary; unicode is not accepted ==> use ``bytes`` | ||
3. Variable is text, and it can be only unicode in Python 2 ==> use | ||
``six.text_type`` (or typing.Text??) | ||
4. Variable is text, but it can be ascii or utf8-encoded str | ||
in Python 2 ==> use w3lib._types.String | ||
5. Variable can be either bytes or unicode both in Python 2 | ||
and Python 3 ==> use typing.AnyStr | ||
6. Variable should be str (==bytes) in Python 2 | ||
and str (==unicode) in Python 3 ==> use ``str``. | ||
|
||
""" | ||
from __future__ import absolute_import | ||
from typing import Union | ||
import six | ||
|
||
if six.PY2: | ||
String = Union[bytes, unicode] | ||
else: | ||
String = str |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,11 +2,20 @@ | |
""" | ||
Functions for handling encoding of web pages | ||
""" | ||
import re, codecs, encodings | ||
import re | ||
import codecs | ||
import encodings # type: ignore | ||
from typing import Optional, AnyStr, Tuple, Callable, Union | ||
import six | ||
|
||
from .util import to_native_str | ||
|
||
|
||
_HEADER_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I) | ||
|
||
|
||
def http_content_type_encoding(content_type): | ||
# type: (str) -> Optional[str] | ||
"""Extract the encoding in the content-type header | ||
|
||
>>> import w3lib.encoding | ||
|
@@ -20,6 +29,7 @@ def http_content_type_encoding(content_type): | |
if match: | ||
return resolve_encoding(match.group(1)) | ||
|
||
|
||
# regexp for parsing HTTP meta tags | ||
_TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?''' | ||
_SKIP_ATTRS = '''(?x)(?:\\s+ | ||
|
@@ -39,12 +49,14 @@ def http_content_type_encoding(content_type): | |
_XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)') | ||
|
||
# check for meta tags, or xml decl. and stop search if a body tag is encountered | ||
_BODY_ENCODING_PATTERN = r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % ( | ||
_BODY_ENCODING_PATTERN = six.u(r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)') % ( | ||
_SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE) | ||
_BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I) | ||
_BODY_ENCODING_BYTES_RE = re.compile(_BODY_ENCODING_PATTERN.encode('ascii'), re.I) | ||
|
||
|
||
def html_body_declared_encoding(html_body_str): | ||
# type: (AnyStr) -> Optional[str] | ||
'''Return the encoding specified in meta tags in the html body, | ||
or ``None`` if no suitable encoding was found | ||
|
||
|
@@ -77,7 +89,8 @@ def html_body_declared_encoding(html_body_str): | |
encoding = match.group('charset') or match.group('charset2') \ | ||
or match.group('xmlcharset') | ||
if encoding: | ||
return resolve_encoding(encoding) | ||
return resolve_encoding(to_native_str(encoding)) | ||
|
||
|
||
# Default encoding translation | ||
# this maps canonicalized encodings to target encodings | ||
|
@@ -107,6 +120,7 @@ def html_body_declared_encoding(html_body_str): | |
} | ||
|
||
def _c18n_encoding(encoding): | ||
# type: (AnyStr) -> str | ||
"""Canonicalize an encoding name | ||
|
||
This performs normalization and translates aliases using python's | ||
|
@@ -115,7 +129,9 @@ def _c18n_encoding(encoding): | |
normed = encodings.normalize_encoding(encoding).lower() | ||
return encodings.aliases.aliases.get(normed, normed) | ||
|
||
|
||
def resolve_encoding(encoding_alias): | ||
# type: (AnyStr) -> Optional[str] | ||
"""Return the encoding that `encoding_alias` maps to, or ``None`` | ||
if the encoding cannot be interpreted | ||
|
||
|
@@ -134,6 +150,7 @@ def resolve_encoding(encoding_alias): | |
except LookupError: | ||
return None | ||
|
||
|
||
_BOM_TABLE = [ | ||
(codecs.BOM_UTF32_BE, 'utf-32-be'), | ||
(codecs.BOM_UTF32_LE, 'utf-32-le'), | ||
|
@@ -143,21 +160,23 @@ def resolve_encoding(encoding_alias): | |
] | ||
_FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE) | ||
|
||
|
||
def read_bom(data): | ||
# type: (bytes) -> Union[Tuple[str, bytes], Tuple[None, None]] | ||
r"""Read the byte order mark in the text, if present, and | ||
return the encoding represented by the BOM and the BOM. | ||
|
||
If no BOM can be detected, ``(None, None)`` is returned. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. A nitpick - this is not reflected by the type There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah; currently mypy assumes that None is a valid value for any type, so There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, didn't know that, thanks! mypy behavior makes sense for tuple, although not that I would expect it in general :) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. They have docs for this: http://mypy.readthedocs.io/en/latest/planned_features.html. Let me try Union again :) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. it returns a weird error for this line if annotation is changed to Union:
It looks related to Union handling, not to Tuple[None, None] handling because it also fails if Tuple[str, str] is used as a second option for Union. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So mypy does not see that all union "kinds" here are iterable, that's a pity. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. relevant mypy issue: python/mypy#1575 |
||
|
||
>>> import w3lib.encoding | ||
>>> w3lib.encoding.read_bom(b'\xfe\xff\x6c\x34') | ||
('utf-16-be', '\xfe\xff') | ||
('utf-16-be', b'\xfe\xff') | ||
>>> w3lib.encoding.read_bom(b'\xff\xfe\x34\x6c') | ||
('utf-16-le', '\xff\xfe') | ||
('utf-16-le', b'\xff\xfe') | ||
>>> w3lib.encoding.read_bom(b'\x00\x00\xfe\xff\x00\x00\x6c\x34') | ||
('utf-32-be', '\x00\x00\xfe\xff') | ||
('utf-32-be', b'\x00\x00\xfe\xff') | ||
>>> w3lib.encoding.read_bom(b'\xff\xfe\x00\x00\x34\x6c\x00\x00') | ||
('utf-32-le', '\xff\xfe\x00\x00') | ||
('utf-32-le', b'\xff\xfe\x00\x00') | ||
>>> w3lib.encoding.read_bom(b'\x01\x02\x03\x04') | ||
(None, None) | ||
>>> | ||
|
@@ -173,18 +192,22 @@ def read_bom(data): | |
|
||
# Python decoder doesn't follow unicode standard when handling | ||
# bad utf-8 encoded strings. see http://bugs.python.org/issue8271 | ||
codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.start+1)) | ||
codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.start+1)) # type: ignore | ||
|
||
|
||
def to_unicode(data_str, encoding): | ||
# type: (bytes, str) -> six.text_type | ||
"""Convert a str object to unicode using the encoding given | ||
|
||
Characters that cannot be converted will be converted to ``\\ufffd`` (the | ||
unicode replacement character). | ||
""" | ||
return data_str.decode(encoding, 'w3lib_replace') | ||
|
||
|
||
def html_to_unicode(content_type_header, html_body_str, | ||
default_encoding='utf8', auto_detect_fun=None): | ||
# type: (Optional[str], bytes, str, Optional[Callable[[bytes], str]]) -> Tuple[str, six.text_type] | ||
r'''Convert raw html bytes to unicode | ||
|
||
This attempts to make a reasonable guess at the content encoding of the | ||
|
@@ -230,7 +253,7 @@ def html_to_unicode(content_type_header, html_body_str, | |
|
||
>>> import w3lib.encoding | ||
>>> w3lib.encoding.html_to_unicode(None, | ||
... """<!DOCTYPE html> | ||
... b"""<!DOCTYPE html> | ||
... <head> | ||
... <meta charset="UTF-8" /> | ||
... <meta name="viewport" content="width=device-width" /> | ||
|
@@ -248,7 +271,8 @@ def html_to_unicode(content_type_header, html_body_str, | |
''' | ||
|
||
enc = http_content_type_encoding(content_type_header) | ||
bom_enc, bom = read_bom(html_body_str) | ||
# FIXME: remove type: ignore when mypy bug is fixed | ||
bom_enc, bom = read_bom(html_body_str) # type: ignore | ||
if enc is not None: | ||
# remove BOM if it agrees with the encoding | ||
if enc == bom_enc: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
❤️