Commit ba6f231

more type hints
1 parent a0c3942 commit ba6f231

4 files changed: +52 −51 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -13,3 +13,4 @@ coverage.xml
 .mypy_cache/
 /index.txt
 .dmypy.json
+.hypothesis/
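
`.hypothesis/` is the local example database written by the Hypothesis property-based testing library, so ignoring it suggests the test suite runs Hypothesis tests. A minimal sketch of the kind of test that creates this directory (the test itself is hypothetical, not from this commit):

# Hypothetical property-based test; running anything like it creates the
# .hypothesis/ example-database directory that this commit ignores.
from hypothesis import given
from hypothesis import strategies as st

from w3lib.html import replace_entities

@given(st.text())
def test_replace_entities_returns_str(text):
    # Matches the new "-> str" annotation on replace_entities.
    assert isinstance(replace_entities(text), str)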

w3lib/html.py

Lines changed: 33 additions & 33 deletions
@@ -4,7 +4,7 @@
 
 import re
 from html.entities import name2codepoint
-from typing import Match, Sequence, AnyStr
+from typing import Iterable, Match, AnyStr, Optional, Pattern, Tuple, Union
 from urllib.parse import urljoin
 
 from w3lib.util import to_unicode
@@ -19,7 +19,7 @@
 HTML5_WHITESPACE = ' \t\n\r\x0c'
 
 
-def replace_entities(text: AnyStr, keep: Sequence[str] = (), remove_illegal: bool = True, encoding: str = 'utf-8'):
+def replace_entities(text: AnyStr, keep: Iterable[str] = (), remove_illegal: bool = True, encoding: str = 'utf-8') -> str:
     """Remove entities from the given `text` by converting them to their
     corresponding unicode character.
 
@@ -47,7 +47,7 @@ def replace_entities(text: AnyStr, keep: Sequence[str] = (), remove_illegal: boo
 
     """
 
-    def convert_entity(m: Match):
+    def convert_entity(m: Match) -> str:
         groups = m.groupdict()
         number = None
         if groups.get('dec'):
@@ -80,10 +80,10 @@ def convert_entity(m: Match):
 
     return _ent_re.sub(convert_entity, to_unicode(text, encoding))
 
-def has_entities(text: AnyStr, encoding=None):
+def has_entities(text: AnyStr, encoding: Optional[str] = None) -> bool:
     return bool(_ent_re.search(to_unicode(text, encoding)))
 
-def replace_tags(text, token='', encoding=None):
+def replace_tags(text: AnyStr, token: str = '', encoding: Optional[str] = None) -> str:
     """Replace all markup tags found in the given `text` by the given token.
     By default `token` is an empty string so it just removes all tags.
 
@@ -107,7 +107,7 @@ def replace_tags(text, token='', encoding=None):
 
 
 _REMOVECOMMENTS_RE = re.compile('<!--.*?(?:-->|$)', re.DOTALL)
-def remove_comments(text, encoding=None):
+def remove_comments(text: AnyStr, encoding: Optional[str] = None) -> str:
     """ Remove HTML Comments.
 
     >>> import w3lib.html
@@ -117,10 +117,10 @@ def remove_comments(text, encoding=None):
 
     """
 
-    text = to_unicode(text, encoding)
-    return _REMOVECOMMENTS_RE.sub('', text)
+    utext = to_unicode(text, encoding)
+    return _REMOVECOMMENTS_RE.sub('', utext)
 
-def remove_tags(text, which_ones=(), keep=(), encoding=None):
+def remove_tags(text: AnyStr, which_ones: Iterable[str] = (), keep: Iterable[str] = (), encoding: Optional[str] = None) -> str:
     """ Remove HTML Tags only.
 
     `which_ones` and `keep` are both tuples, there are four cases:
@@ -170,14 +170,14 @@ def remove_tags(text, which_ones=(), keep=(), encoding=None):
     which_ones = {tag.lower() for tag in which_ones}
     keep = {tag.lower() for tag in keep}
 
-    def will_remove(tag):
+    def will_remove(tag: str) -> bool:
         tag = tag.lower()
         if which_ones:
             return tag in which_ones
         else:
             return tag not in keep
 
-    def remove_tag(m):
+    def remove_tag(m: Match) -> str:
         tag = m.group(1)
         return '' if will_remove(tag) else m.group(0)
 
@@ -186,7 +186,7 @@ def remove_tag(m):
 
     return retags.sub(remove_tag, to_unicode(text, encoding))
 
-def remove_tags_with_content(text, which_ones=(), encoding=None):
+def remove_tags_with_content(text: AnyStr, which_ones: Iterable[str] = (), encoding: Optional[str] = None) -> str:
     """Remove tags and their content.
 
     `which_ones` is a tuple of which tags to remove including their content.
@@ -200,16 +200,16 @@ def remove_tags_with_content(text, which_ones=(), encoding=None):
 
     """
 
-    text = to_unicode(text, encoding)
+    utext = to_unicode(text, encoding)
     if which_ones:
         tags = '|'.join([r'<%s\b.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
         retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
-        text = retags.sub('', text)
-    return text
+        utext = retags.sub('', utext)
+    return utext
 
 
-def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by='', \
-        encoding=None):
+def replace_escape_chars(text: AnyStr, which_ones: Iterable[str] = ('\n', '\t', '\r'), replace_by: str = '', \
+        encoding: Optional[str] = None) -> str:
     """Remove escape characters.
 
     `which_ones` is a tuple of which escape characters we want to remove.
@@ -220,12 +220,12 @@ def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by='', \
 
     """
 
-    text = to_unicode(text, encoding)
+    utext = to_unicode(text, encoding)
     for ec in which_ones:
-        text = text.replace(ec, to_unicode(replace_by, encoding))
-    return text
+        utext = utext.replace(ec, to_unicode(replace_by, encoding))
+    return utext
 
-def unquote_markup(text, keep=(), remove_illegal=True, encoding=None):
+def unquote_markup(text: AnyStr, keep: Iterable[str] = (), remove_illegal: bool = True, encoding: Optional[str] = None) -> str:
     """
     This function receives markup as a text (always a unicode string or
     a UTF-8 encoded string) and does the following:
@@ -237,7 +237,7 @@ def unquote_markup(text, keep=(), remove_illegal=True, encoding=None):
 
     """
 
-    def _get_fragments(txt, pattern):
+    def _get_fragments(txt: str, pattern: Pattern) -> Iterable[Union[str, Match]]:
         offset = 0
         for match in pattern.finditer(txt):
             match_s, match_e = match.span(1)
@@ -246,9 +246,9 @@ def _get_fragments(txt, pattern):
             offset = match_e
         yield txt[offset:]
 
-    text = to_unicode(text, encoding)
+    utext = to_unicode(text, encoding)
     ret_text = ''
-    for fragment in _get_fragments(text, _cdata_re):
+    for fragment in _get_fragments(utext, _cdata_re):
         if isinstance(fragment, str):
             # it's not a CDATA (so we try to remove its entities)
             ret_text += replace_entities(fragment, keep=keep, remove_illegal=remove_illegal)
@@ -257,16 +257,16 @@ def _get_fragments(txt, pattern):
             ret_text += fragment.group('cdata_d')
     return ret_text
 
-def get_base_url(text, baseurl='', encoding='utf-8'):
+def get_base_url(text: AnyStr, baseurl: str = '', encoding: str = 'utf-8') -> str:
     """Return the base url if declared in the given HTML `text`,
     relative to the given base url.
 
     If no base url is found, the given `baseurl` is returned.
 
     """
 
-    text = to_unicode(text, encoding)
-    m = _baseurl_re.search(text)
+    utext = to_unicode(text, encoding)
+    m = _baseurl_re.search(utext)
     if m:
         return urljoin(
             safe_url_string(baseurl),
@@ -275,7 +275,7 @@ def get_base_url(text, baseurl='', encoding='utf-8'):
     else:
         return safe_url_string(baseurl)
 
-def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')):
+def get_meta_refresh(text: AnyStr, baseurl: str = '', encoding: str = 'utf-8', ignore_tags: Iterable[str] = ('script', 'noscript')) -> Tuple[Optional[float], Optional[str]]:
     """Return the http-equiv parameter of the HTML meta element from the given
     HTML text and return a tuple ``(interval, url)`` where interval is an integer
     containing the delay in seconds (or zero if not present) and url is a
@@ -286,13 +286,13 @@ def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script',
     """
 
     try:
-        text = to_unicode(text, encoding)
+        utext = to_unicode(text, encoding)
     except UnicodeDecodeError:
         print(text)
         raise
-    text = remove_tags_with_content(text, ignore_tags)
-    text = remove_comments(replace_entities(text))
-    m = _meta_refresh_re.search(text)
+    utext = remove_tags_with_content(utext, ignore_tags)
+    utext = remove_comments(replace_entities(utext))
+    m = _meta_refresh_re.search(utext)
     if m:
         interval = float(m.group('int'))
         url = safe_url_string(m.group('url').strip(' "\''), encoding)
@@ -302,7 +302,7 @@ def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script',
         return None, None
 
 
-def strip_html5_whitespace(text):
+def strip_html5_whitespace(text: str) -> str:
     r"""
     Strip all leading and trailing space characters (as defined in
     https://www.w3.org/TR/html5/infrastructure.html#space-character).
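
A pattern running through this file: functions that used to reassign the decoded result back onto their `text` parameter now bind it to a fresh `utext` name. Presumably this is for mypy's benefit: a parameter annotated `AnyStr` keeps that (possibly `bytes`) type for the whole function body, so assigning the `str` returned by `to_unicode` back to `text` fails type checking, while a fresh name gets a clean `str` type. A minimal standalone sketch (the `shout` function and this simplified `to_unicode` are illustrative, not w3lib code):

from typing import AnyStr, Optional

def to_unicode(text: AnyStr, encoding: Optional[str] = None) -> str:
    # Stand-in for w3lib.util.to_unicode: decode bytes, pass str through.
    if isinstance(text, bytes):
        return text.decode(encoding or 'utf-8')
    return text

def shout(text: AnyStr) -> str:
    # Reassigning the parameter would fail type checking:
    #   text = to_unicode(text)
    #   error: incompatible types in assignment
    #          (expression has type "str", variable has type "AnyStr")
    utext = to_unicode(text)
    return utext.upper()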

w3lib/http.py

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 from base64 import urlsafe_b64encode
 from typing import Any, List, MutableMapping, Optional, AnyStr, Sequence, Union, Mapping
-from w3lib.util import to_bytes
+from w3lib.util import to_bytes, to_native_str
 
 HeadersDictInput = Mapping[bytes, Union[Any, Sequence]]
 HeadersDictOutput = MutableMapping[bytes, List[bytes]]
@@ -95,7 +95,7 @@ def basic_auth_header(username: AnyStr, password: AnyStr, encoding: str = 'ISO-8
 
     """
 
-    auth = "%r:%r" % (username, password)
+    auth = "%s:%s" % (to_native_str(username), to_native_str(password))
     # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1
     # seems to be the most widely used encoding here. See also:
    # http://greenbytes.de/tech/webdav/draft-ietf-httpauth-basicauth-enc-latest.html
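
Beyond annotations, this hunk carries an actual behaviour fix: `"%r:%r"` interpolates `repr()`, so the credentials were wrapped in quotes (and prefixed with `b` for bytes input) before being base64-encoded, yielding a broken Authorization header. A standalone illustration of the difference (not w3lib code):

from base64 import urlsafe_b64encode

username, password = b'user', b'pass'

# Old behaviour: %r interpolates repr(), so bytes leak their b'...' quoting.
broken = "%r:%r" % (username, password)                   # "b'user':b'pass'"
# Fixed behaviour: decode first, then join the actual text.
fixed = "%s:%s" % (username.decode(), password.decode())  # "user:pass"

print(urlsafe_b64encode(broken.encode('ISO-8859-1')))  # mangled credentials
print(urlsafe_b64encode(fixed.encode('ISO-8859-1')))   # b'dXNlcjpwYXNz'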

w3lib/url.py

Lines changed: 16 additions & 16 deletions
@@ -9,7 +9,7 @@
 import re
 import string
 from collections import namedtuple
-from typing import Callable, Optional, Sequence, Tuple, Union, cast, Dict
+from typing import Callable, List, Optional, Sequence, Tuple, Union, cast, Dict
 from urllib.parse import (
     parse_qs,
     parse_qsl,
@@ -25,7 +25,7 @@
 )
 from urllib.parse import _coerce_args  # type: ignore
 from urllib.request import pathname2url, url2pathname
-from w3lib.util import to_bytes, to_native_str, to_unicode
+from w3lib.util import to_unicode
 from w3lib._types import AnyUnicodeError, StrOrBytes
 
 
@@ -54,10 +54,10 @@ def safe_url_string(url: StrOrBytes, encoding: str = 'utf8', path_encoding: str
     as per https://url.spec.whatwg.org/#url-parsing.
 
     If a bytes URL is given, it is first converted to `str` using the given
-    encoding (which defaults to 'utf-8'). If quote_path is True (default),
+    encoding (which defaults to 'utf-8'). If quote_path is True (default),
     path_encoding ('utf-8' by default) is used to encode URL path component
     which is then quoted. Otherwise, if quote_path is False, path component
-    is not encoded or quoted. Given encoding is used for query string
+    is not encoded or quoted. Given encoding is used for query string
     or form data.
 
     When passing an encoding, you should use the encoding of the
@@ -80,7 +80,7 @@ def safe_url_string(url: StrOrBytes, encoding: str = 'utf8', path_encoding: str
     # IDNA encoding can fail for too long labels (>63 characters)
     # or missing labels (e.g. http://.example.com)
     try:
-        netloc = parts.netloc.encode('idna').decode()
+        netloc = parts.netloc.encode('idna')
     except UnicodeError:
         netloc = parts.netloc.encode('utf-8')
 
@@ -89,10 +89,10 @@ def safe_url_string(url: StrOrBytes, encoding: str = 'utf8', path_encoding: str
         path = quote(parts.path.encode(path_encoding), _path_safe_chars)
     else:
         path = parts.path
-
+
     return urlunsplit((
         parts.scheme,
-        netloc.rstrip(':'),
+        netloc.decode().rstrip(':'),
         path,
         quote(parts.query.encode(encoding), _safe_chars),
         quote(parts.fragment.encode(encoding), _safe_chars),
@@ -411,7 +411,7 @@ def parse_data_uri(uri: StrOrBytes) -> _ParseDataURIResult:
 ]
 
 
-def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
+def _safe_ParseResult(parts: ParseResult, encoding: str = 'utf8', path_encoding: str = 'utf8') -> Tuple[str, str, str, str, str, str]:
     # IDNA encoding can fail for too long labels (>63 characters)
     # or missing labels (e.g. http://.example.com)
     try:
@@ -429,8 +429,8 @@ def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
     )
 
 
-def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
-                     encoding=None):
+def canonicalize_url(url: StrOrBytes, keep_blank_values: bool = True, keep_fragments: bool = False,
+                     encoding: Optional[str] = None) -> str:
     r"""Canonicalize the given url by applying the following procedures:
 
     - sort query arguments, first by key, then by value
@@ -519,7 +519,7 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
                        fragment))
 
 
-def _unquotepath(path):
+def _unquotepath(path: str) -> bytes:
     for reserved in ('2f', '2F', '3f', '3F'):
         path = path.replace('%' + reserved, '%25' + reserved.upper())
 
@@ -531,7 +531,7 @@ def _unquotepath(path):
     return unquote_to_bytes(path)
 
 
-def parse_url(url, encoding=None):
+def parse_url(url: Union[StrOrBytes, ParseResult], encoding: Optional[str] = None) -> ParseResult:
     """Return urlparsed url from the given argument (which could be an already
     parsed url)
     """
@@ -540,7 +540,7 @@ def parse_url(url, encoding=None):
     return urlparse(to_unicode(url, encoding))
 
 
-def parse_qsl_to_bytes(qs, keep_blank_values=False):
+def parse_qsl_to_bytes(qs: str, keep_blank_values: bool = False) -> List[Tuple[bytes, bytes]]:
     """Parse a query given as a string argument.
 
     Data are returned as a list of name, value pairs as bytes.
@@ -575,11 +575,11 @@ def parse_qsl_to_bytes(qs, keep_blank_values=False):
         else:
             continue
         if len(nv[1]) or keep_blank_values:
-            name = nv[0].replace('+', ' ')
+            name: StrOrBytes = nv[0].replace('+', ' ')
             name = unquote_to_bytes(name)
             name = _coerce_result(name)
-            value = nv[1].replace('+', ' ')
+            value: StrOrBytes = nv[1].replace('+', ' ')
             value = unquote_to_bytes(value)
             value = _coerce_result(value)
-            r.append((name, value))
+            r.append((cast(bytes, name), cast(bytes, value)))
     return r
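
The `parse_qsl_to_bytes` hunk shows two idioms for a variable whose type changes mid-function: declare the union (`StrOrBytes`) at the first assignment, since mypy fixes a variable's type where it is first bound, and use `cast()` where the code knows the narrower type. A reduced sketch of the same pattern (the `decode_component` helper is hypothetical, not part of w3lib):

from typing import Union, cast
from urllib.parse import unquote_to_bytes

StrOrBytes = Union[str, bytes]

def decode_component(raw: str) -> bytes:
    # Declared as a union because the variable holds a str first...
    value: StrOrBytes = raw.replace('+', ' ')
    # ...and bytes afterwards.
    value = unquote_to_bytes(value)
    # cast() records what we know at this point without a runtime check.
    return cast(bytes, value)

print(decode_component('a%20b+c'))  # b'a b c'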
