4
4
5
5
import re
6
6
from html .entities import name2codepoint
7
- from typing import Match , Sequence , AnyStr
7
+ from typing import Iterable , Match , AnyStr , Optional , Pattern , Tuple , Union
8
8
from urllib .parse import urljoin
9
9
10
10
from w3lib .util import to_unicode
19
19
HTML5_WHITESPACE = ' \t \n \r \x0c '
20
20
21
21
22
- def replace_entities (text : AnyStr , keep : Sequence [str ] = (), remove_illegal : bool = True , encoding : str = 'utf-8' ):
22
+ def replace_entities (text : AnyStr , keep : Iterable [str ] = (), remove_illegal : bool = True , encoding : str = 'utf-8' ) -> str :
23
23
"""Remove entities from the given `text` by converting them to their
24
24
corresponding unicode character.
25
25
@@ -47,7 +47,7 @@ def replace_entities(text: AnyStr, keep: Sequence[str] = (), remove_illegal: boo
47
47
48
48
"""
49
49
50
- def convert_entity (m : Match ):
50
+ def convert_entity (m : Match ) -> str :
51
51
groups = m .groupdict ()
52
52
number = None
53
53
if groups .get ('dec' ):
@@ -80,10 +80,10 @@ def convert_entity(m: Match):
80
80
81
81
return _ent_re .sub (convert_entity , to_unicode (text , encoding ))
82
82
83
def has_entities(text: AnyStr, encoding: Optional[str] = None) -> bool:
    """Return ``True`` if the given text contains any HTML/XML entity."""
    decoded = to_unicode(text, encoding)
    return _ent_re.search(decoded) is not None
85
85
86
- def replace_tags (text , token = '' , encoding = None ):
86
+ def replace_tags (text : AnyStr , token : str = '' , encoding : Optional [ str ] = None ) -> str :
87
87
"""Replace all markup tags found in the given `text` by the given token.
88
88
By default `token` is an empty string so it just removes all tags.
89
89
@@ -107,7 +107,7 @@ def replace_tags(text, token='', encoding=None):
107
107
108
108
109
109
# Matches an HTML comment; the (?:-->|$) alternative also swallows an
# unterminated comment running to the end of the input.
_REMOVECOMMENTS_RE = re .compile ('<!--.*?(?:-->|$)' , re .DOTALL )
110
def remove_comments(text: AnyStr, encoding: Optional[str] = None) -> str:
    """Remove HTML comments (``<!-- ... -->``) from the given text.

    An unterminated comment that runs to the end of the input is
    removed as well.
    """
    as_text = to_unicode(text, encoding)
    return _REMOVECOMMENTS_RE.sub('', as_text)
122
122
123
- def remove_tags (text , which_ones = (), keep = (), encoding = None ):
123
+ def remove_tags (text : AnyStr , which_ones : Iterable [ str ] = (), keep : Iterable [ str ] = (), encoding : Optional [ str ] = None ) -> str :
124
124
""" Remove HTML Tags only.
125
125
126
126
`which_ones` and `keep` are both tuples, there are four cases:
@@ -170,14 +170,14 @@ def remove_tags(text, which_ones=(), keep=(), encoding=None):
170
170
which_ones = {tag .lower () for tag in which_ones }
171
171
keep = {tag .lower () for tag in keep }
172
172
173
- def will_remove (tag ) :
173
+ def will_remove (tag : str ) -> bool :
174
174
tag = tag .lower ()
175
175
if which_ones :
176
176
return tag in which_ones
177
177
else :
178
178
return tag not in keep
179
179
180
- def remove_tag (m ) :
180
+ def remove_tag (m : Match ) -> str :
181
181
tag = m .group (1 )
182
182
return '' if will_remove (tag ) else m .group (0 )
183
183
@@ -186,7 +186,7 @@ def remove_tag(m):
186
186
187
187
return retags .sub (remove_tag , to_unicode (text , encoding ))
188
188
189
def remove_tags_with_content(text: AnyStr, which_ones: Iterable[str] = (), encoding: Optional[str] = None) -> str:
    """Remove tags and their content.

    `which_ones` is a tuple of the tags to remove including everything
    they contain: for each listed tag, the opening tag, its content and
    the closing tag (or a self-closing tag) are dropped from the result.
    """
    result = to_unicode(text, encoding)
    if which_ones:
        pattern = '|'.join(
            r'<%s\b.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones
        )
        tags_re = re.compile(pattern, re.DOTALL | re.IGNORECASE)
        result = tags_re.sub('', result)
    return result
209
209
210
210
211
def replace_escape_chars(text: AnyStr, which_ones: Iterable[str] = ('\n', '\t', '\r'), replace_by: str = '',
                         encoding: Optional[str] = None) -> str:
    """Remove escape characters.

    `which_ones` is a tuple of the escape characters to replace
    (by default: ``\\n``, ``\\t`` and ``\\r``).

    `replace_by` is the string used in place of each of them (an empty
    string by default, which simply removes them).
    """
    utext = to_unicode(text, encoding)
    # The replacement string is the same for every escape character, so
    # decode it once here instead of once per loop iteration.
    replacement = to_unicode(replace_by, encoding)
    for ec in which_ones:
        utext = utext.replace(ec, replacement)
    return utext
227
227
228
- def unquote_markup (text , keep = (), remove_illegal = True , encoding = None ):
228
+ def unquote_markup (text : AnyStr , keep : Iterable [ str ] = (), remove_illegal : bool = True , encoding : Optional [ str ] = None ) -> str :
229
229
"""
230
230
This function receives markup as a text (always a unicode string or
231
231
a UTF-8 encoded string) and does the following:
@@ -237,7 +237,7 @@ def unquote_markup(text, keep=(), remove_illegal=True, encoding=None):
237
237
238
238
"""
239
239
240
- def _get_fragments (txt , pattern ) :
240
+ def _get_fragments (txt : str , pattern : Pattern ) -> Iterable [ Union [ str , Match ]] :
241
241
offset = 0
242
242
for match in pattern .finditer (txt ):
243
243
match_s , match_e = match .span (1 )
@@ -246,9 +246,9 @@ def _get_fragments(txt, pattern):
246
246
offset = match_e
247
247
yield txt [offset :]
248
248
249
- text = to_unicode (text , encoding )
249
+ utext = to_unicode (text , encoding )
250
250
ret_text = ''
251
- for fragment in _get_fragments (text , _cdata_re ):
251
+ for fragment in _get_fragments (utext , _cdata_re ):
252
252
if isinstance (fragment , str ):
253
253
# it's not a CDATA (so we try to remove its entities)
254
254
ret_text += replace_entities (fragment , keep = keep , remove_illegal = remove_illegal )
@@ -257,16 +257,16 @@ def _get_fragments(txt, pattern):
257
257
ret_text += fragment .group ('cdata_d' )
258
258
return ret_text
259
259
260
- def get_base_url (text , baseurl = '' , encoding = 'utf-8' ):
260
+ def get_base_url (text : AnyStr , baseurl : str = '' , encoding : str = 'utf-8' ) -> str :
261
261
"""Return the base url if declared in the given HTML `text`,
262
262
relative to the given base url.
263
263
264
264
If no base url is found, the given `baseurl` is returned.
265
265
266
266
"""
267
267
268
- text = to_unicode (text , encoding )
269
- m = _baseurl_re .search (text )
268
+ utext = to_unicode (text , encoding )
269
+ m = _baseurl_re .search (utext )
270
270
if m :
271
271
return urljoin (
272
272
safe_url_string (baseurl ),
@@ -275,7 +275,7 @@ def get_base_url(text, baseurl='', encoding='utf-8'):
275
275
else :
276
276
return safe_url_string (baseurl )
277
277
278
- def get_meta_refresh (text , baseurl = '' , encoding = 'utf-8' , ignore_tags = ('script' , 'noscript' )):
278
+ def get_meta_refresh (text : AnyStr , baseurl : str = '' , encoding : str = 'utf-8' , ignore_tags : Iterable [ str ] = ('script' , 'noscript' )) -> Tuple [ Optional [ float ], Optional [ str ]] :
279
279
"""Return the http-equiv parameter of the HTML meta element from the given
280
280
HTML text and return a tuple ``(interval, url)`` where interval is an integer
281
281
containing the delay in seconds (or zero if not present) and url is a
@@ -286,13 +286,13 @@ def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script',
286
286
"""
287
287
288
288
try :
289
- text = to_unicode (text , encoding )
289
+ utext = to_unicode (text , encoding )
290
290
except UnicodeDecodeError :
291
291
print (text )
292
292
raise
293
- text = remove_tags_with_content (text , ignore_tags )
294
- text = remove_comments (replace_entities (text ))
295
- m = _meta_refresh_re .search (text )
293
+ utext = remove_tags_with_content (utext , ignore_tags )
294
+ utext = remove_comments (replace_entities (utext ))
295
+ m = _meta_refresh_re .search (utext )
296
296
if m :
297
297
interval = float (m .group ('int' ))
298
298
url = safe_url_string (m .group ('url' ).strip (' "\' ' ), encoding )
@@ -302,7 +302,7 @@ def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script',
302
302
return None , None
303
303
304
304
305
- def strip_html5_whitespace (text ) :
305
+ def strip_html5_whitespace (text : str ) -> str :
306
306
r"""
307
307
Strip all leading and trailing space characters (as defined in
308
308
https://www.w3.org/TR/html5/infrastructure.html#space-character).
0 commit comments