From 4537ff58f38b9525a124bf6a381e425bcdf1a439 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 23 Nov 2022 13:06:36 +0100 Subject: [PATCH 1/3] Make safe_url_string safer --- tests/test_url.py | 412 +++++++++++++++++++++++++++++++++++++++++++++- tox.ini | 4 +- w3lib/_infra.py | 13 ++ w3lib/_url.py | 12 ++ w3lib/url.py | 42 ++++- 5 files changed, 464 insertions(+), 19 deletions(-) create mode 100644 w3lib/_infra.py create mode 100644 w3lib/_url.py diff --git a/tests/test_url.py b/tests/test_url.py index 2356c2c3..261406d5 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -4,6 +4,13 @@ import pytest +from w3lib._infra import ( + _ASCII_ALPHA, + _ASCII_ALPHANUMERIC, + _ASCII_TAB_OR_NEWLINE, + _C0_CONTROL_OR_SPACE, +) +from w3lib._url import _SPECIAL_SCHEMES from w3lib.url import ( add_or_replace_parameter, add_or_replace_parameters, @@ -21,6 +28,403 @@ ) +UNSET = object() + +# Test cases for URL-to-safe-URL conversions with a URL and an encoding as +# input parameters. +# +# (encoding, input URL, output URL or exception) +SAFE_URL_ENCODING_CASES = ( + (UNSET, "", ValueError), + (UNSET, "https://example.com", "https://example.com"), + (UNSET, "https://example.com/©", "https://example.com/%C2%A9"), + # Paths are always UTF-8-encoded. + ("iso-8859-1", "https://example.com/©", "https://example.com/%C2%A9"), + # Queries are UTF-8-encoded if the scheme is not special, ws or wss. + ("iso-8859-1", "a://example.com?©", "a://example.com?%C2%A9"), + *( + ("iso-8859-1", f"{scheme}://example.com?©", f"{scheme}://example.com?%C2%A9") + for scheme in ("ws", "wss") + ), + *( + ("iso-8859-1", f"{scheme}://example.com?©", f"{scheme}://example.com?%A9") + for scheme in _SPECIAL_SCHEMES + if scheme not in {"ws", "wss"} + ), + # Fragments are always UTF-8-encoded. 
+ ("iso-8859-1", "https://example.com#©", "https://example.com#%C2%A9"), +) + +INVALID_SCHEME_FOLLOW_UPS = "".join( + chr(value) + for value in range(0x81) + if ( + chr(value) not in _ASCII_ALPHANUMERIC + and chr(value) not in "+-." + and chr(value) not in _C0_CONTROL_OR_SPACE # stripped + and chr(value) != ":" # separator + ) +) + +SAFE_URL_URL_INVALID_SCHEME_CASES = tuple( + (f"{scheme}://example.com", ValueError) + for scheme in ( + # A scheme is required. + "", + # The first scheme letter must be an ASCII alpha. + # Note: 0x80 is included below to also test a non-ASCII example. + *( + chr(value) + for value in range(0x81) + if ( + chr(value) not in _ASCII_ALPHA + and chr(value) not in _C0_CONTROL_OR_SPACE # stripped + and chr(value) != ":" # separator + ) + ), + # The follow-up scheme letters can also be ASCII numbers, plus, hyphen, + # or period. + f"a{INVALID_SCHEME_FOLLOW_UPS}", + ) +) + +# Remove any leading and trailing C0 control or space from input. +SAFE_URL_URL_STRIP_CASES = tuple( + (f"{char}https://example.com{char}", "https://example.com") + for char in _C0_CONTROL_OR_SPACE + if char not in _ASCII_TAB_OR_NEWLINE +) + +SCHEME_NON_FIRST = _ASCII_ALPHANUMERIC + "+-." + +# Username and password characters that do not need escaping. +# Removed for RFC 2396 and RFC 3986: % +# Removed for the URL living standard: :;= +USERINFO_SAFE = _ASCII_ALPHANUMERIC + "-_.!~*'()" + "&+$," +USERNAME_TO_ENCODE = "".join( + chr(value) + for value in range(0x80) + if ( + chr(value) not in _C0_CONTROL_OR_SPACE + and chr(value) not in USERINFO_SAFE + and chr(value) not in ":/?#\\" + ) +) +USERNAME_ENCODED = "".join(f"%{ord(char):02X}" for char in USERNAME_TO_ENCODE) +PASSWORD_TO_ENCODE = USERNAME_TO_ENCODE + ":" +PASSWORD_ENCODED = "".join(f"%{ord(char):02X}" for char in PASSWORD_TO_ENCODE) + +# Path characters that do not need escaping.
+# Removed for RFC 2396 and RFC 3986: %[\]^| +PATH_SAFE = _ASCII_ALPHANUMERIC + "-_.!~*'()" + ":@&=+$," + "/" + ";" +PATH_TO_ENCODE = "".join( + chr(value) + for value in range(0x80) + if ( + chr(value) not in _C0_CONTROL_OR_SPACE + and chr(value) not in PATH_SAFE + and chr(value) not in "?#\\" + ) +) +PATH_ENCODED = "".join(f"%{ord(char):02X}" for char in PATH_TO_ENCODE) + +# Query characters that do not need escaping. +# Removed for RFC 2396 and RFC 3986: %[\]^`{|} +# Removed for the URL living standard: ' (special) +QUERY_SAFE = _ASCII_ALPHANUMERIC + "-_.!~*'()" + ":@&=+$," + "/" + ";" + "?" +QUERY_TO_ENCODE = "".join( + chr(value) + for value in range(0x80) + if ( + chr(value) not in _C0_CONTROL_OR_SPACE + and chr(value) not in QUERY_SAFE + and chr(value) not in "#" + ) +) +QUERY_ENCODED = "".join(f"%{ord(char):02X}" for char in QUERY_TO_ENCODE) +SPECIAL_QUERY_SAFE = QUERY_SAFE.replace("'", "") +SPECIAL_QUERY_TO_ENCODE = "".join( + chr(value) + for value in range(0x80) + if ( + chr(value) not in _C0_CONTROL_OR_SPACE + and chr(value) not in SPECIAL_QUERY_SAFE + and chr(value) not in "#" + ) +) +SPECIAL_QUERY_ENCODED = "".join(f"%{ord(char):02X}" for char in SPECIAL_QUERY_TO_ENCODE) + +# Fragment characters that do not need escaping. +# Removed for RFC 2396 and RFC 3986: #%[\\]^{|} +FRAGMENT_SAFE = _ASCII_ALPHANUMERIC + "-_.!~*'()" + ":@&=+$," + "/" + ";" + "?" +FRAGMENT_TO_ENCODE = "".join( + chr(value) + for value in range(0x80) + if (chr(value) not in _C0_CONTROL_OR_SPACE and chr(value) not in FRAGMENT_SAFE) +) +FRAGMENT_ENCODED = "".join(f"%{ord(char):02X}" for char in FRAGMENT_TO_ENCODE) + + +# Test cases for URL-to-safe-URL conversions with only a URL as input parameter +# (i.e. no encoding or base URL). +# +# (input URL, output URL or exception) +SAFE_URL_URL_CASES = ( + # Invalid input type + (1, Exception), + (object(), Exception), + # Empty string + ("", ValueError), + *SAFE_URL_URL_STRIP_CASES, + # Remove all ASCII tab or newline from input. 
+ ( + ( + f"{_ASCII_TAB_OR_NEWLINE}h{_ASCII_TAB_OR_NEWLINE}ttps" + f"{_ASCII_TAB_OR_NEWLINE}:{_ASCII_TAB_OR_NEWLINE}/" + f"{_ASCII_TAB_OR_NEWLINE}/{_ASCII_TAB_OR_NEWLINE}a" + f"{_ASCII_TAB_OR_NEWLINE}b{_ASCII_TAB_OR_NEWLINE}:" + f"{_ASCII_TAB_OR_NEWLINE}a{_ASCII_TAB_OR_NEWLINE}b" + f"{_ASCII_TAB_OR_NEWLINE}@{_ASCII_TAB_OR_NEWLINE}exam" + f"{_ASCII_TAB_OR_NEWLINE}ple.com{_ASCII_TAB_OR_NEWLINE}:" + f"{_ASCII_TAB_OR_NEWLINE}1{_ASCII_TAB_OR_NEWLINE}2" + f"{_ASCII_TAB_OR_NEWLINE}/{_ASCII_TAB_OR_NEWLINE}a" + f"{_ASCII_TAB_OR_NEWLINE}b{_ASCII_TAB_OR_NEWLINE}?" + f"{_ASCII_TAB_OR_NEWLINE}a{_ASCII_TAB_OR_NEWLINE}b" + f"{_ASCII_TAB_OR_NEWLINE}#{_ASCII_TAB_OR_NEWLINE}a" + f"{_ASCII_TAB_OR_NEWLINE}b{_ASCII_TAB_OR_NEWLINE}" + ), + "https://ab:ab@example.com:12/ab?ab#ab", + ), + # Scheme + (f"{_ASCII_ALPHA}://example.com", f"{_ASCII_ALPHA.lower()}://example.com"), + ( + f"a{SCHEME_NON_FIRST}://example.com", + f"a{SCHEME_NON_FIRST.lower()}://example.com", + ), + *SAFE_URL_URL_INVALID_SCHEME_CASES, + # Authority + ("https://a@example.com", "https://a@example.com"), + ("https://a:@example.com", "https://a:@example.com"), + ("https://a:a@example.com", "https://a:a@example.com"), + ("https://a%3A@example.com", "https://a%3A@example.com"), + ( + f"https://{USERINFO_SAFE}:{USERINFO_SAFE}@example.com", + f"https://{USERINFO_SAFE}:{USERINFO_SAFE}@example.com", + ), + ( + f"https://{USERNAME_TO_ENCODE}:{PASSWORD_TO_ENCODE}@example.com", + f"https://{USERNAME_ENCODED}:{PASSWORD_ENCODED}@example.com", + ), + ("https://@\\example.com", ValueError), + ("https://\x80:\x80@example.com", "https://%C2%80:%C2%80@example.com"), + # Host + ("https://example.com", "https://example.com"), + ("https://.example", "https://.example"), + ("https://\x80.example", ValueError), + ("https://%80.example", ValueError), + # The 4 cases below test before and after crossing DNS length limits on + # domain name labels (63 characters) and the domain name as a whole (253 + # characters). 
However, all cases are expected to pass because the URL + # living standard does not require domain names to be within these limits. + (f"https://{'a'*63}.example", f"https://{'a'*63}.example"), + (f"https://{'a'*64}.example", f"https://{'a'*64}.example"), + ( + f"https://{'a'*63}.{'a'*63}.{'a'*63}.{'a'*53}.example", + f"https://{'a'*63}.{'a'*63}.{'a'*63}.{'a'*53}.example", + ), + ( + f"https://{'a'*63}.{'a'*63}.{'a'*63}.{'a'*54}.example", + f"https://{'a'*63}.{'a'*63}.{'a'*63}.{'a'*54}.example", + ), + ("https://ñ.example", "https://xn--ida.example"), + ("http://192.168.0.0", "http://192.168.0.0"), + ("http://192.168.0.256", ValueError), + ("http://192.168.0.0.0", ValueError), + ("http://[2a01:5cc0:1:2::4]", "http://[2a01:5cc0:1:2::4]"), + ("http://[2a01:5cc0:1:2:3:4]", ValueError), + # Port + ("https://example.com:", "https://example.com:"), + ("https://example.com:1", "https://example.com:1"), + ("https://example.com:443", "https://example.com:443"), + # Path + ("https://example.com/", "https://example.com/"), + ("https://example.com/a", "https://example.com/a"), + ("https://example.com\\a", "https://example.com/a"), + ("https://example.com/a\\b", "https://example.com/a/b"), + ( + f"https://example.com/{PATH_SAFE}", + f"https://example.com/{PATH_SAFE}", + ), + ( + f"https://example.com/{PATH_TO_ENCODE}", + f"https://example.com/{PATH_ENCODED}", + ), + ("https://example.com/ñ", "https://example.com/%C3%B1"), + ("https://example.com/ñ%C3%B1", "https://example.com/%C3%B1%C3%B1"), + # Query + ("https://example.com?", "https://example.com?"), + ("https://example.com/?", "https://example.com/?"), + ("https://example.com?a", "https://example.com?a"), + ("https://example.com?a=", "https://example.com?a="), + ("https://example.com?a=b", "https://example.com?a=b"), + ( + f"a://example.com?{QUERY_SAFE}", + f"a://example.com?{QUERY_SAFE}", + ), + ( + f"a://example.com?{QUERY_TO_ENCODE}", + f"a://example.com?{QUERY_ENCODED}", + ), + *( + ( + 
f"{scheme}://example.com?{SPECIAL_QUERY_SAFE}", + f"{scheme}://example.com?{SPECIAL_QUERY_SAFE}", + ) + for scheme in _SPECIAL_SCHEMES + ), + *( + ( + f"{scheme}://example.com?{SPECIAL_QUERY_TO_ENCODE}", + f"{scheme}://example.com?{SPECIAL_QUERY_ENCODED}", + ) + for scheme in _SPECIAL_SCHEMES + ), + ("https://example.com?ñ", "https://example.com?%C3%B1"), + ("https://example.com?ñ%C3%B1", "https://example.com?%C3%B1%C3%B1"), + # Fragment + ("https://example.com#", "https://example.com#"), + ("https://example.com/#", "https://example.com/#"), + ("https://example.com?#", "https://example.com?#"), + ("https://example.com/?#", "https://example.com/?#"), + ("https://example.com#a", "https://example.com#a"), + ( + f"a://example.com#{FRAGMENT_SAFE}", + f"a://example.com#{FRAGMENT_SAFE}", + ), + ( + f"a://example.com#{FRAGMENT_TO_ENCODE}", + f"a://example.com#{FRAGMENT_ENCODED}", + ), + ("https://example.com#ñ", "https://example.com#%C3%B1"), + ("https://example.com#ñ%C3%B1", "https://example.com#%C3%B1%C3%B1"), + # All fields, UTF-8 wherever possible. + ( + "https://ñ:ñ@ñ.example:1/ñ?ñ#ñ", + "https://%C3%B1:%C3%B1@xn--ida.example:1/%C3%B1?%C3%B1#%C3%B1", + ), +) + + +def _test_safe_url_func(url, *, encoding=UNSET, output, func): + kwargs = {} + if encoding is not UNSET: + kwargs["encoding"] = encoding + try: + is_exception = issubclass(output, Exception) + except TypeError: + is_exception = False + if is_exception: + with pytest.raises(output): + func(url, **kwargs) + return + actual = func(url, **kwargs) + assert actual == output + assert func(actual, **kwargs) == output # Idempotency + + +def _test_safe_url_string(url, *, encoding=UNSET, output): + return _test_safe_url_func( + url, + encoding=encoding, + output=output, + func=safe_url_string, + ) + + +KNOWN_SAFE_URL_STRING_ENCODING_ISSUES = { + (UNSET, ""), # Invalid URL + # UTF-8 encoding is not enforced in non-special URLs, or in URLs with the + # ws or wss schemes.
+ ("iso-8859-1", "a://example.com?\xa9"), + ("iso-8859-1", "ws://example.com?\xa9"), + ("iso-8859-1", "wss://example.com?\xa9"), + # UTF-8 encoding is not enforced on the fragment. + ("iso-8859-1", "https://example.com#\xa9"), +} + + +@pytest.mark.parametrize( + "encoding,url,output", + tuple( + case + if case[:2] not in KNOWN_SAFE_URL_STRING_ENCODING_ISSUES + else pytest.param(*case, marks=pytest.mark.xfail(strict=True)) + for case in SAFE_URL_ENCODING_CASES + ), +) +def test_safe_url_string_encoding(encoding, url, output): + _test_safe_url_string(url, encoding=encoding, output=output) + + +KNOWN_SAFE_URL_STRING_URL_ISSUES = { + "", # Invalid URL + *(case[0] for case in SAFE_URL_URL_STRIP_CASES), + *(case[0] for case in SAFE_URL_URL_INVALID_SCHEME_CASES), + # Userinfo characters that the URL living standard requires escaping (:;=) + # are not escaped. + "https://@\\example.com", # Invalid URL + "https://\x80.example", # Invalid domain name (non-visible character) + "https://%80.example", # Invalid domain name (non-visible character) + "http://192.168.0.256", # Invalid IP address + "http://192.168.0.0.0", # Invalid IP address / domain name + "http://[2a01:5cc0:1:2::4]", # https://github.com/scrapy/w3lib/issues/193 + "http://[2a01:5cc0:1:2:3:4]", # Invalid IPv6 + "https://example.com:", # Removes the : + # Does not convert \ to / + "https://example.com\\a", + "https://example.com\\a\\b", + # Encodes \ and / after the first one in the path + "https://example.com/a/b", + "https://example.com/a\\b", + # Some path characters that RFC 2396 and RFC 3986 require escaping (%) + # are not escaped. + f"https://example.com/{PATH_TO_ENCODE}", + # ? is removed + "https://example.com?", + "https://example.com/?", + # Some query characters that RFC 2396 and RFC 3986 require escaping (%) + # are not escaped. + f"a://example.com?{QUERY_TO_ENCODE}", + # Some special query characters that RFC 2396 and RFC 3986 require escaping + # (%) are not escaped. 
+ *( + f"{scheme}://example.com?{SPECIAL_QUERY_TO_ENCODE}" + for scheme in _SPECIAL_SCHEMES + ), + # ? and # are removed + "https://example.com#", + "https://example.com/#", + "https://example.com?#", + "https://example.com/?#", + # Some fragment characters that RFC 2396 and RFC 3986 require escaping + # (%) are not escaped. + f"a://example.com#{FRAGMENT_TO_ENCODE}", +} + + +@pytest.mark.parametrize( + "url,output", + tuple( + case + if case[0] not in KNOWN_SAFE_URL_STRING_URL_ISSUES + else pytest.param(*case, marks=pytest.mark.xfail(strict=True)) + for case in SAFE_URL_URL_CASES + ), +) +def test_safe_url_string_url(url, output): + _test_safe_url_string(url, output=output) + + class UrlTests(unittest.TestCase): def test_safe_url_string(self): # Motoko Kusanagi (Cyborg from Ghost in the Shell) @@ -106,14 +510,6 @@ def test_safe_url_string_remove_ascii_tab_and_newlines(self): "http://example.com/test%07.html", ) - def test_safe_url_string_unsafe_chars(self): - safeurl = safe_url_string( - r"http://localhost:8001/unwise{,},|,\,^,[,],`?|=[]&[]=|" - ) - self.assertEqual( - safeurl, r"http://localhost:8001/unwise%7B,%7D,|,%5C,%5E,[,],%60?|=[]&[]=|" - ) - def test_safe_url_string_quote_path(self): safeurl = safe_url_string('http://google.com/"hello"', quote_path=True) self.assertEqual(safeurl, "http://google.com/%22hello%22") diff --git a/tox.ini b/tox.ini index 5647f5a1..60b12f38 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. 
[tox] -envlist = py36, py37, py38, py39, py310, pypy3, docs, security, flake8, pylint, black +envlist = py37, py38, py39, py310, pypy3, docs, security, flake8, pylint, black, typing [testenv] deps = @@ -50,7 +50,7 @@ commands = deps = black==22.6.0 commands = - black --check {posargs:conftest.py setup.py tests w3lib} + black {posargs:--check conftest.py setup.py tests w3lib} [docs] changedir = docs diff --git a/w3lib/_infra.py b/w3lib/_infra.py new file mode 100644 index 00000000..f18437eb --- /dev/null +++ b/w3lib/_infra.py @@ -0,0 +1,13 @@ +# https://infra.spec.whatwg.org/ + +import string + +# https://infra.spec.whatwg.org/commit-snapshots/59e0d16c1e3ba0e77c6a60bfc69a0929b8ffaa5d/#code-points +_ASCII_TAB_OR_NEWLINE = "\t\n\r" +_ASCII_WHITESPACE = "\t\n\x0c\r " +_C0_CONTROL = "".join(chr(n) for n in range(32)) +_C0_CONTROL_OR_SPACE = _C0_CONTROL + " " +_ASCII_DIGIT = string.digits +_ASCII_HEX_DIGIT = string.hexdigits +_ASCII_ALPHA = string.ascii_letters +_ASCII_ALPHANUMERIC = string.ascii_letters + string.digits diff --git a/w3lib/_url.py b/w3lib/_url.py new file mode 100644 index 00000000..b6e38657 --- /dev/null +++ b/w3lib/_url.py @@ -0,0 +1,12 @@ +# https://url.spec.whatwg.org/ + +# https://url.spec.whatwg.org/commit-snapshots/a46cb9188a48c2c9d80ba32a9b1891652d6b4900/#default-port +_DEFAULT_PORTS = { + "ftp": 21, + "file": None, + "http": 80, + "https": 443, + "ws": 80, + "wss": 443, +} +_SPECIAL_SCHEMES = set(_DEFAULT_PORTS.keys()) diff --git a/w3lib/url.py b/w3lib/url.py index a41446c4..b5b84ec8 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -35,8 +35,10 @@ ) from urllib.parse import _coerce_args # type: ignore from urllib.request import pathname2url, url2pathname -from w3lib.util import to_unicode -from w3lib._types import AnyUnicodeError, StrOrBytes + +from .util import to_unicode +from ._types import AnyUnicodeError, StrOrBytes +from ._url import _SPECIAL_SCHEMES # error handling function for bytes-to-Unicode decoding errors with URLs @@ -54,16 +56,33 @@ 
def _quote_byte(error: UnicodeError) -> Tuple[str, int]: RFC3986_UNRESERVED = (string.ascii_letters + string.digits + "-._~").encode("ascii") EXTRA_SAFE_CHARS = b"|" # see https://github.com/scrapy/w3lib/pull/25 +RFC3986_USERINFO_SAFE_CHARS = RFC3986_UNRESERVED + RFC3986_SUB_DELIMS + b":" _safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b"%" _path_safe_chars = _safe_chars.replace(b"#", b"") -RFC3986_USERINFO_SAFE_CHARS = RFC3986_UNRESERVED + RFC3986_SUB_DELIMS + b":" + +# Characters that are safe in all of: +# +# - RFC 2396 + RFC 2732, as interpreted by Java 8’s java.net.URI class +# - RFC 3986 +# - The URL living standard +# +# NOTE: % is currently excluded from these lists of characters, due to +# limitations of the current safe_url_string implementation, but it should also +# be escaped as %25 when it is not already being used as part of an escape +# character. +_USERINFO_SAFEST_CHARS = RFC3986_USERINFO_SAFE_CHARS.translate(None, b":;=") +_PATH_SAFEST_CHARS = _safe_chars.translate(None, b"#[]|") +_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS +_SPECIAL_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS.translate(None, b"'") +_FRAGMENT_SAFEST_CHARS = _PATH_SAFEST_CHARS + _ascii_tab_newline_re = re.compile( r"[\t\n\r]" ) # see https://infra.spec.whatwg.org/#ascii-tab-or-newline -def safe_url_string( +def safe_url_string( # pylint: disable=too-many-locals url: StrOrBytes, encoding: str = "utf8", path_encoding: str = "utf8", @@ -106,11 +125,11 @@ def safe_url_string( netloc_bytes = b"" if username is not None or password is not None: if username is not None: - safe_username = quote(unquote(username), RFC3986_USERINFO_SAFE_CHARS) + safe_username = quote(unquote(username), _USERINFO_SAFEST_CHARS) netloc_bytes += safe_username.encode(encoding) if password is not None: netloc_bytes += b":" - safe_password = quote(unquote(password), RFC3986_USERINFO_SAFE_CHARS) + safe_password = quote(unquote(password), _USERINFO_SAFEST_CHARS) netloc_bytes += 
safe_password.encode(encoding) netloc_bytes += b"@" if hostname is not None: @@ -128,17 +147,22 @@ def safe_url_string( # default encoding for path component SHOULD be UTF-8 if quote_path: - path = quote(parts.path.encode(path_encoding), _path_safe_chars) + path = quote(parts.path.encode(path_encoding), _PATH_SAFEST_CHARS) else: path = parts.path + if parts.scheme in _SPECIAL_SCHEMES: + query = quote(parts.query.encode(encoding), _SPECIAL_QUERY_SAFEST_CHARS) + else: + query = quote(parts.query.encode(encoding), _QUERY_SAFEST_CHARS) + return urlunsplit( ( parts.scheme, netloc, path, - quote(parts.query.encode(encoding), _safe_chars), - quote(parts.fragment.encode(encoding), _safe_chars), + query, + quote(parts.fragment.encode(encoding), _FRAGMENT_SAFEST_CHARS), ) ) From 7d9653661c24ffce2b53caf65b2dc964fa731ac6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 24 Nov 2022 09:52:41 +0100 Subject: [PATCH 2/3] bytes.translate: specify the delete parameter name for readability Co-authored-by: Mikhail Korobov --- w3lib/url.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/w3lib/url.py b/w3lib/url.py index b5b84ec8..b22b3b85 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -70,10 +70,10 @@ def _quote_byte(error: UnicodeError) -> Tuple[str, int]: # limitations of the current safe_url_string implementation, but it should also # be escaped as %25 when it is not already being used as part of an escape # character. 
-_USERINFO_SAFEST_CHARS = RFC3986_USERINFO_SAFE_CHARS.translate(None, b":;=") -_PATH_SAFEST_CHARS = _safe_chars.translate(None, b"#[]|") +_USERINFO_SAFEST_CHARS = RFC3986_USERINFO_SAFE_CHARS.translate(None, delete=b":;=") +_PATH_SAFEST_CHARS = _safe_chars.translate(None, delete=b"#[]|") _QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS -_SPECIAL_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS.translate(None, b"'") +_SPECIAL_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS.translate(None, delete=b"'") _FRAGMENT_SAFEST_CHARS = _PATH_SAFEST_CHARS From 0994e08acafb12d7eac069cfc7c57811ce4fca35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 24 Nov 2022 11:17:36 +0100 Subject: [PATCH 3/3] .flake8: move unsupported inline comment --- .flake8 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.flake8 b/.flake8 index 268fd3a8..96c8f44d 100644 --- a/.flake8 +++ b/.flake8 @@ -10,4 +10,5 @@ ignore = W504, # black disagrees with flake8, and inserts whitespace - E203, # whitespace before ':' + # E203: whitespace before ':' + E203,