From 4537ff58f38b9525a124bf6a381e425bcdf1a439 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= <adrian@chaves.io>
Date: Wed, 23 Nov 2022 13:06:36 +0100
Subject: [PATCH 1/3] Make safe_url_string safer

---
 tests/test_url.py | 412 +++++++++++++++++++++++++++++++++++++++++++++-
 tox.ini           |   4 +-
 w3lib/_infra.py   |  13 ++
 w3lib/_url.py     |  12 ++
 w3lib/url.py      |  42 ++++-
 5 files changed, 464 insertions(+), 19 deletions(-)
 create mode 100644 w3lib/_infra.py
 create mode 100644 w3lib/_url.py

diff --git a/tests/test_url.py b/tests/test_url.py
index 2356c2c3..261406d5 100644
--- a/tests/test_url.py
+++ b/tests/test_url.py
@@ -4,6 +4,13 @@
 
 import pytest
 
+from w3lib._infra import (
+    _ASCII_ALPHA,
+    _ASCII_ALPHANUMERIC,
+    _ASCII_TAB_OR_NEWLINE,
+    _C0_CONTROL_OR_SPACE,
+)
+from w3lib._url import _SPECIAL_SCHEMES
 from w3lib.url import (
     add_or_replace_parameter,
     add_or_replace_parameters,
@@ -21,6 +28,403 @@
 )
 
 
+UNSET = object()
+
+# Test cases for URL-to-safe-URL conversions with a URL and an encoding as
+# input parameters.
+#
+# (encoding, input URL, output URL or exception)
+SAFE_URL_ENCODING_CASES = (
+    (UNSET, "", ValueError),
+    (UNSET, "https://example.com", "https://example.com"),
+    (UNSET, "https://example.com/©", "https://example.com/%C2%A9"),
+    # Paths are always UTF-8-encoded.
+    ("iso-8859-1", "https://example.com/©", "https://example.com/%C2%A9"),
+# Queries are UTF-8-encoded if the scheme is not special, or is ws or wss.
+    ("iso-8859-1", "a://example.com?©", "a://example.com?%C2%A9"),
+    *(
+        ("iso-8859-1", f"{scheme}://example.com?©", f"{scheme}://example.com?%C2%A9")
+        for scheme in ("ws", "wss")
+    ),
+    *(
+        ("iso-8859-1", f"{scheme}://example.com?©", f"{scheme}://example.com?%A9")
+        for scheme in _SPECIAL_SCHEMES
+        if scheme not in {"ws", "wss"}
+    ),
+    # Fragments are always UTF-8-encoded.
+    ("iso-8859-1", "https://example.com#©", "https://example.com#%C2%A9"),
+)
+
+INVALID_SCHEME_FOLLOW_UPS = "".join(
+    chr(value)
+    for value in range(0x81)
+    if (
+        chr(value) not in _ASCII_ALPHANUMERIC
+        and chr(value) not in "+-."
+        and chr(value) not in _C0_CONTROL_OR_SPACE  # stripped
+        and chr(value) != ":"  # separator
+    )
+)
+
+SAFE_URL_URL_INVALID_SCHEME_CASES = tuple(
+    (f"{scheme}://example.com", ValueError)
+    for scheme in (
+        # A scheme is required.
+        "",
+        # The first scheme letter must be an ASCII alpha.
+        # Note: 0x80 is included below to also test a non-ASCII example.
+        *(
+            chr(value)
+            for value in range(0x81)
+            if (
+                chr(value) not in _ASCII_ALPHA
+                and chr(value) not in _C0_CONTROL_OR_SPACE  # stripped
+                and chr(value) != ":"  # separator
+            )
+        ),
+        # The follow-up scheme letters can also be ASCII numbers, plus, hyphen,
+        # or period.
+        f"a{INVALID_SCHEME_FOLLOW_UPS}",
+    )
+)
+
+# Remove any leading and trailing C0 control or space from input.
+SAFE_URL_URL_STRIP_CASES = tuple(
+    (f"{char}https://example.com{char}", "https://example.com")
+    for char in _C0_CONTROL_OR_SPACE
+    if char not in _ASCII_TAB_OR_NEWLINE
+)
+
+SCHEME_NON_FIRST = _ASCII_ALPHANUMERIC + "+-."
+
+# Username and password characters that do not need escaping.
+# Removed for RFC 2396 and RFC 3986: %
+# Removed for the URL living standard: :;=
+USERINFO_SAFE = _ASCII_ALPHANUMERIC + "-_.!~*'()" + "&+$,"
+USERNAME_TO_ENCODE = "".join(
+    chr(value)
+    for value in range(0x80)
+    if (
+        chr(value) not in _C0_CONTROL_OR_SPACE
+        and chr(value) not in USERINFO_SAFE
+        and chr(value) not in ":/?#\\"
+    )
+)
+USERNAME_ENCODED = "".join(f"%{ord(char):02X}" for char in USERNAME_TO_ENCODE)
+PASSWORD_TO_ENCODE = USERNAME_TO_ENCODE + ":"
+PASSWORD_ENCODED = "".join(f"%{ord(char):02X}" for char in PASSWORD_TO_ENCODE)
+
+# Path characters that do not need escaping.
+# Removed for RFC 2396 and RFC 3986: %[\]^|
+PATH_SAFE = _ASCII_ALPHANUMERIC + "-_.!~*'()" + ":@&=+$," + "/" + ";"
+PATH_TO_ENCODE = "".join(
+    chr(value)
+    for value in range(0x80)
+    if (
+        chr(value) not in _C0_CONTROL_OR_SPACE
+        and chr(value) not in PATH_SAFE
+        and chr(value) not in "?#\\"
+    )
+)
+PATH_ENCODED = "".join(f"%{ord(char):02X}" for char in PATH_TO_ENCODE)
+
+# Query characters that do not need escaping.
+# Removed for RFC 2396 and RFC 3986: %[\]^`{|}
+# Removed for the URL living standard: ' (special)
+QUERY_SAFE = _ASCII_ALPHANUMERIC + "-_.!~*'()" + ":@&=+$," + "/" + ";" + "?"
+QUERY_TO_ENCODE = "".join(
+    chr(value)
+    for value in range(0x80)
+    if (
+        chr(value) not in _C0_CONTROL_OR_SPACE
+        and chr(value) not in QUERY_SAFE
+        and chr(value) not in "#"
+    )
+)
+QUERY_ENCODED = "".join(f"%{ord(char):02X}" for char in QUERY_TO_ENCODE)
+SPECIAL_QUERY_SAFE = QUERY_SAFE.replace("'", "")
+SPECIAL_QUERY_TO_ENCODE = "".join(
+    chr(value)
+    for value in range(0x80)
+    if (
+        chr(value) not in _C0_CONTROL_OR_SPACE
+        and chr(value) not in SPECIAL_QUERY_SAFE
+        and chr(value) not in "#"
+    )
+)
+SPECIAL_QUERY_ENCODED = "".join(f"%{ord(char):02X}" for char in SPECIAL_QUERY_TO_ENCODE)
+
+# Fragment characters that do not need escaping.
+# Removed for RFC 2396 and RFC 3986: #%[\]^{|}
+FRAGMENT_SAFE = _ASCII_ALPHANUMERIC + "-_.!~*'()" + ":@&=+$," + "/" + ";" + "?"
+FRAGMENT_TO_ENCODE = "".join(
+    chr(value)
+    for value in range(0x80)
+    if (chr(value) not in _C0_CONTROL_OR_SPACE and chr(value) not in FRAGMENT_SAFE)
+)
+FRAGMENT_ENCODED = "".join(f"%{ord(char):02X}" for char in FRAGMENT_TO_ENCODE)
+
+
+# Test cases for URL-to-safe-URL conversions with only a URL as input parameter
+# (i.e. no encoding or base URL).
+#
+# (input URL, output URL or exception)
+SAFE_URL_URL_CASES = (
+    # Invalid input type
+    (1, Exception),
+    (object(), Exception),
+    # Empty string
+    ("", ValueError),
+    *SAFE_URL_URL_STRIP_CASES,
+    # Remove all ASCII tab or newline from input.
+    (
+        (
+            f"{_ASCII_TAB_OR_NEWLINE}h{_ASCII_TAB_OR_NEWLINE}ttps"
+            f"{_ASCII_TAB_OR_NEWLINE}:{_ASCII_TAB_OR_NEWLINE}/"
+            f"{_ASCII_TAB_OR_NEWLINE}/{_ASCII_TAB_OR_NEWLINE}a"
+            f"{_ASCII_TAB_OR_NEWLINE}b{_ASCII_TAB_OR_NEWLINE}:"
+            f"{_ASCII_TAB_OR_NEWLINE}a{_ASCII_TAB_OR_NEWLINE}b"
+            f"{_ASCII_TAB_OR_NEWLINE}@{_ASCII_TAB_OR_NEWLINE}exam"
+            f"{_ASCII_TAB_OR_NEWLINE}ple.com{_ASCII_TAB_OR_NEWLINE}:"
+            f"{_ASCII_TAB_OR_NEWLINE}1{_ASCII_TAB_OR_NEWLINE}2"
+            f"{_ASCII_TAB_OR_NEWLINE}/{_ASCII_TAB_OR_NEWLINE}a"
+            f"{_ASCII_TAB_OR_NEWLINE}b{_ASCII_TAB_OR_NEWLINE}?"
+            f"{_ASCII_TAB_OR_NEWLINE}a{_ASCII_TAB_OR_NEWLINE}b"
+            f"{_ASCII_TAB_OR_NEWLINE}#{_ASCII_TAB_OR_NEWLINE}a"
+            f"{_ASCII_TAB_OR_NEWLINE}b{_ASCII_TAB_OR_NEWLINE}"
+        ),
+        "https://ab:ab@example.com:12/ab?ab#ab",
+    ),
+    # Scheme
+    (f"{_ASCII_ALPHA}://example.com", f"{_ASCII_ALPHA.lower()}://example.com"),
+    (
+        f"a{SCHEME_NON_FIRST}://example.com",
+        f"a{SCHEME_NON_FIRST.lower()}://example.com",
+    ),
+    *SAFE_URL_URL_INVALID_SCHEME_CASES,
+    # Authority
+    ("https://a@example.com", "https://a@example.com"),
+    ("https://a:@example.com", "https://a:@example.com"),
+    ("https://a:a@example.com", "https://a:a@example.com"),
+    ("https://a%3A@example.com", "https://a%3A@example.com"),
+    (
+        f"https://{USERINFO_SAFE}:{USERINFO_SAFE}@example.com",
+        f"https://{USERINFO_SAFE}:{USERINFO_SAFE}@example.com",
+    ),
+    (
+        f"https://{USERNAME_TO_ENCODE}:{PASSWORD_TO_ENCODE}@example.com",
+        f"https://{USERNAME_ENCODED}:{PASSWORD_ENCODED}@example.com",
+    ),
+    ("https://@\\example.com", ValueError),
+    ("https://\x80:\x80@example.com", "https://%C2%80:%C2%80@example.com"),
+    # Host
+    ("https://example.com", "https://example.com"),
+    ("https://.example", "https://.example"),
+    ("https://\x80.example", ValueError),
+    ("https://%80.example", ValueError),
+    # The 4 cases below test before and after crossing DNS length limits on
+    # domain name labels (63 characters) and the domain name as a whole (253
+    # characters). However, all cases are expected to pass because the URL
+    # living standard does not require domain names to be within these limits.
+    (f"https://{'a'*63}.example", f"https://{'a'*63}.example"),
+    (f"https://{'a'*64}.example", f"https://{'a'*64}.example"),
+    (
+        f"https://{'a'*63}.{'a'*63}.{'a'*63}.{'a'*53}.example",
+        f"https://{'a'*63}.{'a'*63}.{'a'*63}.{'a'*53}.example",
+    ),
+    (
+        f"https://{'a'*63}.{'a'*63}.{'a'*63}.{'a'*54}.example",
+        f"https://{'a'*63}.{'a'*63}.{'a'*63}.{'a'*54}.example",
+    ),
+    ("https://ñ.example", "https://xn--ida.example"),
+    ("http://192.168.0.0", "http://192.168.0.0"),
+    ("http://192.168.0.256", ValueError),
+    ("http://192.168.0.0.0", ValueError),
+    ("http://[2a01:5cc0:1:2::4]", "http://[2a01:5cc0:1:2::4]"),
+    ("http://[2a01:5cc0:1:2:3:4]", ValueError),
+    # Port
+    ("https://example.com:", "https://example.com:"),
+    ("https://example.com:1", "https://example.com:1"),
+    ("https://example.com:443", "https://example.com:443"),
+    # Path
+    ("https://example.com/", "https://example.com/"),
+    ("https://example.com/a", "https://example.com/a"),
+    ("https://example.com\\a", "https://example.com/a"),
+    ("https://example.com/a\\b", "https://example.com/a/b"),
+    (
+        f"https://example.com/{PATH_SAFE}",
+        f"https://example.com/{PATH_SAFE}",
+    ),
+    (
+        f"https://example.com/{PATH_TO_ENCODE}",
+        f"https://example.com/{PATH_ENCODED}",
+    ),
+    ("https://example.com/ñ", "https://example.com/%C3%B1"),
+    ("https://example.com/ñ%C3%B1", "https://example.com/%C3%B1%C3%B1"),
+    # Query
+    ("https://example.com?", "https://example.com?"),
+    ("https://example.com/?", "https://example.com/?"),
+    ("https://example.com?a", "https://example.com?a"),
+    ("https://example.com?a=", "https://example.com?a="),
+    ("https://example.com?a=b", "https://example.com?a=b"),
+    (
+        f"a://example.com?{QUERY_SAFE}",
+        f"a://example.com?{QUERY_SAFE}",
+    ),
+    (
+        f"a://example.com?{QUERY_TO_ENCODE}",
+        f"a://example.com?{QUERY_ENCODED}",
+    ),
+    *(
+        (
+            f"{scheme}://example.com?{SPECIAL_QUERY_SAFE}",
+            f"{scheme}://example.com?{SPECIAL_QUERY_SAFE}",
+        )
+        for scheme in _SPECIAL_SCHEMES
+    ),
+    *(
+        (
+            f"{scheme}://example.com?{SPECIAL_QUERY_TO_ENCODE}",
+            f"{scheme}://example.com?{SPECIAL_QUERY_ENCODED}",
+        )
+        for scheme in _SPECIAL_SCHEMES
+    ),
+    ("https://example.com?ñ", "https://example.com?%C3%B1"),
+    ("https://example.com?ñ%C3%B1", "https://example.com?%C3%B1%C3%B1"),
+    # Fragment
+    ("https://example.com#", "https://example.com#"),
+    ("https://example.com/#", "https://example.com/#"),
+    ("https://example.com?#", "https://example.com?#"),
+    ("https://example.com/?#", "https://example.com/?#"),
+    ("https://example.com#a", "https://example.com#a"),
+    (
+        f"a://example.com#{FRAGMENT_SAFE}",
+        f"a://example.com#{FRAGMENT_SAFE}",
+    ),
+    (
+        f"a://example.com#{FRAGMENT_TO_ENCODE}",
+        f"a://example.com#{FRAGMENT_ENCODED}",
+    ),
+    ("https://example.com#ñ", "https://example.com#%C3%B1"),
+    ("https://example.com#ñ%C3%B1", "https://example.com#%C3%B1%C3%B1"),
+    # All fields, UTF-8 wherever possible.
+    (
+        "https://ñ:ñ@ñ.example:1/ñ?ñ#ñ",
+        "https://%C3%B1:%C3%B1@xn--ida.example:1/%C3%B1?%C3%B1#%C3%B1",
+    ),
+)
+
+
+def _test_safe_url_func(url, *, encoding=UNSET, output, func):
+    kwargs = {}
+    if encoding is not UNSET:  # UNSET: call func without an encoding argument
+        kwargs["encoding"] = encoding
+    try:
+        is_exception = issubclass(output, Exception)
+    except TypeError:  # output is not a class, i.e. it is an expected value
+        is_exception = False
+    if is_exception:
+        with pytest.raises(output):
+            func(url, **kwargs)
+        return
+    actual = func(url, **kwargs)
+    assert actual == output
+    assert func(actual, **kwargs) == output  # Idempotency
+
+
+def _test_safe_url_string(url, *, encoding=UNSET, output):
+    return _test_safe_url_func(
+        url,
+        encoding=encoding,
+        output=output,
+        func=safe_url_string,
+    )
+
+
+KNOWN_SAFE_URL_STRING_ENCODING_ISSUES = {
+    (UNSET, ""),  # Invalid URL
+    # UTF-8 encoding is not enforced in non-special URLs, or in URLs with the
+    # ws or wss schemes.
+    ("iso-8859-1", "a://example.com?\xa9"),
+    ("iso-8859-1", "ws://example.com?\xa9"),
+    ("iso-8859-1", "wss://example.com?\xa9"),
+    # UTF-8 encoding is not enforced on the fragment.
+    ("iso-8859-1", "https://example.com#\xa9"),
+}
+
+
+@pytest.mark.parametrize(
+    "encoding,url,output",
+    tuple(
+        case
+        if case[:2] not in KNOWN_SAFE_URL_STRING_ENCODING_ISSUES
+        else pytest.param(*case, marks=pytest.mark.xfail(strict=True))
+        for case in SAFE_URL_ENCODING_CASES
+    ),
+)
+def test_safe_url_string_encoding(encoding, url, output):
+    _test_safe_url_string(url, encoding=encoding, output=output)
+
+
+KNOWN_SAFE_URL_STRING_URL_ISSUES = {
+    "",  # Invalid URL
+    *(case[0] for case in SAFE_URL_URL_STRIP_CASES),
+    *(case[0] for case in SAFE_URL_URL_INVALID_SCHEME_CASES),
+    # Invalid authority inputs (userinfo, host, IP address) do not raise
+    # ValueError, and the valid IPv6 URL fails (see the linked issue).
+    "https://@\\example.com",  # Invalid URL
+    "https://\x80.example",  # Invalid domain name (non-visible character)
+    "https://%80.example",  # Invalid domain name (non-visible character)
+    "http://192.168.0.256",  # Invalid IP address
+    "http://192.168.0.0.0",  # Invalid IP address / domain name
+    "http://[2a01:5cc0:1:2::4]",  # https://github.com/scrapy/w3lib/issues/193
+    "http://[2a01:5cc0:1:2:3:4]",  # Invalid IPv6
+    "https://example.com:",  # Removes the :
+    # Does not convert \ to /
+    "https://example.com\\a",
+    "https://example.com\\a\\b",
+    # Encodes \ and / after the first one in the path
+    "https://example.com/a/b",
+    "https://example.com/a\\b",
+    # Some path characters that RFC 2396 and RFC 3986 require escaping (%)
+    # are not escaped.
+    f"https://example.com/{PATH_TO_ENCODE}",
+    # ? is removed
+    "https://example.com?",
+    "https://example.com/?",
+    # Some query characters that RFC 2396 and RFC 3986 require escaping (%)
+    # are not escaped.
+    f"a://example.com?{QUERY_TO_ENCODE}",
+    # Some special query characters that RFC 2396 and RFC 3986 require escaping
+    # (%) are not escaped.
+    *(
+        f"{scheme}://example.com?{SPECIAL_QUERY_TO_ENCODE}"
+        for scheme in _SPECIAL_SCHEMES
+    ),
+    # ? and # are removed
+    "https://example.com#",
+    "https://example.com/#",
+    "https://example.com?#",
+    "https://example.com/?#",
+    # Some fragment characters that RFC 2396 and RFC 3986 require escaping
+    # (%) are not escaped.
+    f"a://example.com#{FRAGMENT_TO_ENCODE}",
+}
+
+
+@pytest.mark.parametrize(
+    "url,output",
+    tuple(
+        case
+        if case[0] not in KNOWN_SAFE_URL_STRING_URL_ISSUES
+        else pytest.param(*case, marks=pytest.mark.xfail(strict=True))
+        for case in SAFE_URL_URL_CASES
+    ),
+)
+def test_safe_url_string_url(url, output):
+    _test_safe_url_string(url, output=output)
+
+
 class UrlTests(unittest.TestCase):
     def test_safe_url_string(self):
         # Motoko Kusanagi (Cyborg from Ghost in the Shell)
@@ -106,14 +510,6 @@ def test_safe_url_string_remove_ascii_tab_and_newlines(self):
             "http://example.com/test%07.html",
         )
 
-    def test_safe_url_string_unsafe_chars(self):
-        safeurl = safe_url_string(
-            r"http://localhost:8001/unwise{,},|,\,^,[,],`?|=[]&[]=|"
-        )
-        self.assertEqual(
-            safeurl, r"http://localhost:8001/unwise%7B,%7D,|,%5C,%5E,[,],%60?|=[]&[]=|"
-        )
-
     def test_safe_url_string_quote_path(self):
         safeurl = safe_url_string('http://google.com/"hello"', quote_path=True)
         self.assertEqual(safeurl, "http://google.com/%22hello%22")
diff --git a/tox.ini b/tox.ini
index 5647f5a1..60b12f38 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,7 +4,7 @@
 # and then run "tox" from this directory.
 
 [tox]
-envlist = py36, py37, py38, py39, py310, pypy3, docs, security, flake8, pylint, black
+envlist = py37, py38, py39, py310, pypy3, docs, security, flake8, pylint, black, typing
 
 [testenv]
 deps =
@@ -50,7 +50,7 @@ commands =
 deps =
     black==22.6.0
 commands =
-    black --check {posargs:conftest.py setup.py tests w3lib}
+    black {posargs:--check conftest.py setup.py tests w3lib}
 
 [docs]
 changedir = docs
diff --git a/w3lib/_infra.py b/w3lib/_infra.py
new file mode 100644
index 00000000..f18437eb
--- /dev/null
+++ b/w3lib/_infra.py
@@ -0,0 +1,13 @@
+# https://infra.spec.whatwg.org/
+
+import string
+
+# https://infra.spec.whatwg.org/commit-snapshots/59e0d16c1e3ba0e77c6a60bfc69a0929b8ffaa5d/#code-points
+_ASCII_TAB_OR_NEWLINE = "\t\n\r"
+_ASCII_WHITESPACE = "\t\n\x0c\r "
+_C0_CONTROL = "".join(chr(n) for n in range(32))
+_C0_CONTROL_OR_SPACE = _C0_CONTROL + " "
+_ASCII_DIGIT = string.digits
+_ASCII_HEX_DIGIT = string.hexdigits
+_ASCII_ALPHA = string.ascii_letters
+_ASCII_ALPHANUMERIC = string.ascii_letters + string.digits
diff --git a/w3lib/_url.py b/w3lib/_url.py
new file mode 100644
index 00000000..b6e38657
--- /dev/null
+++ b/w3lib/_url.py
@@ -0,0 +1,12 @@
+# https://url.spec.whatwg.org/
+
+# https://url.spec.whatwg.org/commit-snapshots/a46cb9188a48c2c9d80ba32a9b1891652d6b4900/#default-port
+_DEFAULT_PORTS = {
+    "ftp": 21,
+    "file": None,
+    "http": 80,
+    "https": 443,
+    "ws": 80,
+    "wss": 443,
+}
+_SPECIAL_SCHEMES = set(_DEFAULT_PORTS.keys())
diff --git a/w3lib/url.py b/w3lib/url.py
index a41446c4..b5b84ec8 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -35,8 +35,10 @@
 )
 from urllib.parse import _coerce_args  # type: ignore
 from urllib.request import pathname2url, url2pathname
-from w3lib.util import to_unicode
-from w3lib._types import AnyUnicodeError, StrOrBytes
+
+from .util import to_unicode
+from ._types import AnyUnicodeError, StrOrBytes
+from ._url import _SPECIAL_SCHEMES
 
 
 # error handling function for bytes-to-Unicode decoding errors with URLs
@@ -54,16 +56,33 @@ def _quote_byte(error: UnicodeError) -> Tuple[str, int]:
 RFC3986_UNRESERVED = (string.ascii_letters + string.digits + "-._~").encode("ascii")
 EXTRA_SAFE_CHARS = b"|"  # see https://github.com/scrapy/w3lib/pull/25
 
+RFC3986_USERINFO_SAFE_CHARS = RFC3986_UNRESERVED + RFC3986_SUB_DELIMS + b":"
 _safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b"%"
 _path_safe_chars = _safe_chars.replace(b"#", b"")
-RFC3986_USERINFO_SAFE_CHARS = RFC3986_UNRESERVED + RFC3986_SUB_DELIMS + b":"
+
+# Characters that are safe in all of:
+#
+# -   RFC 2396 + RFC 2732, as interpreted by Java 8’s java.net.URI class
+# -   RFC 3986
+# -   The URL living standard
+#
+# NOTE: % is currently excluded from these lists of characters, due to
+# limitations of the current safe_url_string implementation, but it should also
+# be escaped as %25 when it is not already being used as part of an escape
+# character.
+_USERINFO_SAFEST_CHARS = RFC3986_USERINFO_SAFE_CHARS.translate(None, b":;=")
+_PATH_SAFEST_CHARS = _safe_chars.translate(None, b"#[]|")
+_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS
+_SPECIAL_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS.translate(None, b"'")
+_FRAGMENT_SAFEST_CHARS = _PATH_SAFEST_CHARS
+
 
 _ascii_tab_newline_re = re.compile(
     r"[\t\n\r]"
 )  # see https://infra.spec.whatwg.org/#ascii-tab-or-newline
 
 
-def safe_url_string(
+def safe_url_string(  # pylint: disable=too-many-locals
     url: StrOrBytes,
     encoding: str = "utf8",
     path_encoding: str = "utf8",
@@ -106,11 +125,11 @@ def safe_url_string(
     netloc_bytes = b""
     if username is not None or password is not None:
         if username is not None:
-            safe_username = quote(unquote(username), RFC3986_USERINFO_SAFE_CHARS)
+            safe_username = quote(unquote(username), _USERINFO_SAFEST_CHARS)
             netloc_bytes += safe_username.encode(encoding)
         if password is not None:
             netloc_bytes += b":"
-            safe_password = quote(unquote(password), RFC3986_USERINFO_SAFE_CHARS)
+            safe_password = quote(unquote(password), _USERINFO_SAFEST_CHARS)
             netloc_bytes += safe_password.encode(encoding)
         netloc_bytes += b"@"
     if hostname is not None:
@@ -128,17 +147,22 @@ def safe_url_string(
 
     # default encoding for path component SHOULD be UTF-8
     if quote_path:
-        path = quote(parts.path.encode(path_encoding), _path_safe_chars)
+        path = quote(parts.path.encode(path_encoding), _PATH_SAFEST_CHARS)
     else:
         path = parts.path
 
+    if parts.scheme in _SPECIAL_SCHEMES:
+        query = quote(parts.query.encode(encoding), _SPECIAL_QUERY_SAFEST_CHARS)
+    else:
+        query = quote(parts.query.encode(encoding), _QUERY_SAFEST_CHARS)
+
     return urlunsplit(
         (
             parts.scheme,
             netloc,
             path,
-            quote(parts.query.encode(encoding), _safe_chars),
-            quote(parts.fragment.encode(encoding), _safe_chars),
+            query,
+            quote(parts.fragment.encode(encoding), _FRAGMENT_SAFEST_CHARS),
         )
     )
 

From 7d9653661c24ffce2b53caf65b2dc964fa731ac6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= <adrian@chaves.io>
Date: Thu, 24 Nov 2022 09:52:41 +0100
Subject: [PATCH 2/3] bytes.translate: specify the delete parameter name for
 readability

Co-authored-by: Mikhail Korobov <kmike84@gmail.com>
---
 w3lib/url.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/w3lib/url.py b/w3lib/url.py
index b5b84ec8..b22b3b85 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -70,10 +70,10 @@ def _quote_byte(error: UnicodeError) -> Tuple[str, int]:
 # limitations of the current safe_url_string implementation, but it should also
 # be escaped as %25 when it is not already being used as part of an escape
 # character.
-_USERINFO_SAFEST_CHARS = RFC3986_USERINFO_SAFE_CHARS.translate(None, b":;=")
-_PATH_SAFEST_CHARS = _safe_chars.translate(None, b"#[]|")
+_USERINFO_SAFEST_CHARS = RFC3986_USERINFO_SAFE_CHARS.translate(None, delete=b":;=")
+_PATH_SAFEST_CHARS = _safe_chars.translate(None, delete=b"#[]|")
 _QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS
-_SPECIAL_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS.translate(None, b"'")
+_SPECIAL_QUERY_SAFEST_CHARS = _PATH_SAFEST_CHARS.translate(None, delete=b"'")
 _FRAGMENT_SAFEST_CHARS = _PATH_SAFEST_CHARS
 
 

From 0994e08acafb12d7eac069cfc7c57811ce4fca35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= <adrian@chaves.io>
Date: Thu, 24 Nov 2022 11:17:36 +0100
Subject: [PATCH 3/3] .flake8: move unsupported inline comment

---
 .flake8 | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.flake8 b/.flake8
index 268fd3a8..96c8f44d 100644
--- a/.flake8
+++ b/.flake8
@@ -10,4 +10,5 @@ ignore =
     W504,
 
     # black disagrees with flake8, and inserts whitespace
-    E203,  # whitespace before ':'
+    # E203: whitespace before ':'
+    E203,