Skip to content

bpo-41115: Modified src to raise Unicode{Encode, Decode}Error rather than plain UnicodeError #21170

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 19 additions & 11 deletions Lib/encodings/idna.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,15 @@ def ToASCII(label):
try:
# Step 1: try ASCII
label = label.encode("ascii")
except UnicodeError:
except UnicodeEncodeError:
pass
else:
# Skip to step 3: UseSTD3ASCIIRules is false, so
# Skip to step 8.
if 0 < len(label) < 64:
return label
raise UnicodeError("label empty or too long")
raise UnicodeEncodeError("ascii", label.decode("ascii"), 0, len(label.decode("ascii")),
"label empty or too long")

# Step 2: nameprep
label = nameprep(label)
Expand All @@ -79,17 +80,18 @@ def ToASCII(label):
# Step 4: try ASCII
try:
label = label.encode("ascii")
except UnicodeError:
except UnicodeEncodeError:
pass
else:
# Skip to step 8.
if 0 < len(label) < 64:
return label
raise UnicodeError("label empty or too long")
raise UnicodeEncodeError("ascii", label.decode("ascii"), 0, len(label.decode("ascii")),
"label empty or too long")

# Step 5: Check ACE prefix
if label.startswith(sace_prefix):
raise UnicodeError("Label starts with ACE prefix")
raise UnicodeEncodeError("ascii", str(label), 0, len(label), "label starts with ACE prefix")

# Step 6: Encode with PUNYCODE
label = label.encode("punycode")
Expand All @@ -100,7 +102,9 @@ def ToASCII(label):
# Step 8: Check size
if 0 < len(label) < 64:
return label
raise UnicodeError("label empty or too long")
raise UnicodeEncodeError("punycode", label.decode("punycode"), 0,
len(label.decode("punycode")), "label empty or too long")


def ToUnicode(label):
# Step 1: Check for ASCII
Expand All @@ -110,16 +114,18 @@ def ToUnicode(label):
try:
label = label.encode("ascii")
pure_ascii = True
except UnicodeError:
except UnicodeEncodeError:
pure_ascii = False
if not pure_ascii:
# Step 2: Perform nameprep
label = nameprep(label)
# It doesn't say this, but apparently, it should be ASCII now
try:
label = label.encode("ascii")
except UnicodeError:
raise UnicodeError("Invalid character in IDN label")
except UnicodeEncodeError:
raise UnicodeEncodeError("ascii", label.decode("ascii"), 0, len(label.decode("ascii")),
"Invalid character in IDN label")

# Step 3: Check for ACE prefix
if not label.startswith(ace_prefix):
return str(label, "ascii")
Expand Down Expand Up @@ -162,9 +168,11 @@ def encode(self, input, errors='strict'):
labels = result.split(b'.')
for label in labels[:-1]:
if not (0 < len(label) < 64):
raise UnicodeError("label empty or too long")
raise UnicodeEncodeError("ascii", label.decode("ascii"), 0, len(label.decode("ascii")),
"label empty or too long")
if len(labels[-1]) >= 64:
raise UnicodeError("label too long")
raise UnicodeEncodeError("ascii", labels[-1].decode("ascii"), 0, len(labels[-1].decode("ascii")),
"label too long")
return result, len(input)

result = bytearray()
Expand Down
9 changes: 5 additions & 4 deletions Lib/encodings/punycode.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,16 +134,17 @@ def decode_generalized_number(extended, extpos, bias, errors):
char = ord(extended[extpos])
except IndexError:
if errors == "strict":
raise UnicodeError("incomplete punicode string")
raise UnicodeDecodeError("punycode", bytes(extended[extpos], "utf-8"), extpos, extpos+1,
"incomplete punycode string")
return extpos + 1, None
extpos += 1
if 0x41 <= char <= 0x5A: # A-Z
digit = char - 0x41
elif 0x30 <= char <= 0x39:
digit = char - 22 # 0x30-26
elif errors == "strict":
raise UnicodeError("Invalid extended code point '%s'"
% extended[extpos-1])
raise UnicodeDecodeError("punycode", bytes(extended[extpos-1], "utf-8"), extpos-1, extpos,
"Invalid extended code point '%s'" % extended[extpos-1])
else:
return extpos, None
t = T(j, bias)
Expand Down Expand Up @@ -171,7 +172,7 @@ def insertion_sort(base, extended, errors):
char += pos // (len(base) + 1)
if char > 0x10FFFF:
if errors == "strict":
raise UnicodeError("Invalid character U+%x" % char)
raise UnicodeDecodeError("punycode", bytes(char, "utf-8"), 0, len(char), "Invalid character U+%x" % char)
char = ord('?')
pos = pos % (len(base) + 1)
base = base[:pos] + chr(char) + base[pos:]
Expand Down
10 changes: 5 additions & 5 deletions Lib/encodings/undefined.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
""" Python 'undefined' Codec

This codec will always raise a ValueError exception when being
This codec will always raise a UnicodeEncodeError | UnicodeDecodeError exception when being
used. It is intended for use by the site.py file to switch off
automatic string to Unicode coercion.

Expand All @@ -16,18 +16,18 @@
class Codec(codecs.Codec):

def encode(self,input,errors='strict'):
raise UnicodeError("undefined encoding")
raise UnicodeEncodeError("undefined", str(input), 0, len(input), "undefined encoding")

def decode(self,input,errors='strict'):
raise UnicodeError("undefined encoding")
raise UnicodeDecodeError("undefined", bytes(input), 0, len(input), "undefined decoding")

class IncrementalEncoder(codecs.IncrementalEncoder):
def encode(self, input, final=False):
raise UnicodeError("undefined encoding")
raise UnicodeEncodeError("undefined", str(input), 0, len(input), "undefined encoding")

class IncrementalDecoder(codecs.IncrementalDecoder):
def decode(self, input, final=False):
raise UnicodeError("undefined encoding")
raise UnicodeDecodeError("undefined", bytes(input), 0, len(input), "undefined decoding")

class StreamWriter(Codec,codecs.StreamWriter):
pass
Expand Down
4 changes: 2 additions & 2 deletions Lib/encodings/utf_16.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def _buffer_decode(self, input, errors, final):
elif byteorder == 1:
self.decoder = codecs.utf_16_be_decode
elif consumed >= 2:
raise UnicodeError("UTF-16 stream does not start with BOM")
raise UnicodeDecodeError("utc-16", input, 0, 0, "UTF-16 stream does not start with BOM")
return (output, consumed)
return self.decoder(input, self.errors, final)

Expand Down Expand Up @@ -138,7 +138,7 @@ def decode(self, input, errors='strict'):
elif byteorder == 1:
self.decode = codecs.utf_16_be_decode
elif consumed>=2:
raise UnicodeError("UTF-16 stream does not start with BOM")
raise UnicodeDecodeError("utf-16", input, 0, 0, "UTF-16 stream does not start with BOM")
return (object, consumed)

### encodings module API
Expand Down
4 changes: 2 additions & 2 deletions Lib/encodings/utf_32.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def _buffer_decode(self, input, errors, final):
elif byteorder == 1:
self.decoder = codecs.utf_32_be_decode
elif consumed >= 4:
raise UnicodeError("UTF-32 stream does not start with BOM")
raise UnicodeDecodeError("utf-32", input, 0, 0, "UTF-32 stream does not start with BOM")
return (output, consumed)
return self.decoder(input, self.errors, final)

Expand Down Expand Up @@ -133,7 +133,7 @@ def decode(self, input, errors='strict'):
elif byteorder == 1:
self.decode = codecs.utf_32_be_decode
elif consumed>=4:
raise UnicodeError("UTF-32 stream does not start with BOM")
raise UnicodeDecodeError("utf-32", input, 0, 0, "UTF-32 stream does not start with BOM")
return (object, consumed)

### encodings module API
Expand Down
4 changes: 2 additions & 2 deletions Lib/test/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1334,13 +1334,13 @@ def test_decode(self):

def test_decode_invalid(self):
testcases = [
(b"xn--w&", "strict", UnicodeError()),
(b"xn--w&", "strict", UnicodeDecodeError("punycode", b"xn--w&", 0, 0, "")),
(b"xn--w&", "ignore", "xn-"),
]
for puny, errors, expected in testcases:
with self.subTest(puny=puny, errors=errors):
if isinstance(expected, Exception):
self.assertRaises(UnicodeError, puny.decode, "punycode", errors)
self.assertRaises(UnicodeDecodeError, puny.decode, "punycode", errors)
else:
self.assertEqual(puny.decode("punycode", errors), expected)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Modified source to raise Unicode{Decode, Encode}Error rather than bare
UnicodeError. Patch by Utkarsh Pandey.