Skip to content

Commit 3d0a5f7

Browse files
bpo-43323: Fix UnicodeEncodeError in the email module (GH-32137)
The error was raised when the charset name itself contained characters that cannot be encoded in UTF-8 (in particular, \udcxx surrogate characters representing non-decodable bytes in the source).
(cherry picked from commit e91dee8)
Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent 869a894 commit 3d0a5f7

File tree

6 files changed

+36
-6
lines changed

6 files changed

+36
-6
lines changed

Lib/email/_encoded_words.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -179,15 +179,15 @@ def decode(ew):
179179
# Turn the CTE decoded bytes into unicode.
180180
try:
181181
string = bstring.decode(charset)
182-
except UnicodeError:
182+
except UnicodeDecodeError:
183183
defects.append(errors.UndecodableBytesDefect("Encoded word "
184-
"contains bytes not decodable using {} charset".format(charset)))
184+
f"contains bytes not decodable using {charset!r} charset"))
185185
string = bstring.decode(charset, 'surrogateescape')
186-
except LookupError:
186+
except (LookupError, UnicodeEncodeError):
187187
string = bstring.decode('ascii', 'surrogateescape')
188188
if charset.lower() != 'unknown-8bit':
189-
defects.append(errors.CharsetError("Unknown charset {} "
190-
"in encoded word; decoded as unknown bytes".format(charset)))
189+
defects.append(errors.CharsetError(f"Unknown charset {charset!r} "
190+
f"in encoded word; decoded as unknown bytes"))
191191
return string, charset, lang, defects
192192

193193

Lib/email/_header_value_parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -781,7 +781,7 @@ def params(self):
781781
else:
782782
try:
783783
value = value.decode(charset, 'surrogateescape')
784-
except LookupError:
784+
except (LookupError, UnicodeEncodeError):
785785
# XXX: there should really be a custom defect for
786786
# unknown character set to make it easy to find,
787787
# because otherwise unknown charset is a silent

Lib/test/test_email/test__encoded_words.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,13 @@ def test_unknown_charset(self):
130130
# XXX Should this be a new Defect instead?
131131
defects = [errors.CharsetError])
132132

133+
def test_invalid_character_in_charset(self):
134+
self._test('=?utf-8\udce2\udc80\udc9d?q?foo=ACbar?=',
135+
b'foo\xacbar'.decode('ascii', 'surrogateescape'),
136+
charset = 'utf-8\udce2\udc80\udc9d',
137+
# XXX Should this be a new Defect instead?
138+
defects = [errors.CharsetError])
139+
133140
def test_q_nonascii(self):
134141
self._test('=?utf-8?q?=C3=89ric?=',
135142
'Éric',

Lib/test/test_email/test_email.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5323,6 +5323,15 @@ def test_rfc2231_unknown_encoding(self):
53235323
Content-Transfer-Encoding: 8bit
53245324
Content-Disposition: inline; filename*=X-UNKNOWN''myfile.txt
53255325
5326+
"""
5327+
msg = email.message_from_string(m)
5328+
self.assertEqual(msg.get_filename(), 'myfile.txt')
5329+
5330+
def test_rfc2231_bad_character_in_encoding(self):
5331+
m = """\
5332+
Content-Transfer-Encoding: 8bit
5333+
Content-Disposition: inline; filename*=utf-8\udce2\udc80\udc9d''myfile.txt
5334+
53265335
"""
53275336
msg = email.message_from_string(m)
53285337
self.assertEqual(msg.get_filename(), 'myfile.txt')

Lib/test/test_email/test_headerregistry.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -698,6 +698,18 @@ def content_type_as_value(self,
698698
" charset*=unknown-8bit''utf-8%E2%80%9D\n",
699699
),
700700

701+
'rfc2231_nonascii_in_charset_of_charset_parameter_value': (
702+
"text/plain; charset*=utf-8”''utf-8%E2%80%9D",
703+
'text/plain',
704+
'text',
705+
'plain',
706+
{'charset': 'utf-8”'},
707+
[],
708+
'text/plain; charset="utf-8”"',
709+
"Content-Type: text/plain;"
710+
" charset*=utf-8''utf-8%E2%80%9D\n",
711+
),
712+
701713
'rfc2231_encoded_then_unencoded_segments': (
702714
('application/x-foo;'
703715
'\tname*0*="us-ascii\'en-us\'My";'
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix errors in the :mod:`email` module if the charset itself contains
2+
undecodable/unencodable characters.

0 commit comments

Comments
 (0)