Skip to content

Commit 435e891

Browse files
[3.11] gh-113594: Fix UnicodeEncodeError in TokenList.fold() (GH-113730) (GH-113908)
It occurred when trying to re-encode an unknown-8bit part combined with a non-unknown-8bit part. (cherry picked from commit e9d5b6e) Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent c92a473 commit 435e891

File tree

3 files changed

+48
-0
lines changed

3 files changed

+48
-0
lines changed

Lib/email/_header_value_parser.py

+7
Original file line numberDiff line numberDiff line change
@@ -2768,6 +2768,7 @@ def _refold_parse_tree(parse_tree, *, policy):
27682768
encoding = 'utf-8' if policy.utf8 else 'us-ascii'
27692769
lines = ['']
27702770
last_ew = None
2771+
last_charset = None
27712772
wrap_as_ew_blocked = 0
27722773
want_encoding = False
27732774
end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
@@ -2822,8 +2823,14 @@ def _refold_parse_tree(parse_tree, *, policy):
28222823
else:
28232824
# It's a terminal, wrap it as an encoded word, possibly
28242825
# combining it with previously encoded words if allowed.
2826+
if (last_ew is not None and
2827+
charset != last_charset and
2828+
(last_charset == 'unknown-8bit' or
2829+
last_charset == 'utf-8' and charset != 'us-ascii')):
2830+
last_ew = None
28252831
last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
28262832
part.ew_combine_allowed, charset)
2833+
last_charset = charset
28272834
want_encoding = False
28282835
continue
28292836
if len(tstr) <= maxlen - len(lines[-1]):

Lib/test/test_email/test__header_value_parser.py

+39
Original file line numberDiff line numberDiff line change
@@ -2915,6 +2915,45 @@ def test_ews_combined_before_wrap(self):
29152915
"mich. And that's\n"
29162916
" all I'm sayin.\n")
29172917

2918+
def test_unicode_after_unknown_not_combined(self):
2919+
self._test(parser.get_unstructured("=?unknown-8bit?q?=A4?=\xa4"),
2920+
"=?unknown-8bit?q?=A4?==?utf-8?q?=C2=A4?=\n")
2921+
prefix = "0123456789 "*5
2922+
self._test(parser.get_unstructured(prefix + "=?unknown-8bit?q?=A4?=\xa4"),
2923+
prefix + "=?unknown-8bit?q?=A4?=\n =?utf-8?q?=C2=A4?=\n")
2924+
2925+
def test_ascii_after_unknown_not_combined(self):
2926+
self._test(parser.get_unstructured("=?unknown-8bit?q?=A4?=abc"),
2927+
"=?unknown-8bit?q?=A4?=abc\n")
2928+
prefix = "0123456789 "*5
2929+
self._test(parser.get_unstructured(prefix + "=?unknown-8bit?q?=A4?=abc"),
2930+
prefix + "=?unknown-8bit?q?=A4?=\n =?utf-8?q?abc?=\n")
2931+
2932+
def test_unknown_after_unicode_not_combined(self):
2933+
self._test(parser.get_unstructured("\xa4"
2934+
"=?unknown-8bit?q?=A4?="),
2935+
"=?utf-8?q?=C2=A4?==?unknown-8bit?q?=A4?=\n")
2936+
prefix = "0123456789 "*5
2937+
self._test(parser.get_unstructured(prefix + "\xa4=?unknown-8bit?q?=A4?="),
2938+
prefix + "=?utf-8?q?=C2=A4?=\n =?unknown-8bit?q?=A4?=\n")
2939+
2940+
def test_unknown_after_ascii_not_combined(self):
2941+
self._test(parser.get_unstructured("abc"
2942+
"=?unknown-8bit?q?=A4?="),
2943+
"abc=?unknown-8bit?q?=A4?=\n")
2944+
prefix = "0123456789 "*5
2945+
self._test(parser.get_unstructured(prefix + "abcd=?unknown-8bit?q?=A4?="),
2946+
prefix + "abcd\n =?unknown-8bit?q?=A4?=\n")
2947+
2948+
def test_unknown_after_unknown(self):
2949+
self._test(parser.get_unstructured("=?unknown-8bit?q?=C2?="
2950+
"=?unknown-8bit?q?=A4?="),
2951+
"=?unknown-8bit?q?=C2=A4?=\n")
2952+
prefix = "0123456789 "*5
2953+
self._test(parser.get_unstructured(prefix + "=?unknown-8bit?q?=C2?="
2954+
"=?unknown-8bit?q?=A4?="),
2955+
prefix + "=?unknown-8bit?q?=C2?=\n =?unknown-8bit?q?=A4?=\n")
2956+
29182957
# XXX Need test of an encoded word so long that it needs to be wrapped
29192958

29202959
def test_simple_address(self):
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix :exc:`UnicodeEncodeError` in :mod:`email` when re-folding lines that
2+
contain an unknown-8bit encoded part followed by a non-unknown-8bit encoded part.

0 commit comments

Comments
 (0)