Skip to content

Commit 5aec2d2

Browse files
miss-islingtonsidneyserhiy-storchaka
authored
[3.11] gh-94606: Fix error when message with Unicode surrogate not surrogateescaped string (GH-94641) (GH-112972)
(cherry picked from commit 27a5fd8) Co-authored-by: Sidney Markowitz <[email protected]> Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent a37e147 commit 5aec2d2

File tree

4 files changed

+49
-16
lines changed

4 files changed

+49
-16
lines changed

Lib/email/message.py

+15-14
Original file line numberDiff line numberDiff line change
@@ -289,25 +289,26 @@ def get_payload(self, i=None, decode=False):
289289
# cte might be a Header, so for now stringify it.
290290
cte = str(self.get('content-transfer-encoding', '')).lower()
291291
# payload may be bytes here.
292-
if isinstance(payload, str):
293-
if utils._has_surrogates(payload):
294-
bpayload = payload.encode('ascii', 'surrogateescape')
295-
if not decode:
292+
if not decode:
293+
if isinstance(payload, str) and utils._has_surrogates(payload):
294+
try:
295+
bpayload = payload.encode('ascii', 'surrogateescape')
296296
try:
297297
payload = bpayload.decode(self.get_param('charset', 'ascii'), 'replace')
298298
except LookupError:
299299
payload = bpayload.decode('ascii', 'replace')
300-
elif decode:
301-
try:
302-
bpayload = payload.encode('ascii')
303-
except UnicodeError:
304-
# This won't happen for RFC compliant messages (messages
305-
# containing only ASCII code points in the unicode input).
306-
# If it does happen, turn the string into bytes in a way
307-
# guaranteed not to fail.
308-
bpayload = payload.encode('raw-unicode-escape')
309-
if not decode:
300+
except UnicodeEncodeError:
301+
pass
310302
return payload
303+
if isinstance(payload, str):
304+
try:
305+
bpayload = payload.encode('ascii', 'surrogateescape')
306+
except UnicodeEncodeError:
307+
# This won't happen for RFC compliant messages (messages
308+
# containing only ASCII code points in the unicode input).
309+
# If it does happen, turn the string into bytes in a way
310+
# guaranteed not to fail.
311+
bpayload = payload.encode('raw-unicode-escape')
311312
if cte == 'quoted-printable':
312313
return quopri.decodestring(bpayload)
313314
elif cte == 'base64':

Lib/email/utils.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,10 @@
4949
escapesre = re.compile(r'[\\"]')
5050

5151
def _has_surrogates(s):
52-
"""Return True if s contains surrogate-escaped binary data."""
52+
"""Return True if s may contain surrogate-escaped binary data."""
5353
# This check is based on the fact that unless there are surrogates, utf8
5454
# (Python's default encoding) can encode any string. This is the fastest
55-
# way to check for surrogates, see issue 11454 for timings.
55+
# way to check for surrogates, see bpo-11454 (moved to gh-55663) for timings.
5656
try:
5757
s.encode()
5858
return False

Lib/test/test_email/test_message.py

+29
Original file line numberDiff line numberDiff line change
@@ -748,6 +748,35 @@ def test_iter_attachments_mutation(self):
748748
self.assertEqual(len(list(m.iter_attachments())), 2)
749749
self.assertEqual(m.get_payload(), orig)
750750

751+
get_payload_surrogate_params = {
752+
753+
'good_surrogateescape': (
754+
"String that can be encod\udcc3\udcabd with surrogateescape",
755+
b'String that can be encod\xc3\xabd with surrogateescape'
756+
),
757+
758+
'string_with_utf8': (
759+
"String with utf-8 charactër",
760+
b'String with utf-8 charact\xebr'
761+
),
762+
763+
'surrogate_and_utf8': (
764+
"String that cannot be ëncod\udcc3\udcabd with surrogateescape",
765+
b'String that cannot be \xebncod\\udcc3\\udcabd with surrogateescape'
766+
),
767+
768+
'out_of_range_surrogate': (
769+
"String with \udfff cannot be encoded with surrogateescape",
770+
b'String with \\udfff cannot be encoded with surrogateescape'
771+
),
772+
}
773+
774+
def get_payload_surrogate_as_gh_94606(self, msg, expected):
775+
"""test for GH issue 94606"""
776+
m = self._str_msg(msg)
777+
payload = m.get_payload(decode=True)
778+
self.assertEqual(expected, payload)
779+
751780

752781
class TestEmailMessage(TestEmailMessageBase, TestEmailBase):
753782
message = EmailMessage
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix UnicodeEncodeError when :meth:`email.message.Message.get_payload` reads a message
2+
with a Unicode surrogate character and the message content is not well-formed for
3+
surrogateescape encoding. Patch by Sidney Markowitz.

0 commit comments

Comments
 (0)