Skip to content

Commit bf00034

Browse files
haberman and copybara-github
authored and committed
Breaking Change: Made text_format output default to UTF-8.
Also hardened the text format printer against invalid UTF-8 in string fields. The output string will always be valid UTF-8, even if string fields contain invalid UTF-8. PiperOrigin-RevId: 600990001
1 parent b9e4894 commit bf00034

File tree

4 files changed

+100
-46
lines changed

4 files changed

+100
-46
lines changed

python/google/protobuf/internal/text_encoding_test.py

+5-5
Original file line number | Diff line number | Diff line change
@@ -22,17 +22,17 @@
2222
"signi\\\\fying\\\\ nothing\\\\",
2323
b"signi\\fying\\ nothing\\"),
2424
("\\010\\t\\n\\013\\014\\r",
25-
"\x08\\t\\n\x0b\x0c\\r",
25+
"\\010\\t\\n\\013\\014\\r",
2626
b"\010\011\012\013\014\015")]
2727

2828

2929
class TextEncodingTestCase(unittest.TestCase):
3030
def testCEscape(self):
3131
for escaped, escaped_utf8, unescaped in TEST_VALUES:
32-
self.assertEqual(escaped,
33-
text_encoding.CEscape(unescaped, as_utf8=False))
34-
self.assertEqual(escaped_utf8,
35-
text_encoding.CEscape(unescaped, as_utf8=True))
32+
self.assertEqual(escaped, text_encoding.CEscape(unescaped, as_utf8=False))
33+
self.assertEqual(
34+
escaped_utf8, text_encoding.CEscape(unescaped, as_utf8=True)
35+
)
3636

3737
def testCUnescape(self):
3838
for escaped, escaped_utf8, unescaped in TEST_VALUES:

python/google/protobuf/internal/text_format_test.py

+43-9
Original file line number | Diff line number | Diff line change
@@ -86,15 +86,18 @@ def testPrintExotic(self, message_module):
8686
message.repeated_string.append('\000\001\a\b\f\n\r\t\v\\\'"')
8787
message.repeated_string.append(u'\u00fc\ua71f')
8888
self.CompareToGoldenText(
89-
self.RemoveRedundantZeros(text_format.MessageToString(message)),
89+
self.RemoveRedundantZeros(
90+
text_format.MessageToString(message, as_utf8=True)
91+
),
9092
'repeated_int64: -9223372036854775808\n'
9193
'repeated_uint64: 18446744073709551615\n'
9294
'repeated_double: 123.456\n'
9395
'repeated_double: 1.23e+22\n'
9496
'repeated_double: 1.23e-18\n'
9597
'repeated_string:'
9698
' "\\000\\001\\007\\010\\014\\n\\r\\t\\013\\\\\\\'\\""\n'
97-
'repeated_string: "\\303\\274\\352\\234\\237"\n')
99+
'repeated_string: "üꜟ"\n',
100+
)
98101

99102
def testPrintFloatPrecision(self, message_module):
100103
message = message_module.TestAllTypes()
@@ -204,8 +207,8 @@ class UnicodeSub(str):
204207
message = message_module.TestAllTypes()
205208
message.repeated_string.append(UnicodeSub(u'\u00fc\ua71f'))
206209
self.CompareToGoldenText(
207-
text_format.MessageToString(message),
208-
'repeated_string: "\\303\\274\\352\\234\\237"\n')
210+
text_format.MessageToString(message, as_utf8=True),
211+
'repeated_string: "üꜟ"\n')
209212

210213
def testPrintNestedMessageAsOneLine(self, message_module):
211214
message = message_module.TestAllTypes()
@@ -282,15 +285,15 @@ def testPrintExoticAsOneLine(self, message_module):
282285
message.repeated_string.append(u'\u00fc\ua71f')
283286
self.CompareToGoldenText(
284287
self.RemoveRedundantZeros(text_format.MessageToString(
285-
message, as_one_line=True)),
288+
message, as_one_line=True, as_utf8=True)),
286289
'repeated_int64: -9223372036854775808'
287290
' repeated_uint64: 18446744073709551615'
288291
' repeated_double: 123.456'
289292
' repeated_double: 1.23e+22'
290293
' repeated_double: 1.23e-18'
291294
' repeated_string: '
292295
'"\\000\\001\\007\\010\\014\\n\\r\\t\\013\\\\\\\'\\""'
293-
' repeated_string: "\\303\\274\\352\\234\\237"')
296+
' repeated_string: "üꜟ"')
294297

295298
def testRoundTripExoticAsOneLine(self, message_module):
296299
message = message_module.TestAllTypes()
@@ -616,8 +619,8 @@ def testMessageToBytes(self, message_module):
616619
def testRawUtf8RoundTrip(self, message_module):
617620
message = message_module.TestAllTypes()
618621
message.repeated_string.append(u'\u00fc\t\ua71f')
619-
utf8_text = text_format.MessageToBytes(message, as_utf8=True)
620-
golden_bytes = b'repeated_string: "\xc3\xbc\\t\xea\x9c\x9f"\n'
622+
utf8_text = text_format.MessageToBytes(message, as_utf8=False)
623+
golden_bytes = b'repeated_string: "\\303\\274\\t\\352\\234\\237"\n'
621624
self.CompareToGoldenText(utf8_text, golden_bytes)
622625
parsed_message = message_module.TestAllTypes()
623626
text_format.Parse(utf8_text, parsed_message)
@@ -626,10 +629,41 @@ def testRawUtf8RoundTrip(self, message_module):
626629
(message, parsed_message, message.repeated_string[0],
627630
parsed_message.repeated_string[0]))
628631

632+
def testRawUtf8RoundTripAsUtf8(self, message_module):
  """Checks that text printed with as_utf8=True parses back to an equal message."""
  original = message_module.TestAllTypes()
  # Mix non-ASCII characters with a tab so the printed text carries both
  # non-ASCII content and a control character.
  original.repeated_string.append(u'\u00fc\t\ua71f')
  rendered = text_format.MessageToString(original, as_utf8=True)

  reparsed = message_module.TestAllTypes()
  text_format.Parse(rendered, reparsed)

  # Build the diagnostic up front so the assertion itself stays readable.
  failure_details = '\n%s != %s (%s != %s)' % (
      original,
      reparsed,
      original.repeated_string[0],
      reparsed.repeated_string[0],
  )
  self.assertEqual(original, reparsed, failure_details)
642+
643+
# We can only test this case under proto2, because proto3 will reject invalid
644+
# UTF-8 in the parser, so there should be no way of creating a string field
645+
# that contains invalid UTF-8.
646+
#
647+
# We also can't test it in pure-Python, which validates all string fields for
648+
# UTF-8 even when the spec says it shouldn't.
649+
@unittest.skipIf(api_implementation.Type() == 'python',
                 'Python can\'t create invalid UTF-8 strings')
def testInvalidUtf8RoundTrip(self, message_module):
  """Checks that an invalid UTF-8 byte in a string field is printed escaped.

  Even with as_utf8=True the printer must emit the undecodable byte 0xFF as
  the octal escape \\377 rather than as a raw byte.
  """
  if message_module is not unittest_pb2:
    # Report the parameterization that cannot be exercised as skipped rather
    # than letting it show up as a pass that verified nothing.
    self.skipTest('Invalid UTF-8 strings can only be created under proto2')
  # Serialize the invalid byte through a bytes field, then parse the result
  # into a string field (presumably OneBytes and OneString are
  # wire-compatible -- confirm against unittest.proto).
  one_bytes = unittest_pb2.OneBytes()
  one_bytes.data = b'ABC\xff123'
  one_string = unittest_pb2.OneString()
  one_string.ParseFromString(one_bytes.SerializeToString())
  self.assertIn(
      'data: "ABC\\377123"',
      text_format.MessageToString(one_string, as_utf8=True),
  )
662+
629663
def testEscapedUtf8ASCIIRoundTrip(self, message_module):
630664
message = message_module.TestAllTypes()
631665
message.repeated_string.append(u'\u00fc\t\ua71f')
632-
ascii_text = text_format.MessageToBytes(message) # as_utf8=False default
666+
ascii_text = text_format.MessageToBytes(message, as_utf8=False)
633667
golden_bytes = b'repeated_string: "\\303\\274\\t\\352\\234\\237"\n'
634668
self.CompareToGoldenText(ascii_text, golden_bytes)
635669
parsed_message = message_module.TestAllTypes()

python/google/protobuf/text_encoding.py

+44-26
Original file line number | Diff line number | Diff line change
@@ -8,26 +8,42 @@
88
"""Encoding related utilities."""
99
import re
1010

11-
_cescape_chr_to_symbol_map = {}
12-
_cescape_chr_to_symbol_map[9] = r'\t' # optional escape
13-
_cescape_chr_to_symbol_map[10] = r'\n' # optional escape
14-
_cescape_chr_to_symbol_map[13] = r'\r' # optional escape
15-
_cescape_chr_to_symbol_map[34] = r'\"' # necessary escape
16-
_cescape_chr_to_symbol_map[39] = r"\'" # optional escape
17-
_cescape_chr_to_symbol_map[92] = r'\\' # necessary escape
18-
19-
# Lookup table for unicode
20-
_cescape_unicode_to_str = [chr(i) for i in range(0, 256)]
21-
for byte, string in _cescape_chr_to_symbol_map.items():
22-
_cescape_unicode_to_str[byte] = string
23-
24-
# Lookup table for non-utf8, with necessary escapes at (o >= 127 or o < 32)
25-
_cescape_byte_to_str = ([r'\%03o' % i for i in range(0, 32)] +
26-
[chr(i) for i in range(32, 127)] +
27-
[r'\%03o' % i for i in range(127, 256)])
28-
for byte, string in _cescape_chr_to_symbol_map.items():
29-
_cescape_byte_to_str[byte] = string
30-
del byte, string
11+
def _AsciiIsPrint(i):
  """Returns True if `i` is the code of a printable ASCII character.

  Printable here means the half-open range [32, 127): space through '~'.
  Control characters (0-31) and DEL (127) are excluded.

  Args:
    i: Integer character code.

  Returns:
    bool, whether `i` lies in the printable ASCII range.
  """
  return 32 <= i < 127
13+
14+
def _MakeStrEscapes():
  """Builds the translation table used to escape string field contents.

  Returns:
    A dict mapping integer character codes to their escaped spelling. It
    covers every non-printable ASCII code (three-digit octal by default)
    plus the quote and backslash characters. Codes that are absent from the
    table are meant to be emitted unchanged.
  """
  # Default: every non-printable ASCII code gets a three-digit octal escape.
  escapes = {
      code: r'\%03o' % code
      for code in range(128)
      if not _AsciiIsPrint(code)
  }
  # Short, conventional spellings take precedence over the octal form.
  escapes.update({
      ord('\t'): r'\t',  # optional escape
      ord('\n'): r'\n',  # optional escape
      ord('\r'): r'\r',  # optional escape
      ord('"'): r'\"',  # necessary escape
      ord('\''): r"\'",  # optional escape
      ord('\\'): r'\\',  # necessary escape
  })
  return escapes
26+
27+
# Maps character code -> escaped spelling for text that is emitted as UTF-8.
# Only codes that need escaping are present.
_str_escapes = _MakeStrEscapes()

# Maps byte value (0-255) -> output string for ASCII-only output: bytes below
# 128 use their string escape when one exists and are otherwise emitted as-is,
# while every byte from 128 up becomes a three-digit octal escape.
_byte_escapes = {
    code: (
        r'\%03o' % code if code >= 128 else _str_escapes.get(code, chr(code))
    )
    for code in range(256)
}
34+
35+
36+
def _DecodeUtf8EscapeErrors(text_bytes):
  """Decodes bytes as UTF-8, escaping every byte that cannot be decoded.

  Runs of valid UTF-8 are decoded and passed through the `_str_escapes`
  translation table. Each byte at which decoding fails is replaced by its
  `_byte_escapes` entry, and decoding resumes with the following byte.

  Args:
    text_bytes: The bytes to render.

  Returns:
    A str holding the decoded, escaped text.
  """
  pieces = []  # Joined once at the end instead of repeated str concatenation.
  pos = 0
  size = len(text_bytes)
  while pos < size:
    try:
      pieces.append(text_bytes[pos:].decode('utf-8').translate(_str_escapes))
      break  # The whole remainder was valid.
    except UnicodeDecodeError as e:
      # e.start is relative to the slice handed to decode(); rebase it onto
      # the full input.
      bad = pos + e.start
      # Everything before the offending byte is known to be valid UTF-8.
      valid_prefix = text_bytes[pos:bad].decode('utf-8')
      pieces.append(valid_prefix.translate(_str_escapes))
      pieces.append(_byte_escapes[text_bytes[bad]])
      pos = bad + 1
  return ''.join(pieces)
3147

3248

3349
def CEscape(text, as_utf8) -> str:
@@ -47,13 +63,15 @@ def CEscape(text, as_utf8) -> str:
4763
# length. So, "\0011".encode('string_escape') ends up being "\\x011", which
4864
# will be decoded in C++ as a single-character string with char code 0x11.
4965
text_is_unicode = isinstance(text, str)
50-
if as_utf8 and text_is_unicode:
51-
# We're already unicode, no processing beyond control char escapes.
52-
return text.translate(_cescape_chr_to_symbol_map)
53-
ord_ = ord if text_is_unicode else lambda x: x # bytes iterate as ints.
5466
if as_utf8:
55-
return ''.join(_cescape_unicode_to_str[ord_(c)] for c in text)
56-
return ''.join(_cescape_byte_to_str[ord_(c)] for c in text)
67+
if text_is_unicode:
68+
return text.translate(_str_escapes)
69+
else:
70+
return _DecodeUtf8EscapeErrors(text)
71+
else:
72+
if text_is_unicode:
73+
text = text.encode('utf-8')
74+
return ''.join([_byte_escapes[c] for c in text])
5775

5876

5977
_CUNESCAPE_HEX = re.compile(r'(\\+)x([0-9a-fA-F])(?![0-9a-fA-F])')

python/google/protobuf/text_format.py

+8-6
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,8 @@
4646
_ANY_FULL_TYPE_NAME = 'google.protobuf.Any'
4747
_DEBUG_STRING_SILENT_MARKER = '\t '
4848

49+
_as_utf8_default = True
50+
4951

5052
class Error(Exception):
5153
"""Top-level module error for text_format."""
@@ -91,7 +93,7 @@ def getvalue(self):
9193

9294
def MessageToString(
9395
message,
94-
as_utf8=False,
96+
as_utf8=_as_utf8_default,
9597
as_one_line=False,
9698
use_short_repeated_primitives=False,
9799
pointy_brackets=False,
@@ -186,7 +188,7 @@ def _IsMapEntry(field):
186188
def PrintMessage(message,
187189
out,
188190
indent=0,
189-
as_utf8=False,
191+
as_utf8=_as_utf8_default,
190192
as_one_line=False,
191193
use_short_repeated_primitives=False,
192194
pointy_brackets=False,
@@ -229,7 +231,7 @@ def PrintMessage(message,
229231
the field is a proto message.
230232
"""
231233
printer = _Printer(
232-
out=out, indent=indent, as_utf8=as_utf8,
234+
out=out, indent=indent, as_utf8=as_utf8,
233235
as_one_line=as_one_line,
234236
use_short_repeated_primitives=use_short_repeated_primitives,
235237
pointy_brackets=pointy_brackets,
@@ -248,7 +250,7 @@ def PrintField(field,
248250
value,
249251
out,
250252
indent=0,
251-
as_utf8=False,
253+
as_utf8=_as_utf8_default,
252254
as_one_line=False,
253255
use_short_repeated_primitives=False,
254256
pointy_brackets=False,
@@ -272,7 +274,7 @@ def PrintFieldValue(field,
272274
value,
273275
out,
274276
indent=0,
275-
as_utf8=False,
277+
as_utf8=_as_utf8_default,
276278
as_one_line=False,
277279
use_short_repeated_primitives=False,
278280
pointy_brackets=False,
@@ -328,7 +330,7 @@ def __init__(
328330
self,
329331
out,
330332
indent=0,
331-
as_utf8=False,
333+
as_utf8=_as_utf8_default,
332334
as_one_line=False,
333335
use_short_repeated_primitives=False,
334336
pointy_brackets=False,

0 commit comments

Comments
 (0)