Skip to content

Commit f2a91b3

Browse files
habermancopybara-github
authored andcommitted
Make str(msg) in Python print raw UTF-8 strings. Only invalid UTF-8 is escaped.
PiperOrigin-RevId: 597917280
1 parent 3a007b5 commit f2a91b3

File tree

6 files changed

+173
-46
lines changed

6 files changed

+173
-46
lines changed

python/google/protobuf/internal/message_test.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2606,6 +2606,30 @@ def testTypeNamesCanBeImported(self):
26062606
self.assertImportFromName(pb.repeated_nested_message, 'Composite')
26072607

26082608

2609+
# We can only test this case under proto2, because proto3 will reject invalid
2610+
# UTF-8 in the parser, so there should be no way of creating a string field
2611+
# that contains invalid UTF-8.
2612+
#
2613+
# We also can't test it in pure-Python, which validates all string fields for
2614+
# UTF-8 even when the spec says it shouldn't.
2615+
@unittest.skipIf(api_implementation.Type() == 'python',
2616+
'Python can\'t create invalid UTF-8 strings')
2617+
@testing_refleaks.TestCase
2618+
class InvalidUtf8Test(unittest.TestCase):
2619+
2620+
def testInvalidUtf8Printing(self):
2621+
one_bytes = unittest_pb2.OneBytes()
2622+
one_bytes.data = b'ABC\xff123'
2623+
one_string = unittest_pb2.OneString()
2624+
one_string.ParseFromString(one_bytes.SerializeToString())
2625+
self.assertIn('data: "ABC\\377123"', str(one_string))
2626+
2627+
def testValidUtf8Printing(self):
2628+
self.assertIn('data: "€"', str(unittest_pb2.OneString(data='€'))) # 2 byte
2629+
self.assertIn('data: "£"', str(unittest_pb2.OneString(data='£'))) # 3 byte
2630+
self.assertIn('data: "🙂"', str(unittest_pb2.OneString(data='🙂'))) # 4 byte
2631+
2632+
26092633
@testing_refleaks.TestCase
26102634
class PackedFieldTest(unittest.TestCase):
26112635

python/google/protobuf/pyext/message.cc

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,16 @@
3030
#endif
3131
#include "google/protobuf/stubs/common.h"
3232
#include "google/protobuf/descriptor.pb.h"
33+
#include "absl/strings/escaping.h"
34+
#include "absl/strings/string_view.h"
3335
#include "google/protobuf/descriptor.h"
36+
#include "google/protobuf/io/coded_stream.h"
37+
#include "google/protobuf/io/strtod.h"
38+
#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
3439
#include "google/protobuf/message.h"
3540
#include "google/protobuf/text_format.h"
3641
#include "google/protobuf/unknown_field_set.h"
42+
#include "google/protobuf/util/message_differencer.h"
3743
#include "google/protobuf/pyext/descriptor.h"
3844
#include "google/protobuf/pyext/descriptor_pool.h"
3945
#include "google/protobuf/pyext/extension_dict.h"
@@ -46,11 +52,6 @@
4652
#include "google/protobuf/pyext/scoped_pyobject_ptr.h"
4753
#include "google/protobuf/pyext/unknown_field_set.h"
4854
#include "google/protobuf/pyext/unknown_fields.h"
49-
#include "google/protobuf/util/message_differencer.h"
50-
#include "absl/strings/string_view.h"
51-
#include "google/protobuf/io/coded_stream.h"
52-
#include "google/protobuf/io/strtod.h"
53-
#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
5455

5556
// clang-format off
5657
#include "google/protobuf/port_def.inc"
@@ -1683,6 +1684,18 @@ class PythonFieldValuePrinter : public TextFormat::FastFieldValuePrinter {
16831684

16841685
generator->PrintString(PyString_AsString(py_str.get()));
16851686
}
1687+
void PrintString(const std::string& val,
1688+
TextFormat::BaseTextGenerator* generator) const override {
1689+
TextFormat::Printer::HardenedPrintString(val, generator);
1690+
}
1691+
void PrintBytes(const std::string& val,
1692+
TextFormat::BaseTextGenerator* generator) const override {
1693+
generator->PrintLiteral("\"");
1694+
if (!val.empty()) {
1695+
generator->PrintString(absl::CEscape(val));
1696+
}
1697+
generator->PrintLiteral("\"");
1698+
}
16861699
};
16871700

16881701
static PyObject* ToStr(CMessage* self) {

src/google/protobuf/text_format.cc

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1706,14 +1706,17 @@ size_t SkipPassthroughBytes(absl::string_view val) {
17061706
return val.size();
17071707
}
17081708

1709-
void HardenedPrintString(absl::string_view src,
1710-
TextFormat::BaseTextGenerator* generator) {
1709+
} // namespace
1710+
1711+
void TextFormat::Printer::HardenedPrintString(
1712+
absl::string_view src, TextFormat::BaseTextGenerator* generator) {
17111713
// Print as UTF-8, while guarding against any invalid UTF-8 in the string
17121714
// field.
17131715
//
17141716
// If in the future we have a guaranteed invariant that invalid UTF-8 will
17151717
// never be present, we could avoid the UTF-8 check here.
17161718

1719+
generator->PrintLiteral("\"");
17171720
while (!src.empty()) {
17181721
size_t n = SkipPassthroughBytes(src);
17191722
if (n != 0) {
@@ -1727,20 +1730,17 @@ void HardenedPrintString(absl::string_view src,
17271730
generator->PrintString(absl::CEscape(src.substr(0, 1)));
17281731
src.remove_prefix(1);
17291732
}
1733+
generator->PrintLiteral("\"");
17301734
}
17311735

1732-
} // namespace
1733-
17341736
// ===========================================================================
17351737
// An internal field value printer that escape UTF8 strings.
17361738
class TextFormat::Printer::FastFieldValuePrinterUtf8Escaping
17371739
: public TextFormat::Printer::DebugStringFieldValuePrinter {
17381740
public:
17391741
void PrintString(const std::string& val,
17401742
TextFormat::BaseTextGenerator* generator) const override {
1741-
generator->PrintLiteral("\"");
1742-
HardenedPrintString(val, generator);
1743-
generator->PrintLiteral("\"");
1743+
TextFormat::Printer::HardenedPrintString(val, generator);
17441744
}
17451745
void PrintBytes(const std::string& val,
17461746
TextFormat::BaseTextGenerator* generator) const override {

src/google/protobuf/text_format.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,12 @@ namespace io {
7878
class ErrorCollector; // tokenizer.h
7979
}
8080

81+
namespace python {
82+
namespace cmessage {
83+
class PythonFieldValuePrinter;
84+
}
85+
} // namespace python
86+
8187
namespace internal {
8288
// Enum used to set printing options for StringifyMessage.
8389
PROTOBUF_EXPORT enum class Option;
@@ -526,6 +532,10 @@ class PROTOBUF_EXPORT TextFormat {
526532
: it->second.get();
527533
}
528534

535+
friend class google::protobuf::python::cmessage::PythonFieldValuePrinter;
536+
static void HardenedPrintString(absl::string_view src,
537+
TextFormat::BaseTextGenerator* generator);
538+
529539
int initial_indent_level_;
530540
bool single_line_mode_;
531541
bool use_field_number_;

upb/text/BUILD

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,12 @@ cc_library(
1818
copts = UPB_DEFAULT_COPTS,
1919
visibility = ["//visibility:public"],
2020
deps = [
21+
"//third_party/utf8_range",
2122
"//upb:base",
2223
"//upb:eps_copy_input_stream",
2324
"//upb:message",
2425
"//upb:port",
2526
"//upb:reflection",
26-
"//upb:wire",
2727
"//upb:wire_reader",
2828
"//upb/lex",
2929
"//upb/message:internal",

upb/text/encode.c

Lines changed: 113 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "upb/wire/eps_copy_input_stream.h"
3030
#include "upb/wire/reader.h"
3131
#include "upb/wire/types.h"
32+
#include "utf8_range.h"
3233

3334
// Must be last.
3435
#include "upb/port/def.inc"
@@ -108,42 +109,121 @@ static void txtenc_enum(int32_t val, const upb_FieldDef* f, txtenc* e) {
108109
}
109110
}
110111

111-
static void txtenc_string(txtenc* e, upb_StringView str, bool bytes) {
112-
const char* ptr = str.data;
113-
const char* end = ptr + str.size;
114-
txtenc_putstr(e, "\"");
112+
static void txtenc_escaped(txtenc* e, unsigned char ch) {
113+
switch (ch) {
114+
case '\n':
115+
txtenc_putstr(e, "\\n");
116+
break;
117+
case '\r':
118+
txtenc_putstr(e, "\\r");
119+
break;
120+
case '\t':
121+
txtenc_putstr(e, "\\t");
122+
break;
123+
case '\"':
124+
txtenc_putstr(e, "\\\"");
125+
break;
126+
case '\'':
127+
txtenc_putstr(e, "\\'");
128+
break;
129+
case '\\':
130+
txtenc_putstr(e, "\\\\");
131+
break;
132+
default:
133+
txtenc_printf(e, "\\%03o", ch);
134+
break;
135+
}
136+
}
137+
138+
// Returns true if `ch` needs to be escaped in TextFormat, independent of any
139+
// UTF-8 validity issues.
140+
static bool upb_DefinitelyNeedsEscape(unsigned char ch) {
141+
if (ch < 32) return true;
142+
switch (ch) {
143+
case '\"':
144+
case '\'':
145+
case '\\':
146+
case 127:
147+
return true;
148+
}
149+
return false;
150+
}
151+
152+
static bool upb_AsciiIsPrint(unsigned char ch) { return ch >= 32 && ch < 127; }
153+
154+
// Returns true if this is a high byte that requires UTF-8 validation. If the
155+
// UTF-8 validation fails, we must escape the byte.
156+
static bool upb_NeedsUtf8Validation(unsigned char ch) { return ch > 127; }
157+
158+
// Returns the number of bytes in the prefix of `val` that do not need escaping.
159+
// This is like utf8_range::SpanStructurallyValid(), except that it also
160+
// terminates at any ASCII char that needs to be escaped in TextFormat (any char
161+
// that has `DefinitelyNeedsEscape(ch) == true`).
162+
//
163+
// If we could get a variant of utf8_range::SpanStructurallyValid() that could
164+
// terminate on any of these chars, that might be more efficient, but it would
165+
// be much more complicated to modify that heavily SIMD code.
166+
static size_t SkipPassthroughBytes(const char* ptr, size_t size) {
167+
for (size_t i = 0; i < size; i++) {
168+
unsigned char uc = ptr[i];
169+
if (upb_DefinitelyNeedsEscape(uc)) return i;
170+
if (upb_NeedsUtf8Validation(uc)) {
171+
// Find the end of this region of consecutive high bytes, so that we only
172+
// give high bytes to the UTF-8 checker. This avoids needing to perform
173+
// a second scan of the ASCII characters looking for characters that
174+
// need escaping.
175+
//
176+
// We assume that high bytes are less frequent than plain, printable ASCII
177+
// bytes, so we accept the double-scan of high bytes.
178+
size_t end = i + 1;
179+
for (; end < size; end++) {
180+
if (!upb_NeedsUtf8Validation(ptr[end])) break;
181+
}
182+
size_t n = end - i;
183+
size_t ok = utf8_range_ValidPrefix(ptr + i, n);
184+
if (ok != n) return i + ok;
185+
i += ok - 1;
186+
}
187+
}
188+
return size;
189+
}
115190

191+
static void upb_HardenedPrintString(txtenc* e, const char* ptr, size_t len) {
192+
// Print as UTF-8, while guarding against any invalid UTF-8 in the string
193+
// field.
194+
//
195+
// If in the future we have a guaranteed invariant that invalid UTF-8 will
196+
// never be present, we could avoid the UTF-8 check here.
197+
txtenc_putstr(e, "\"");
198+
const char* end = ptr + len;
116199
while (ptr < end) {
117-
switch (*ptr) {
118-
case '\n':
119-
txtenc_putstr(e, "\\n");
120-
break;
121-
case '\r':
122-
txtenc_putstr(e, "\\r");
123-
break;
124-
case '\t':
125-
txtenc_putstr(e, "\\t");
126-
break;
127-
case '\"':
128-
txtenc_putstr(e, "\\\"");
129-
break;
130-
case '\'':
131-
txtenc_putstr(e, "\\'");
132-
break;
133-
case '\\':
134-
txtenc_putstr(e, "\\\\");
135-
break;
136-
default:
137-
if ((bytes || (uint8_t)*ptr < 0x80) && !isprint(*ptr)) {
138-
txtenc_printf(e, "\\%03o", (int)(uint8_t)*ptr);
139-
} else {
140-
txtenc_putbytes(e, ptr, 1);
141-
}
142-
break;
200+
size_t n = SkipPassthroughBytes(ptr, end - ptr);
201+
if (n != 0) {
202+
txtenc_putbytes(e, ptr, n);
203+
ptr += n;
204+
if (ptr == end) break;
143205
}
206+
207+
// If repeated calls to CEscape() and PrintString() are expensive, we could
208+
// consider batching them, at the cost of some complexity.
209+
txtenc_escaped(e, *ptr);
144210
ptr++;
145211
}
212+
txtenc_putstr(e, "\"");
213+
}
146214

215+
static void txtenc_bytes(txtenc* e, upb_StringView data) {
216+
const char* ptr = data.data;
217+
const char* end = ptr + data.size;
218+
txtenc_putstr(e, "\"");
219+
for (; ptr < end; ptr++) {
220+
unsigned char uc = *ptr;
221+
if (upb_AsciiIsPrint(uc)) {
222+
txtenc_putbytes(e, ptr, 1);
223+
} else {
224+
txtenc_escaped(e, uc);
225+
}
226+
}
147227
txtenc_putstr(e, "\"");
148228
}
149229

@@ -206,10 +286,10 @@ static void txtenc_field(txtenc* e, upb_MessageValue val,
206286
txtenc_printf(e, "%" PRIu64, val.uint64_val);
207287
break;
208288
case kUpb_CType_String:
209-
txtenc_string(e, val.str_val, false);
289+
upb_HardenedPrintString(e, val.str_val.data, val.str_val.size);
210290
break;
211291
case kUpb_CType_Bytes:
212-
txtenc_string(e, val.str_val, true);
292+
txtenc_bytes(e, val.str_val);
213293
break;
214294
case kUpb_CType_Enum:
215295
txtenc_enum(val.int32_val, f, e);
@@ -378,7 +458,7 @@ static const char* txtenc_unknown(txtenc* e, const char* ptr,
378458
const char* str = ptr;
379459
ptr = upb_EpsCopyInputStream_ReadString(stream, &str, size, NULL);
380460
UPB_ASSERT(ptr);
381-
txtenc_string(e, (upb_StringView){.data = str, .size = size}, true);
461+
txtenc_bytes(e, (upb_StringView){.data = str, .size = size});
382462
}
383463
break;
384464
}

0 commit comments

Comments
 (0)