Skip to content

Commit 33b1c4a

Browse files
committed
gh-119182: Optimize PyUnicode_FromFormat() UTF-8 decoder
Add unicode_decode_utf8_writer() to write characters directly into a _PyUnicodeWriter writer, avoiding the creation of a temporary string. Optimize PyUnicode_FromFormat() by using the new unicode_decode_utf8_writer(). Rename unicode_fromformat_write_cstr() to unicode_fromformat_write_utf8(). Microbenchmark on the code `return PyUnicode_FromFormat("%s %s %s %s %s.", "format", "multiple", "utf8", "short", "strings");` gives the result: 620 ns +- 8 ns -> 382 ns +- 2 ns, i.e. 1.62x faster.
1 parent c9073eb commit 33b1c4a

File tree

1 file changed

+96
-58
lines changed

1 file changed

+96
-58
lines changed

Objects/unicodeobject.c

Lines changed: 96 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,11 @@ static PyObject *
202202
unicode_decode_utf8(const char *s, Py_ssize_t size,
203203
_Py_error_handler error_handler, const char *errors,
204204
Py_ssize_t *consumed);
205+
static int
206+
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
207+
const char *s, Py_ssize_t size,
208+
_Py_error_handler error_handler, const char *errors,
209+
Py_ssize_t *consumed);
205210
#ifdef Py_DEBUG
206211
static inline int unicode_is_finalizing(void);
207212
static int unicode_is_singleton(PyObject *unicode);
@@ -2377,14 +2382,11 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
23772382
}
23782383

23792384
static int
2380-
unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2385+
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
23812386
Py_ssize_t width, Py_ssize_t precision, int flags)
23822387
{
23832388
/* UTF-8 */
23842389
Py_ssize_t length;
2385-
PyObject *unicode;
2386-
int res;
2387-
23882390
if (precision == -1) {
23892391
length = strlen(str);
23902392
}
@@ -2394,13 +2396,22 @@ unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
23942396
length++;
23952397
}
23962398
}
2397-
unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2398-
if (unicode == NULL)
2399-
return -1;
24002399

2401-
res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2402-
Py_DECREF(unicode);
2403-
return res;
2400+
if (width < 0) {
2401+
return unicode_decode_utf8_writer(writer, str, length,
2402+
_Py_ERROR_UNKNOWN, "replace", NULL);
2403+
}
2404+
else {
2405+
PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2406+
"replace", NULL);
2407+
if (unicode == NULL)
2408+
return -1;
2409+
2410+
int res = unicode_fromformat_write_str(writer, unicode,
2411+
width, -1, flags);
2412+
Py_DECREF(unicode);
2413+
return res;
2414+
}
24042415
}
24052416

24062417
static int
@@ -2700,7 +2711,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
27002711
else {
27012712
/* UTF-8 */
27022713
const char *s = va_arg(*vargs, const char*);
2703-
if (unicode_fromformat_write_cstr(writer, s, width, precision, flags) < 0)
2714+
if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
27042715
return NULL;
27052716
}
27062717
break;
@@ -2739,7 +2750,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
27392750
}
27402751
else {
27412752
assert(str != NULL);
2742-
if (unicode_fromformat_write_cstr(writer, str, width, precision, flags) < 0)
2753+
if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
27432754
return NULL;
27442755
}
27452756
break;
@@ -4737,65 +4748,56 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
47374748
return p - start;
47384749
}
47394750

4740-
static PyObject *
4741-
unicode_decode_utf8(const char *s, Py_ssize_t size,
4742-
_Py_error_handler error_handler, const char *errors,
4743-
Py_ssize_t *consumed)
4744-
{
4745-
if (size == 0) {
4746-
if (consumed)
4747-
*consumed = 0;
4748-
_Py_RETURN_UNICODE_EMPTY();
4749-
}
4750-
4751-
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
4752-
if (size == 1 && (unsigned char)s[0] < 128) {
4753-
if (consumed) {
4754-
*consumed = 1;
4755-
}
4756-
return get_latin1_char((unsigned char)s[0]);
4757-
}
47584751

4752+
static int
4753+
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
4754+
const char *s, Py_ssize_t size,
4755+
_Py_error_handler error_handler, const char *errors,
4756+
Py_ssize_t *consumed)
4757+
{
47594758
const char *starts = s;
47604759
const char *end = s + size;
47614760

47624761
// fast path: try ASCII string.
4763-
PyObject *u = PyUnicode_New(size, 127);
4764-
if (u == NULL) {
4765-
return NULL;
4762+
if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
4763+
return -1;
47664764
}
4767-
s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
4768-
if (s == end) {
4769-
if (consumed) {
4770-
*consumed = size;
4765+
4766+
Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
4767+
if (writer->kind == PyUnicode_1BYTE_KIND
4768+
&& _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
4769+
{
4770+
Py_ssize_t decoded = ascii_decode(s, end, dest);
4771+
writer->pos += decoded;
4772+
4773+
if (decoded == size) {
4774+
if (consumed) {
4775+
*consumed = size;
4776+
}
4777+
return 0;
47714778
}
4772-
return u;
4779+
s += decoded;
47734780
}
47744781

4775-
// Use _PyUnicodeWriter after fast path is failed.
4776-
_PyUnicodeWriter writer;
4777-
_PyUnicodeWriter_InitWithBuffer(&writer, u);
4778-
writer.pos = s - starts;
4779-
47804782
Py_ssize_t startinpos, endinpos;
47814783
const char *errmsg = "";
47824784
PyObject *error_handler_obj = NULL;
47834785
PyObject *exc = NULL;
47844786

47854787
while (s < end) {
47864788
Py_UCS4 ch;
4787-
int kind = writer.kind;
4789+
int kind = writer->kind;
47884790

47894791
if (kind == PyUnicode_1BYTE_KIND) {
4790-
if (PyUnicode_IS_ASCII(writer.buffer))
4791-
ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4792+
if (PyUnicode_IS_ASCII(writer->buffer))
4793+
ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
47924794
else
4793-
ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4795+
ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
47944796
} else if (kind == PyUnicode_2BYTE_KIND) {
4795-
ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4797+
ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
47964798
} else {
47974799
assert(kind == PyUnicode_4BYTE_KIND);
4798-
ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4800+
ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
47994801
}
48004802

48014803
switch (ch) {
@@ -4826,7 +4828,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48264828
endinpos = startinpos + ch - 1;
48274829
break;
48284830
default:
4829-
if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4831+
if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
48304832
goto onError;
48314833
continue;
48324834
}
@@ -4840,7 +4842,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48404842
break;
48414843

48424844
case _Py_ERROR_REPLACE:
4843-
if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4845+
if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
48444846
goto onError;
48454847
s += (endinpos - startinpos);
48464848
break;
@@ -4849,13 +4851,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48494851
{
48504852
Py_ssize_t i;
48514853

4852-
if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4854+
if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
48534855
goto onError;
48544856
for (i=startinpos; i<endinpos; i++) {
48554857
ch = (Py_UCS4)(unsigned char)(starts[i]);
4856-
PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4858+
PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
48574859
ch + 0xdc00);
4858-
writer.pos++;
4860+
writer->pos++;
48594861
}
48604862
s += (endinpos - startinpos);
48614863
break;
@@ -4866,8 +4868,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48664868
errors, &error_handler_obj,
48674869
"utf-8", errmsg,
48684870
&starts, &end, &startinpos, &endinpos, &exc, &s,
4869-
&writer))
4871+
writer)) {
48704872
goto onError;
4873+
}
4874+
4875+
if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
4876+
return -1;
4877+
}
48714878
}
48724879
}
48734880

@@ -4877,13 +4884,44 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48774884

48784885
Py_XDECREF(error_handler_obj);
48794886
Py_XDECREF(exc);
4880-
return _PyUnicodeWriter_Finish(&writer);
4887+
return 0;
48814888

48824889
onError:
48834890
Py_XDECREF(error_handler_obj);
48844891
Py_XDECREF(exc);
4885-
_PyUnicodeWriter_Dealloc(&writer);
4886-
return NULL;
4892+
return -1;
4893+
}
4894+
4895+
4896+
/* Decode `size` bytes of UTF-8 starting at `s` and return the result as a
 * PyObject *.
 *
 * Two trivial inputs are answered directly: empty input, and a single byte
 * below 128.  Everything else is decoded through a local _PyUnicodeWriter
 * by calling unicode_decode_utf8_writer(), to which `error_handler`,
 * `errors` and `consumed` are forwarded unchanged.
 *
 * If `consumed` is non-NULL it is set to 0 or 1 on the two direct paths;
 * on the general path it is left to unicode_decode_utf8_writer().
 *
 * Returns the value of _PyUnicodeWriter_Finish() on success, or NULL when
 * unicode_decode_utf8_writer() returns a negative value.
 */
static PyObject *
4897+
unicode_decode_utf8(const char *s, Py_ssize_t size,
4898+
_Py_error_handler error_handler, const char *errors,
4899+
Py_ssize_t *consumed)
4900+
{
4901+
/* Empty input: report zero bytes consumed and return through
   _Py_RETURN_UNICODE_EMPTY(). */
if (size == 0) {
4902+
if (consumed)
4903+
*consumed = 0;
4904+
_Py_RETURN_UNICODE_EMPTY();
4905+
}
4906+
4907+
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
4908+
if (size == 1 && (unsigned char)s[0] < 128) {
4909+
if (consumed) {
4910+
*consumed = 1;
4911+
}
4912+
return get_latin1_char((unsigned char)s[0]);
4913+
}
4914+
4915+
/* General case: decode into a freshly initialized writer. */
_PyUnicodeWriter writer;
4916+
_PyUnicodeWriter_Init(&writer);
4917+
4918+
if (unicode_decode_utf8_writer(&writer, s, size,
4919+
error_handler, errors,
4920+
consumed) < 0) {
4921+
/* Decoding failed: discard the writer before reporting the error,
   since _PyUnicodeWriter_Finish() is not reached on this path. */
_PyUnicodeWriter_Dealloc(&writer);
4922+
return NULL;
4923+
}
4924+
return _PyUnicodeWriter_Finish(&writer);
48874925
}
48884926

48894927

0 commit comments

Comments
 (0)