Skip to content

Commit 0518edc

Browse files
authored
gh-119396: Optimize unicode_repr() (#119617)
Use stringlib to specialize unicode_repr() for each string kind (UCS1, UCS2, UCS4).

Benchmark:

+-------------------------------------+---------+----------------------+
| Benchmark                           | ref     | change2              |
+=====================================+=========+======================+
| repr('abc')                         | 100 ns  | 103 ns: 1.02x slower |
+-------------------------------------+---------+----------------------+
| repr('a' * 100)                     | 369 ns  | 369 ns: 1.00x slower |
+-------------------------------------+---------+----------------------+
| repr(('a' + squote) * 100)          | 1.21 us | 946 ns: 1.27x faster |
+-------------------------------------+---------+----------------------+
| repr(('a' + nl) * 100)              | 1.23 us | 907 ns: 1.36x faster |
+-------------------------------------+---------+----------------------+
| repr(dquote + ('a' + squote) * 100) | 1.08 us | 858 ns: 1.25x faster |
+-------------------------------------+---------+----------------------+
| Geometric mean                      | (ref)   | 1.16x faster         |
+-------------------------------------+---------+----------------------+
1 parent 2da0dc0 commit 0518edc

File tree

4 files changed

+131
-102
lines changed

4 files changed

+131
-102
lines changed

Makefile.pre.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1841,6 +1841,7 @@ UNICODE_DEPS = \
18411841
$(srcdir)/Objects/stringlib/localeutil.h \
18421842
$(srcdir)/Objects/stringlib/partition.h \
18431843
$(srcdir)/Objects/stringlib/replace.h \
1844+
$(srcdir)/Objects/stringlib/repr.h \
18441845
$(srcdir)/Objects/stringlib/split.h \
18451846
$(srcdir)/Objects/stringlib/ucs1lib.h \
18461847
$(srcdir)/Objects/stringlib/ucs2lib.h \

Objects/stringlib/repr.h

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
/* stringlib: repr() implementation */
2+
3+
#ifndef STRINGLIB_FASTSEARCH_H
4+
#error must include "stringlib/fastsearch.h" before including this module
5+
#endif
6+
7+
8+
/* Write the repr() of `unicode` into `odata`: an opening `quote`, the
   escaped contents of the string, then a closing `quote`.

   unicode: source string, read through PyUnicode_GET_LENGTH(),
            PyUnicode_DATA(), PyUnicode_KIND() and PyUnicode_READ().
   quote:   quote character written at both ends; occurrences of it inside
            the string are backslash-escaped, as are backslashes.
   odata:   output buffer of STRINGLIB_CHAR units.  No bounds checking is
            done and no terminator is written here, so the caller must
            provide a buffer large enough for the whole escaped output
            (unicode_repr() computes that size before allocating the
            result string).

   NOTE(review): characters copied as-is are stored by plain assignment to
   STRINGLIB_CHAR, so the output kind is assumed to be wide enough to hold
   them -- confirm against the caller's maxch computation. */
static void
9+
STRINGLIB(repr)(PyObject *unicode, Py_UCS4 quote,
10+
STRINGLIB_CHAR *odata)
11+
{
12+
Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
13+
const void *idata = PyUnicode_DATA(unicode);
14+
int ikind = PyUnicode_KIND(unicode);
15+
16+
/* Opening quote */
*odata++ = quote;
17+
for (Py_ssize_t i = 0; i < isize; i++) {
18+
Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
19+
20+
/* Escape quotes and backslashes */
21+
if ((ch == quote) || (ch == '\\')) {
22+
*odata++ = '\\';
23+
*odata++ = ch;
24+
continue;
25+
}
26+
27+
/* Map special whitespace to '\t', '\n', '\r' */
28+
if (ch == '\t') {
29+
*odata++ = '\\';
30+
*odata++ = 't';
31+
}
32+
else if (ch == '\n') {
33+
*odata++ = '\\';
34+
*odata++ = 'n';
35+
}
36+
else if (ch == '\r') {
37+
*odata++ = '\\';
38+
*odata++ = 'r';
39+
}
40+
41+
/* Map non-printable US ASCII to '\xhh' */
42+
else if (ch < ' ' || ch == 0x7F) {
43+
*odata++ = '\\';
44+
*odata++ = 'x';
45+
*odata++ = Py_hexdigits[(ch >> 4) & 0x000F];
46+
*odata++ = Py_hexdigits[ch & 0x000F];
47+
}
48+
49+
/* Copy ASCII characters as-is */
50+
else if (ch < 0x7F) {
51+
*odata++ = ch;
52+
}
53+
54+
/* Non-ASCII characters */
55+
else {
56+
/* Map Unicode whitespace and control characters
57+
(categories Z* and C* except ASCII space)
58+
*/
59+
if (!Py_UNICODE_ISPRINTABLE(ch)) {
60+
*odata++ = '\\';
61+
/* Map 8-bit characters to '\xhh' */
62+
if (ch <= 0xff) {
63+
*odata++ = 'x';
64+
*odata++ = Py_hexdigits[(ch >> 4) & 0x000F];
65+
*odata++ = Py_hexdigits[ch & 0x000F];
66+
}
67+
/* Map 16-bit characters to '\uxxxx' */
68+
else if (ch <= 0xffff) {
69+
*odata++ = 'u';
70+
*odata++ = Py_hexdigits[(ch >> 12) & 0xF];
71+
*odata++ = Py_hexdigits[(ch >> 8) & 0xF];
72+
*odata++ = Py_hexdigits[(ch >> 4) & 0xF];
73+
*odata++ = Py_hexdigits[ch & 0xF];
74+
}
75+
/* Map 21-bit characters to '\U00xxxxxx' */
76+
else {
77+
*odata++ = 'U';
78+
*odata++ = Py_hexdigits[(ch >> 28) & 0xF];
79+
*odata++ = Py_hexdigits[(ch >> 24) & 0xF];
80+
*odata++ = Py_hexdigits[(ch >> 20) & 0xF];
81+
*odata++ = Py_hexdigits[(ch >> 16) & 0xF];
82+
*odata++ = Py_hexdigits[(ch >> 12) & 0xF];
83+
*odata++ = Py_hexdigits[(ch >> 8) & 0xF];
84+
*odata++ = Py_hexdigits[(ch >> 4) & 0xF];
85+
*odata++ = Py_hexdigits[ch & 0xF];
86+
}
87+
}
88+
/* Copy characters as-is */
89+
else {
90+
*odata++ = ch;
91+
}
92+
}
93+
}
94+
/* Closing quote; the output pointer is not advanced past it */
*odata = quote;
95+
}

Objects/unicodeobject.c

Lines changed: 34 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -899,6 +899,7 @@ ensure_unicode(PyObject *obj)
899899
#include "stringlib/count.h"
900900
#include "stringlib/find.h"
901901
#include "stringlib/replace.h"
902+
#include "stringlib/repr.h"
902903
#include "stringlib/find_max_char.h"
903904
#include "stringlib/undef.h"
904905

@@ -909,6 +910,7 @@ ensure_unicode(PyObject *obj)
909910
#include "stringlib/count.h"
910911
#include "stringlib/find.h"
911912
#include "stringlib/replace.h"
913+
#include "stringlib/repr.h"
912914
#include "stringlib/find_max_char.h"
913915
#include "stringlib/undef.h"
914916

@@ -919,6 +921,7 @@ ensure_unicode(PyObject *obj)
919921
#include "stringlib/count.h"
920922
#include "stringlib/find.h"
921923
#include "stringlib/replace.h"
924+
#include "stringlib/repr.h"
922925
#include "stringlib/find_max_char.h"
923926
#include "stringlib/undef.h"
924927

@@ -12336,24 +12339,17 @@ unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
1233612339
static PyObject *
1233712340
unicode_repr(PyObject *unicode)
1233812341
{
12339-
PyObject *repr;
12340-
Py_ssize_t isize;
12341-
Py_ssize_t osize, squote, dquote, i, o;
12342-
Py_UCS4 max, quote;
12343-
int ikind, okind, unchanged;
12344-
const void *idata;
12345-
void *odata;
12346-
12347-
isize = PyUnicode_GET_LENGTH(unicode);
12348-
idata = PyUnicode_DATA(unicode);
12342+
Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
12343+
const void *idata = PyUnicode_DATA(unicode);
1234912344

1235012345
/* Compute length of output, quote characters, and
1235112346
maximum character */
12352-
osize = 0;
12353-
max = 127;
12354-
squote = dquote = 0;
12355-
ikind = PyUnicode_KIND(unicode);
12356-
for (i = 0; i < isize; i++) {
12347+
Py_ssize_t osize = 0;
12348+
Py_UCS4 maxch = 127;
12349+
Py_ssize_t squote = 0;
12350+
Py_ssize_t dquote = 0;
12351+
int ikind = PyUnicode_KIND(unicode);
12352+
for (Py_ssize_t i = 0; i < isize; i++) {
1235712353
Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
1235812354
Py_ssize_t incr = 1;
1235912355
switch (ch) {
@@ -12369,7 +12365,7 @@ unicode_repr(PyObject *unicode)
1236912365
else if (ch < 0x7f)
1237012366
;
1237112367
else if (Py_UNICODE_ISPRINTABLE(ch))
12372-
max = ch > max ? ch : max;
12368+
maxch = (ch > maxch) ? ch : maxch;
1237312369
else if (ch < 0x100)
1237412370
incr = 4; /* \xHH */
1237512371
else if (ch < 0x10000)
@@ -12385,10 +12381,10 @@ unicode_repr(PyObject *unicode)
1238512381
osize += incr;
1238612382
}
1238712383

12388-
quote = '\'';
12389-
unchanged = (osize == isize);
12384+
Py_UCS4 quote = '\'';
12385+
int changed = (osize != isize);
1239012386
if (squote) {
12391-
unchanged = 0;
12387+
changed = 1;
1239212388
if (dquote)
1239312389
/* Both squote and dquote present. Use squote,
1239412390
and escape them */
@@ -12398,99 +12394,35 @@ unicode_repr(PyObject *unicode)
1239812394
}
1239912395
osize += 2; /* quotes */
1240012396

12401-
repr = PyUnicode_New(osize, max);
12397+
PyObject *repr = PyUnicode_New(osize, maxch);
1240212398
if (repr == NULL)
1240312399
return NULL;
12404-
okind = PyUnicode_KIND(repr);
12405-
odata = PyUnicode_DATA(repr);
12400+
int okind = PyUnicode_KIND(repr);
12401+
void *odata = PyUnicode_DATA(repr);
12402+
12403+
if (!changed) {
12404+
PyUnicode_WRITE(okind, odata, 0, quote);
1240612405

12407-
PyUnicode_WRITE(okind, odata, 0, quote);
12408-
PyUnicode_WRITE(okind, odata, osize-1, quote);
12409-
if (unchanged) {
1241012406
_PyUnicode_FastCopyCharacters(repr, 1,
1241112407
unicode, 0,
1241212408
isize);
12409+
12410+
PyUnicode_WRITE(okind, odata, osize-1, quote);
1241312411
}
1241412412
else {
12415-
for (i = 0, o = 1; i < isize; i++) {
12416-
Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12417-
12418-
/* Escape quotes and backslashes */
12419-
if ((ch == quote) || (ch == '\\')) {
12420-
PyUnicode_WRITE(okind, odata, o++, '\\');
12421-
PyUnicode_WRITE(okind, odata, o++, ch);
12422-
continue;
12423-
}
12424-
12425-
/* Map special whitespace to '\t', \n', '\r' */
12426-
if (ch == '\t') {
12427-
PyUnicode_WRITE(okind, odata, o++, '\\');
12428-
PyUnicode_WRITE(okind, odata, o++, 't');
12429-
}
12430-
else if (ch == '\n') {
12431-
PyUnicode_WRITE(okind, odata, o++, '\\');
12432-
PyUnicode_WRITE(okind, odata, o++, 'n');
12433-
}
12434-
else if (ch == '\r') {
12435-
PyUnicode_WRITE(okind, odata, o++, '\\');
12436-
PyUnicode_WRITE(okind, odata, o++, 'r');
12437-
}
12438-
12439-
/* Map non-printable US ASCII to '\xhh' */
12440-
else if (ch < ' ' || ch == 0x7F) {
12441-
PyUnicode_WRITE(okind, odata, o++, '\\');
12442-
PyUnicode_WRITE(okind, odata, o++, 'x');
12443-
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12444-
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12445-
}
12446-
12447-
/* Copy ASCII characters as-is */
12448-
else if (ch < 0x7F) {
12449-
PyUnicode_WRITE(okind, odata, o++, ch);
12450-
}
12451-
12452-
/* Non-ASCII characters */
12453-
else {
12454-
/* Map Unicode whitespace and control characters
12455-
(categories Z* and C* except ASCII space)
12456-
*/
12457-
if (!Py_UNICODE_ISPRINTABLE(ch)) {
12458-
PyUnicode_WRITE(okind, odata, o++, '\\');
12459-
/* Map 8-bit characters to '\xhh' */
12460-
if (ch <= 0xff) {
12461-
PyUnicode_WRITE(okind, odata, o++, 'x');
12462-
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12463-
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12464-
}
12465-
/* Map 16-bit characters to '\uxxxx' */
12466-
else if (ch <= 0xffff) {
12467-
PyUnicode_WRITE(okind, odata, o++, 'u');
12468-
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12469-
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12470-
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12471-
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12472-
}
12473-
/* Map 21-bit characters to '\U00xxxxxx' */
12474-
else {
12475-
PyUnicode_WRITE(okind, odata, o++, 'U');
12476-
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12477-
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12478-
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12479-
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12480-
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12481-
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12482-
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12483-
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12484-
}
12485-
}
12486-
/* Copy characters as-is */
12487-
else {
12488-
PyUnicode_WRITE(okind, odata, o++, ch);
12489-
}
12490-
}
12413+
switch (okind) {
12414+
case PyUnicode_1BYTE_KIND:
12415+
ucs1lib_repr(unicode, quote, odata);
12416+
break;
12417+
case PyUnicode_2BYTE_KIND:
12418+
ucs2lib_repr(unicode, quote, odata);
12419+
break;
12420+
default:
12421+
assert(okind == PyUnicode_4BYTE_KIND);
12422+
ucs4lib_repr(unicode, quote, odata);
1249112423
}
1249212424
}
12493-
/* Closing quote already added at the beginning */
12425+
1249412426
assert(_PyUnicode_CheckConsistency(repr, 1));
1249512427
return repr;
1249612428
}

Tools/c-analyzer/cpython/_parser.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@ def clean_lines(text):
167167
Objects/stringlib/find.h Objects/stringlib/fastsearch.h
168168
Objects/stringlib/partition.h Objects/stringlib/fastsearch.h
169169
Objects/stringlib/replace.h Objects/stringlib/fastsearch.h
170+
Objects/stringlib/repr.h Objects/stringlib/fastsearch.h
170171
Objects/stringlib/split.h Objects/stringlib/fastsearch.h
171172
172173
# @end=tsv@

0 commit comments

Comments
 (0)