Skip to content

Commit 80bfc80

Browse files
committed
gh-93033: Use wmemchr in find_char and replace_1char_inplace
This was brought up a bit in #69009 but the larger issue is mostly different.

Generally comparable perf for the "good" case where memchr doesn't return any collisions (false matches on lower byte) but clearly faster with collisions.

Some notes on correctness:

wchar_t being signed/unsigned shouldn't matter here BUT wmemchr (along with just about all the other wide-char string functions) can and often does (x86_64 for example) assume that the input is aligned relative to the sizeof(wchar_t). If this is not the case for Py_UCS{2|4} then this patch is broken.

Also I think the way I implemented `#define STRINGLIB_FAST_MEMCHR` for ucs{2|4}lib breaks strict-aliasing. If this is an issue but otherwise the patch is fine, any suggestions for how to fix it?

Test results:
```
$> ./python -m test -j4
...
== Tests result: SUCCESS ==

406 tests OK.

30 tests skipped:
    test_bz2 test_curses test_dbm_gnu test_dbm_ndbm test_devpoll
    test_idle test_ioctl test_kqueue test_launcher test_msilib
    test_nis test_ossaudiodev test_readline test_smtpnet
    test_socketserver test_sqlite3 test_startfile test_tcl test_tix
    test_tk test_ttk_guionly test_ttk_textonly test_turtle
    test_urllib2net test_urllibnet test_winconsoleio test_winreg
    test_winsound test_xmlrpc_net test_zipfile64
```

Benchmarked on:
model name : 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
sizeof(wchar_t) == 4
GLIBC 2.35

```
./python -m timeit -s 's = "\U00010200\U00010201\U00010202\U00010203\U00010204\U00010205\U00010206\U00010207\U00010208\U00010209\U0001020a\U0001020b\U0001020c\U0001020d\U0001020e\U0001020f" * 200 + "\U00018200"' -- 's.find("\U00018210")' ## Long, No match, No collision
No wmemchr  : 1000 loops, best of 100: 127 nsec per loop
With wmemchr: 1000 loops, best of 100: 123 nsec per loop

./python -m timeit -s 's = "\U00010200\U00010201\U00010202\U00010203\U00010204\U00010205\U00010206\U00010207\U00010208\U00010209\U0001020a\U0001020b\U0001020c\U0001020d\U0001020e\U0001020f" * 200 + "\U00018200"' -- 's.find("\U00018208")' ## Long, No match, High collision
No wmemchr  : 1000 loops, best of 100: 1.29 usec per loop
With wmemchr: 1000 loops, best of 100: 123 nsec per loop

./python -m timeit -s 's = "\U00010200\U00010201\U00010202\U00010203\U00010204\U00010205\U00010206\U00010207\U00010208\U00010209\U0001020a\U0001020b\U0001020c\U0001020d\U0001020e\U0001020f" * 200 + "\U00018210"' -- 's.find("\U00018210")' ## Long, match, No collision
No wmemchr  : 1000 loops, best of 100: 136 nsec per loop
With wmemchr: 1000 loops, best of 100: 130 nsec per loop

./python -m timeit -s 's = "\U00010200\U00010201\U00010202\U00010203\U00010204\U00010205\U00010206\U00010207\U00010208\U00010209\U0001020a\U0001020b\U0001020c\U0001020d\U0001020e\U0001020f" * 200 + "\U00018208"' -- 's.find("\U00018208")' ## Long, match, High collision
No wmemchr  : 1000 loops, best of 100: 1.35 usec per loop
With wmemchr: 1000 loops, best of 100: 131 nsec per loop

./python -m timeit -s 's = "\U00010200\U00010201\U00010202\U00010203\U00010204\U00010205\U00010206\U00010207\U00010208\U00010209\U0001020a\U0001020b\U0001020c\U0001020d\U0001020e\U0001020f" * 3 + "\U00018200"' -- 's.find("\U00018210")' ## Short, No match, No collision
No wmemchr  : 1000 loops, best of 100: 50.2 nsec per loop
With wmemchr: 1000 loops, best of 100: 52.9 nsec per loop

./python -m timeit -s 's = "\U00010200\U00010201\U00010202\U00010203\U00010204\U00010205\U00010206\U00010207\U00010208\U00010209\U0001020a\U0001020b\U0001020c\U0001020d\U0001020e\U0001020f" * 3 + "\U00018200"' -- 's.find("\U00018208")' ## Short, No match, High collision
No wmemchr  : 1000 loops, best of 100: 69.1 nsec per loop
With wmemchr: 1000 loops, best of 100: 53.7 nsec per loop

./python -m timeit -s 's = "\U00010200\U00010201\U00010202\U00010203\U00010204\U00010205\U00010206\U00010207\U00010208\U00010209\U0001020a\U0001020b\U0001020c\U0001020d\U0001020e\U0001020f" * 3 + "\U00018210"' -- 's.find("\U00018210")' ## Short, match, No collision
No wmemchr  : 1000 loops, best of 100: 53.6 nsec per loop
With wmemchr: 1000 loops, best of 100: 53.6 nsec per loop

./python -m timeit -s 's = "\U00010200\U00010201\U00010202\U00010203\U00010204\U00010205\U00010206\U00010207\U00010208\U00010209\U0001020a\U0001020b\U0001020c\U0001020d\U0001020e\U0001020f" * 3 + "\U00018208"' -- 's.find("\U00018208")' ## Short, match, High collision
No wmemchr  : 1000 loops, best of 100: 69 nsec per loop
With wmemchr: 1000 loops, best of 100: 50.9 nsec per loop
```
1 parent 59719a2 commit 80bfc80

File tree

8 files changed

+36
-14
lines changed

8 files changed

+36
-14
lines changed

Objects/stringlib/asciilib.h

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#define STRINGLIB_CHECK PyUnicode_Check
2222
#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact
2323
#define STRINGLIB_MUTABLE 0
24+
#define STRINGLIB_FAST_MEMCHR memchr
2425

2526
#define STRINGLIB_TOSTR PyObject_Str
2627
#define STRINGLIB_TOASCII PyObject_ASCII

Objects/stringlib/fastsearch.h

+22-12
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
#define STRINGLIB_BLOOM(mask, ch) \
4040
((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
4141

42-
#if STRINGLIB_SIZEOF_CHAR == 1
42+
#ifdef STRINGLIB_FAST_MEMCHR
4343
# define MEMCHR_CUT_OFF 15
4444
#else
4545
# define MEMCHR_CUT_OFF 40
@@ -53,8 +53,8 @@ STRINGLIB(find_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)
5353
p = s;
5454
e = s + n;
5555
if (n > MEMCHR_CUT_OFF) {
56-
#if STRINGLIB_SIZEOF_CHAR == 1
57-
p = memchr(s, ch, n);
56+
#ifdef STRINGLIB_FAST_MEMCHR
57+
p = STRINGLIB_FAST_MEMCHR(s, ch, n);
5858
if (p != NULL)
5959
return (p - s);
6060
return -1;
@@ -102,16 +102,26 @@ STRINGLIB(find_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)
102102
return -1;
103103
}
104104

105+
#undef MEMCHR_CUT_OFF
106+
107+
#if STRINGLIB_SIZEOF_CHAR == 1
108+
# define MEMRCHR_CUT_OFF 15
109+
#else
110+
# define MEMRCHR_CUT_OFF 40
111+
#endif
112+
113+
105114
Py_LOCAL_INLINE(Py_ssize_t)
106115
STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)
107116
{
108117
const STRINGLIB_CHAR *p;
109118
#ifdef HAVE_MEMRCHR
110-
/* memrchr() is a GNU extension, available since glibc 2.1.91.
111-
it doesn't seem as optimized as memchr(), but is still quite
112-
faster than our hand-written loop below */
119+
/* memrchr() is a GNU extension, available since glibc 2.1.91. it
120+
doesn't seem as optimized as memchr(), but is still quite
121+
faster than our hand-written loop below. There is no wmemrchr
122+
for 4-byte chars. */
113123

114-
if (n > MEMCHR_CUT_OFF) {
124+
if (n > MEMRCHR_CUT_OFF) {
115125
#if STRINGLIB_SIZEOF_CHAR == 1
116126
p = memrchr(s, ch, n);
117127
if (p != NULL)
@@ -139,19 +149,19 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)
139149
if (*p == ch)
140150
return n;
141151
/* False positive */
142-
if (n1 - n > MEMCHR_CUT_OFF)
152+
if (n1 - n > MEMRCHR_CUT_OFF)
143153
continue;
144-
if (n <= MEMCHR_CUT_OFF)
154+
if (n <= MEMRCHR_CUT_OFF)
145155
break;
146-
s1 = p - MEMCHR_CUT_OFF;
156+
s1 = p - MEMRCHR_CUT_OFF;
147157
while (p > s1) {
148158
p--;
149159
if (*p == ch)
150160
return (p - s);
151161
}
152162
n = p - s;
153163
}
154-
while (n > MEMCHR_CUT_OFF);
164+
while (n > MEMRCHR_CUT_OFF);
155165
}
156166
#endif
157167
}
@@ -165,7 +175,7 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)
165175
return -1;
166176
}
167177

168-
#undef MEMCHR_CUT_OFF
178+
#undef MEMRCHR_CUT_OFF
169179

170180
/* Change to a 1 to see logging comments walk through the algorithm. */
171181
#if 0 && STRINGLIB_SIZEOF_CHAR == 1

Objects/stringlib/replace.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,9 @@ STRINGLIB(replace_1char_inplace)(STRINGLIB_CHAR* s, STRINGLIB_CHAR* end,
2929
if (!--attempts) {
3030
/* if u1 was not found for attempts iterations,
3131
use FASTSEARCH() or memchr() */
32-
#if STRINGLIB_SIZEOF_CHAR == 1
32+
#ifdef STRINGLIB_FAST_MEMCHR
3333
s++;
34-
s = memchr(s, u1, end - s);
34+
s = STRINGLIB_FAST_MEMCHR(s, u1, end - s);
3535
if (s == NULL)
3636
return;
3737
#else

Objects/stringlib/stringdefs.h

+1
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,5 @@
2424
#define STRINGLIB_CHECK_EXACT PyBytes_CheckExact
2525
#define STRINGLIB_TOSTR PyObject_Str
2626
#define STRINGLIB_TOASCII PyObject_Repr
27+
#define STRINGLIB_FAST_MEMCHR memchr
2728
#endif /* !STRINGLIB_STRINGDEFS_H */

Objects/stringlib/ucs1lib.h

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#define STRINGLIB_NEW _PyUnicode_FromUCS1
2121
#define STRINGLIB_CHECK PyUnicode_Check
2222
#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact
23+
#define STRINGLIB_FAST_MEMCHR memchr
2324
#define STRINGLIB_MUTABLE 0
2425

2526
#define STRINGLIB_TOSTR PyObject_Str

Objects/stringlib/ucs2lib.h

+4
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@
2121
#define STRINGLIB_CHECK PyUnicode_Check
2222
#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact
2323
#define STRINGLIB_MUTABLE 0
24+
#if SIZEOF_WCHAR_T == 2
25+
#define STRINGLIB_FAST_MEMCHR(s, c, n) \
26+
(Py_UCS2 *)wmemchr((const wchar_t *)(s), c, n)
27+
#endif
2428

2529
#define STRINGLIB_TOSTR PyObject_Str
2630
#define STRINGLIB_TOASCII PyObject_ASCII

Objects/stringlib/ucs4lib.h

+4
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@
2121
#define STRINGLIB_CHECK PyUnicode_Check
2222
#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact
2323
#define STRINGLIB_MUTABLE 0
24+
#if SIZEOF_WCHAR_T == 4
25+
#define STRINGLIB_FAST_MEMCHR(s, c, n) \
26+
(Py_UCS4 *)wmemchr((const wchar_t *)(s), c, n)
27+
#endif
2428

2529
#define STRINGLIB_TOSTR PyObject_Str
2630
#define STRINGLIB_TOASCII PyObject_ASCII

Objects/stringlib/undef.h

+1
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@
88
#undef STRINGLIB_NEW
99
#undef STRINGLIB_IS_UNICODE
1010
#undef STRINGLIB_MUTABLE
11+
#undef STRINGLIB_FAST_MEMCHR

0 commit comments

Comments
 (0)