gh-93033: Use wmemchr in find_char and replace_1char_inplace

goldsteinn · goldsteinn · commit 80bfc80054f9 · 2022-05-21T15:15:15.000-05:00
This was brought up a bit in #69009 but the larger issue is mostly different. Generally comparable perf for the "good" case where memchr doesn't return any collisions (false matches on lower byte) but clearly faster with collisions. Some notes on correctness: wchar_t being signed/unsigned shouldn't matter here BUT wmemchr (along with just about all the other wide-char string functions) can and often does (x86_64 for example) assume that the input is aligned to sizeof(wchar_t). If this is not the case for Py_UCS{2|4} then this patch is broken. Also I think the way I implemented `#define STRINGLIB_FAST_MEMCHR` for ucs{2|4}lib breaks strict-aliasing. If this is an issue but otherwise the patch is fine, any suggestions for how to fix it? Test results: ``` $> ./python -m test -j4 ... == Tests result: SUCCESS == 406 tests OK. 30 tests skipped: test_bz2 test_curses test_dbm_gnu test_dbm_ndbm test_devpoll test_idle test_ioctl test_kqueue test_launcher test_msilib test_nis test_ossaudiodev test_readline test_smtpnet test_socketserver test_sqlite3 test_startfile test_tcl test_tix test_tk test_ttk_guionly test_ttk_textonly test_turtle test_urllib2net test_urllibnet test_winconsoleio test_winreg test_winsound test_xmlrpc_net test_zipfile64 ``` Benchmarked on: model name : 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz sizeof(wchar_t) == 4 GLIBC 2.35 ``` ./python -m timeit -s 's = "\U00010200\U00010201\U00010202\U00010203\U00010204\U00010205\U00010206\U00010207\U00010208\U00010209\U0001020a\U0001020b\U0001020c\U0001020d\U0001020e\U0001020f" * 200 + "\U00018200"' -- 's.find("\U00018210")' ## Long, No match, No collision No wmemchr : 1000 loops, best of 100: 127 nsec per loop With wmemchr: 1000 loops, best of 100: 123 nsec per loop ./python -m timeit -s 's = "\U00010200\U00010201\U00010202\U00010203\U00010204\U00010205\U00010206\U00010207\U00010208\U00010209\U0001020a\U0001020b\U0001020c\U0001020d\U0001020e\U0001020f" * 200 + "\U00018200"' -- 's.find("\U00018208")' ## 
Long, No match, High collision No wmemchr : 1000 loops, best of 100: 1.29 usec per loop With wmemchr: 1000 loops, best of 100: 123 nsec per loop ./python -m timeit -s 's = "\U00010200\U00010201\U00010202\U00010203\U00010204\U00010205\U00010206\U00010207\U00010208\U00010209\U0001020a\U0001020b\U0001020c\U0001020d\U0001020e\U0001020f" * 200 + "\U00018210"' -- 's.find("\U00018210")' ## Long, match, No collision No wmemchr : 1000 loops, best of 100: 136 nsec per loop With wmemchr: 1000 loops, best of 100: 130 nsec per loop ./python -m timeit -s 's = "\U00010200\U00010201\U00010202\U00010203\U00010204\U00010205\U00010206\U00010207\U00010208\U00010209\U0001020a\U0001020b\U0001020c\U0001020d\U0001020e\U0001020f" * 200 + "\U00018208"' -- 's.find("\U00018208")' ## Long, match, High collision No wmemchr : 1000 loops, best of 100: 1.35 usec per loop With wmemchr: 1000 loops, best of 100: 131 nsec per loop ./python -m timeit -s 's = "\U00010200\U00010201\U00010202\U00010203\U00010204\U00010205\U00010206\U00010207\U00010208\U00010209\U0001020a\U0001020b\U0001020c\U0001020d\U0001020e\U0001020f" * 3 + "\U00018200"' -- 's.find("\U00018210")' ## Short, No match, No collision No wmemchr : 1000 loops, best of 100: 50.2 nsec per loop With wmemchr: 1000 loops, best of 100: 52.9 nsec per loop ./python -m timeit -s 's = "\U00010200\U00010201\U00010202\U00010203\U00010204\U00010205\U00010206\U00010207\U00010208\U00010209\U0001020a\U0001020b\U0001020c\U0001020d\U0001020e\U0001020f" * 3 + "\U00018200"' -- 's.find("\U00018208")' ## Short, No match, High collision No wmemchr : 1000 loops, best of 100: 69.1 nsec per loop With wmemchr: 1000 loops, best of 100: 53.7 nsec per loop ./python -m timeit -s 's = "\U00010200\U00010201\U00010202\U00010203\U00010204\U00010205\U00010206\U00010207\U00010208\U00010209\U0001020a\U0001020b\U0001020c\U0001020d\U0001020e\U0001020f" * 3 + "\U00018210"' -- 's.find("\U00018210")' ## Short, match, No collision No wmemchr : 1000 loops, best of 100: 53.6 nsec per 
loop With wmemchr: 1000 loops, best of 100: 53.6 nsec per loop ./python -m timeit -s 's = "\U00010200\U00010201\U00010202\U00010203\U00010204\U00010205\U00010206\U00010207\U00010208\U00010209\U0001020a\U0001020b\U0001020c\U0001020d\U0001020e\U0001020f" * 3 + "\U00018208"' -- 's.find("\U00018208")' ## Short, match, High collision No wmemchr : 1000 loops, best of 100: 69 nsec per loop With wmemchr: 1000 loops, best of 100: 50.9 nsec per loop ```
diff --git a/Objects/stringlib/asciilib.h b/Objects/stringlib/asciilib.h
@@ -21,6 +21,7 @@
 #define STRINGLIB_CHECK          PyUnicode_Check
 #define STRINGLIB_CHECK_EXACT    PyUnicode_CheckExact
 #define STRINGLIB_MUTABLE 0
+#define STRINGLIB_FAST_MEMCHR    memchr
 
 #define STRINGLIB_TOSTR          PyObject_Str
 #define STRINGLIB_TOASCII        PyObject_ASCII
diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h
@@ -39,7 +39,7 @@
 #define STRINGLIB_BLOOM(mask, ch)     \
     ((mask &  (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
 
-#if STRINGLIB_SIZEOF_CHAR == 1
+#ifdef STRINGLIB_FAST_MEMCHR
 #  define MEMCHR_CUT_OFF 15
 #else
 #  define MEMCHR_CUT_OFF 40
@@ -53,8 +53,8 @@ STRINGLIB(find_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)
     p = s;
     e = s + n;
     if (n > MEMCHR_CUT_OFF) {
-#if STRINGLIB_SIZEOF_CHAR == 1
-        p = memchr(s, ch, n);
+#ifdef STRINGLIB_FAST_MEMCHR
+        p = STRINGLIB_FAST_MEMCHR(s, ch, n);
         if (p != NULL)
             return (p - s);
         return -1;
@@ -102,16 +102,26 @@ STRINGLIB(find_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)
     return -1;
 }
 
+#undef MEMCHR_CUT_OFF
+
+#if STRINGLIB_SIZEOF_CHAR == 1
+#  define MEMRCHR_CUT_OFF 15
+#else
+#  define MEMRCHR_CUT_OFF 40
+#endif
+
+
 Py_LOCAL_INLINE(Py_ssize_t)
 STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)
 {
     const STRINGLIB_CHAR *p;
 #ifdef HAVE_MEMRCHR
-    /* memrchr() is a GNU extension, available since glibc 2.1.91.
-       it doesn't seem as optimized as memchr(), but is still quite
-       faster than our hand-written loop below */
+    /* memrchr() is a GNU extension, available since glibc 2.1.91.  It
+       doesn't seem as optimized as memchr(), but is still much
+       faster than our hand-written loop below. There is no wmemrchr
+       for 4-byte chars. */
 
-    if (n > MEMCHR_CUT_OFF) {
+    if (n > MEMRCHR_CUT_OFF) {
 #if STRINGLIB_SIZEOF_CHAR == 1
         p = memrchr(s, ch, n);
         if (p != NULL)
@@ -139,19 +149,19 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)
                 if (*p == ch)
                     return n;
                 /* False positive */
-                if (n1 - n > MEMCHR_CUT_OFF)
+                if (n1 - n > MEMRCHR_CUT_OFF)
                     continue;
-                if (n <= MEMCHR_CUT_OFF)
+                if (n <= MEMRCHR_CUT_OFF)
                     break;
-                s1 = p - MEMCHR_CUT_OFF;
+                s1 = p - MEMRCHR_CUT_OFF;
                 while (p > s1) {
                     p--;
                     if (*p == ch)
                         return (p - s);
                 }
                 n = p - s;
             }
-            while (n > MEMCHR_CUT_OFF);
+            while (n > MEMRCHR_CUT_OFF);
         }
 #endif
     }
@@ -165,7 +175,7 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)
     return -1;
 }
 
-#undef MEMCHR_CUT_OFF
+#undef MEMRCHR_CUT_OFF
 
 /* Change to a 1 to see logging comments walk through the algorithm. */
 #if 0 && STRINGLIB_SIZEOF_CHAR == 1
diff --git a/Objects/stringlib/replace.h b/Objects/stringlib/replace.h
@@ -29,9 +29,9 @@ STRINGLIB(replace_1char_inplace)(STRINGLIB_CHAR* s, STRINGLIB_CHAR* end,
                 if (!--attempts) {
                     /* if u1 was not found for attempts iterations,
                        use FASTSEARCH() or memchr() */
-#if STRINGLIB_SIZEOF_CHAR == 1
+#ifdef STRINGLIB_FAST_MEMCHR
                     s++;
-                    s = memchr(s, u1, end - s);
+                    s = STRINGLIB_FAST_MEMCHR(s, u1, end - s);
                     if (s == NULL)
                         return;
 #else
diff --git a/Objects/stringlib/stringdefs.h b/Objects/stringlib/stringdefs.h
@@ -24,4 +24,5 @@
 #define STRINGLIB_CHECK_EXACT    PyBytes_CheckExact
 #define STRINGLIB_TOSTR          PyObject_Str
 #define STRINGLIB_TOASCII        PyObject_Repr
+#define STRINGLIB_FAST_MEMCHR    memchr
 #endif /* !STRINGLIB_STRINGDEFS_H */
diff --git a/Objects/stringlib/ucs1lib.h b/Objects/stringlib/ucs1lib.h
@@ -20,6 +20,7 @@
 #define STRINGLIB_NEW            _PyUnicode_FromUCS1
 #define STRINGLIB_CHECK          PyUnicode_Check
 #define STRINGLIB_CHECK_EXACT    PyUnicode_CheckExact
+#define STRINGLIB_FAST_MEMCHR    memchr
 #define STRINGLIB_MUTABLE 0
 
 #define STRINGLIB_TOSTR          PyObject_Str
diff --git a/Objects/stringlib/ucs2lib.h b/Objects/stringlib/ucs2lib.h
@@ -21,6 +21,10 @@
 #define STRINGLIB_CHECK          PyUnicode_Check
 #define STRINGLIB_CHECK_EXACT    PyUnicode_CheckExact
 #define STRINGLIB_MUTABLE 0
+#if SIZEOF_WCHAR_T == 2
+#define STRINGLIB_FAST_MEMCHR(s, c, n)              \
+    (Py_UCS2 *)wmemchr((const wchar_t *)(s), c, n)
+#endif
 
 #define STRINGLIB_TOSTR          PyObject_Str
 #define STRINGLIB_TOASCII        PyObject_ASCII
diff --git a/Objects/stringlib/ucs4lib.h b/Objects/stringlib/ucs4lib.h
@@ -21,6 +21,10 @@
 #define STRINGLIB_CHECK          PyUnicode_Check
 #define STRINGLIB_CHECK_EXACT    PyUnicode_CheckExact
 #define STRINGLIB_MUTABLE 0
+#if SIZEOF_WCHAR_T == 4
+#define STRINGLIB_FAST_MEMCHR(s, c, n)              \
+    (Py_UCS4 *)wmemchr((const wchar_t *)(s), c, n)
+#endif
 
 #define STRINGLIB_TOSTR          PyObject_Str
 #define STRINGLIB_TOASCII        PyObject_ASCII
diff --git a/Objects/stringlib/undef.h b/Objects/stringlib/undef.h
@@ -8,3 +8,4 @@
 #undef STRINGLIB_NEW
 #undef STRINGLIB_IS_UNICODE
 #undef STRINGLIB_MUTABLE
+#undef STRINGLIB_FAST_MEMCHR