From dcd22193444736bfc14a4bc35a434216bc53877e Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Fri, 21 Jun 2019 17:41:56 +0900 Subject: [PATCH 1/7] Skip using _PyUnicodeWriter for simple ASCII string --- Objects/unicodeobject.c | 48 ++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 4f8362590595cd..2a7731153b7f8a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -265,6 +265,8 @@ unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value, /* Forward declaration */ static inline int _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); +static inline void +_PyUnicodeWriter_Update(_PyUnicodeWriter *writer); static PyObject * unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, const char *errors); @@ -4877,7 +4879,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, _Py_error_handler error_handler, const char *errors, Py_ssize_t *consumed) { - _PyUnicodeWriter writer; const char *starts = s; const char *end = s + size; @@ -4900,13 +4901,22 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, return get_latin1_char((unsigned char)s[0]); } + // Try simple ASCII case + PyObject *u = PyUnicode_New(size, 127); + if (u == NULL) { + return NULL; + } + s += ascii_decode(s, end, PyUnicode_DATA(u)); + if (s == end) { + return u; + } + + _PyUnicodeWriter writer; _PyUnicodeWriter_Init(&writer); - writer.min_length = size; - if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) - goto onError; + writer.buffer = u; + writer.pos = s - starts; + _PyUnicodeWriter_Update(&writer); - writer.pos = ascii_decode(s, end, writer.data); - s += writer.pos; while (s < end) { Py_UCS4 ch; int kind = writer.kind; @@ -6975,13 +6985,12 @@ PyUnicode_DecodeASCII(const char *s, const char *errors) { const char *starts = s; - _PyUnicodeWriter writer; int kind; void *data; Py_ssize_t startinpos; Py_ssize_t 
endinpos; Py_ssize_t outpos; - const char *e; + const char *e = s + size; PyObject *error_handler_obj = NULL; PyObject *exc = NULL; _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; @@ -6993,19 +7002,24 @@ PyUnicode_DecodeASCII(const char *s, if (size == 1 && (unsigned char)s[0] < 128) return get_latin1_char((unsigned char)s[0]); - _PyUnicodeWriter_Init(&writer); - writer.min_length = size; - if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) + // Shortcut for simple case + PyObject *u = PyUnicode_New(size, 127); + if (u == NULL) { return NULL; + } + outpos = ascii_decode(s, e, PyUnicode_DATA(u)); + if (outpos == size) { + return u; + } - e = s + size; - data = writer.data; - outpos = ascii_decode(s, e, (Py_UCS1 *)data); + _PyUnicodeWriter writer; + _PyUnicodeWriter_Init(&writer); + writer.buffer = u; writer.pos = outpos; - if (writer.pos == size) - return _PyUnicodeWriter_Finish(&writer); + _PyUnicodeWriter_Update(&writer); - s += writer.pos; + data = writer.data; + s += outpos; kind = writer.kind; while (s < e) { unsigned char c = (unsigned char)*s; From 40750a2ad956fa306528a1a08c7a46ef67f1b93d Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Fri, 21 Jun 2019 19:23:32 +0900 Subject: [PATCH 2/7] code cleanup --- Objects/unicodeobject.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 2a7731153b7f8a..4aab9b5b9f0142 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -4879,15 +4879,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, _Py_error_handler error_handler, const char *errors, Py_ssize_t *consumed) { - const char *starts = s; - const char *end = s + size; - - Py_ssize_t startinpos; - Py_ssize_t endinpos; - const char *errmsg = ""; - PyObject *error_handler_obj = NULL; - PyObject *exc = NULL; - if (size == 0) { if (consumed) *consumed = 0; @@ -4901,7 +4892,10 @@ unicode_decode_utf8(const char *s, Py_ssize_t 
size, return get_latin1_char((unsigned char)s[0]); } - // Try simple ASCII case + const char *starts = s; + const char *end = s + size; + + // fast path: try ASCII string. PyObject *u = PyUnicode_New(size, 127); if (u == NULL) { return NULL; @@ -4911,12 +4905,18 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, return u; } + // Use _PyUnicodeWriter after fast path is failed. _PyUnicodeWriter writer; _PyUnicodeWriter_Init(&writer); writer.buffer = u; writer.pos = s - starts; _PyUnicodeWriter_Update(&writer); + Py_ssize_t startinpos, endinpos; + const char *errmsg = ""; + PyObject *error_handler_obj = NULL; + PyObject *exc = NULL; + while (s < end) { Py_UCS4 ch; int kind = writer.kind; @@ -6985,11 +6985,6 @@ PyUnicode_DecodeASCII(const char *s, const char *errors) { const char *starts = s; - int kind; - void *data; - Py_ssize_t startinpos; - Py_ssize_t endinpos; - Py_ssize_t outpos; const char *e = s + size; PyObject *error_handler_obj = NULL; PyObject *exc = NULL; @@ -7007,7 +7002,7 @@ PyUnicode_DecodeASCII(const char *s, if (u == NULL) { return NULL; } - outpos = ascii_decode(s, e, PyUnicode_DATA(u)); + Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u)); if (outpos == size) { return u; } @@ -7018,9 +7013,11 @@ PyUnicode_DecodeASCII(const char *s, writer.pos = outpos; _PyUnicodeWriter_Update(&writer); - data = writer.data; s += outpos; - kind = writer.kind; + int kind = writer.kind; + void *data = writer.data; + Py_ssize_t startinpos, endinpos; + while (s < e) { unsigned char c = (unsigned char)*s; if (c < 128) { From 60be28321a53b013ca59906fc5d923c94d719666 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Fri, 21 Jun 2019 21:55:39 +0900 Subject: [PATCH 3/7] fix error handler writes after allocated --- Objects/unicodeobject.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 4aab9b5b9f0142..ad916447b16d33 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -4908,6 +4908,7 @@ 
unicode_decode_utf8(const char *s, Py_ssize_t size, // Use _PyUnicodeWriter after fast path is failed. _PyUnicodeWriter writer; _PyUnicodeWriter_Init(&writer); + writer.min_length = size; writer.buffer = u; writer.pos = s - starts; _PyUnicodeWriter_Update(&writer); @@ -7009,6 +7010,7 @@ PyUnicode_DecodeASCII(const char *s, _PyUnicodeWriter writer; _PyUnicodeWriter_Init(&writer); + writer.min_length = size; writer.buffer = u; writer.pos = outpos; _PyUnicodeWriter_Update(&writer); From 0c5d9d4e2cdea4b49e0cae2f9d1b37b452622bca Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sun, 23 Jun 2019 00:11:46 +0900 Subject: [PATCH 4/7] add _PyUnicodeWriter_InitWithBuffer function --- Objects/unicodeobject.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index ad916447b16d33..af287bce37d909 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -265,8 +265,8 @@ unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value, /* Forward declaration */ static inline int _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); -static inline void -_PyUnicodeWriter_Update(_PyUnicodeWriter *writer); +static inline int +_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer); static PyObject * unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, const char *errors); @@ -4907,11 +4907,8 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, // Use _PyUnicodeWriter after fast path is failed. 
_PyUnicodeWriter writer; - _PyUnicodeWriter_Init(&writer); - writer.min_length = size; - writer.buffer = u; + _PyUnicodeWriter_InitWithBuffer(&writer, u); writer.pos = s - starts; - _PyUnicodeWriter_Update(&writer); Py_ssize_t startinpos, endinpos; const char *errmsg = ""; @@ -7009,11 +7006,8 @@ PyUnicode_DecodeASCII(const char *s, } _PyUnicodeWriter writer; - _PyUnicodeWriter_Init(&writer); - writer.min_length = size; - writer.buffer = u; + _PyUnicodeWriter_InitWithBuffer(&writer, u); writer.pos = outpos; - _PyUnicodeWriter_Update(&writer); s += outpos; int kind = writer.kind; @@ -13519,6 +13513,16 @@ _PyUnicodeWriter_Init(_PyUnicodeWriter *writer) assert(writer->kind <= PyUnicode_1BYTE_KIND); } +// Initialize _PyUnicodeWriter with initial buffer +static inline int +_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer) +{ + memset(writer, 0, sizeof(*writer)); + writer->buffer = buffer; + _PyUnicodeWriter_Update(writer); + writer->min_length = writer->size; +} + int _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, Py_ssize_t length, Py_UCS4 maxchar) From 3d34e179908a1ceac9e2a3dc7dfa9fc9f5350d3c Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sun, 23 Jun 2019 00:13:29 +0900 Subject: [PATCH 5/7] fix indent --- Objects/unicodeobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index af287bce37d909..a6e623c6db0be2 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6459,7 +6459,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, length after conversion to the true value. 
(But decoding error handler might have to resize the string) */ _PyUnicodeWriter_Init(&writer); - writer.min_length = size; + writer.min_length = size; if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) { goto onError; } From 7f5698d06e81fb89be3964358fcb72f2ebdf98cd Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sun, 23 Jun 2019 00:16:54 +0900 Subject: [PATCH 6/7] fix return type --- Objects/unicodeobject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index a6e623c6db0be2..625be4b5594b15 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -265,7 +265,7 @@ unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value, /* Forward declaration */ static inline int _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); -static inline int +static inline void _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer); static PyObject * unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, @@ -13514,7 +13514,7 @@ _PyUnicodeWriter_Init(_PyUnicodeWriter *writer) } // Initialize _PyUnicodeWriter with initial buffer -static inline int +static inline void _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer) { memset(writer, 0, sizeof(*writer)); From 24c7b2aef1fff69eced42b4798a6b4e763229da3 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sun, 23 Jun 2019 00:26:32 +0900 Subject: [PATCH 7/7] add NEWS entry --- .../Core and Builtins/2019-06-23-00-26-30.bpo-37348.pp8P-x.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2019-06-23-00-26-30.bpo-37348.pp8P-x.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2019-06-23-00-26-30.bpo-37348.pp8P-x.rst b/Misc/NEWS.d/next/Core and Builtins/2019-06-23-00-26-30.bpo-37348.pp8P-x.rst new file mode 100644 index 00000000000000..5859837d236854 --- /dev/null +++ b/Misc/NEWS.d/next/Core and 
Builtins/2019-06-23-00-26-30.bpo-37348.pp8P-x.rst @@ -0,0 +1,2 @@ +Optimized decoding of short ASCII strings with the UTF-8 and ascii codecs. +``b"foo".decode()`` is about 15% faster. Patch by Inada Naoki.