From dcd22193444736bfc14a4bc35a434216bc53877e Mon Sep 17 00:00:00 2001
From: Inada Naoki <songofacandy@gmail.com>
Date: Fri, 21 Jun 2019 17:41:56 +0900
Subject: [PATCH 1/7] Skip using _PyUnicodeWriter for simple ASCII string

---
 Objects/unicodeobject.c | 48 ++++++++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 17 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 4f8362590595cd..2a7731153b7f8a 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -265,6 +265,8 @@ unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
 /* Forward declaration */
 static inline int
 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
+static inline void
+_PyUnicodeWriter_Update(_PyUnicodeWriter *writer);
 static PyObject *
 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
                     const char *errors);
@@ -4877,7 +4879,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
                     _Py_error_handler error_handler, const char *errors,
                     Py_ssize_t *consumed)
 {
-    _PyUnicodeWriter writer;
     const char *starts = s;
     const char *end = s + size;
 
@@ -4900,13 +4901,22 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
         return get_latin1_char((unsigned char)s[0]);
     }
 
+    // Try simple ASCII case
+    PyObject *u = PyUnicode_New(size, 127);
+    if (u == NULL) {
+        return NULL;
+    }
+    s += ascii_decode(s, end, PyUnicode_DATA(u));
+    if (s == end) {
+        return u;
+    }
+
+    _PyUnicodeWriter writer;
     _PyUnicodeWriter_Init(&writer);
-    writer.min_length = size;
-    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
-        goto onError;
+    writer.buffer = u;
+    writer.pos = s - starts;
+    _PyUnicodeWriter_Update(&writer);
 
-    writer.pos = ascii_decode(s, end, writer.data);
-    s += writer.pos;
     while (s < end) {
         Py_UCS4 ch;
         int kind = writer.kind;
@@ -6975,13 +6985,12 @@ PyUnicode_DecodeASCII(const char *s,
                       const char *errors)
 {
     const char *starts = s;
-    _PyUnicodeWriter writer;
     int kind;
     void *data;
     Py_ssize_t startinpos;
     Py_ssize_t endinpos;
     Py_ssize_t outpos;
-    const char *e;
+    const char *e = s + size;
     PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
@@ -6993,19 +7002,24 @@ PyUnicode_DecodeASCII(const char *s,
     if (size == 1 && (unsigned char)s[0] < 128)
         return get_latin1_char((unsigned char)s[0]);
 
-    _PyUnicodeWriter_Init(&writer);
-    writer.min_length = size;
-    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
+    // Shortcut for simple case
+    PyObject *u = PyUnicode_New(size, 127);
+    if (u == NULL) {
         return NULL;
+    }
+    outpos = ascii_decode(s, e, PyUnicode_DATA(u));
+    if (outpos == size) {
+        return u;
+    }
 
-    e = s + size;
-    data = writer.data;
-    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
+    _PyUnicodeWriter writer;
+    _PyUnicodeWriter_Init(&writer);
+    writer.buffer = u;
     writer.pos = outpos;
-    if (writer.pos == size)
-        return _PyUnicodeWriter_Finish(&writer);
+    _PyUnicodeWriter_Update(&writer);
 
-    s += writer.pos;
+    data = writer.data;
+    s += outpos;
     kind = writer.kind;
     while (s < e) {
         unsigned char c = (unsigned char)*s;

From 40750a2ad956fa306528a1a08c7a46ef67f1b93d Mon Sep 17 00:00:00 2001
From: Inada Naoki <songofacandy@gmail.com>
Date: Fri, 21 Jun 2019 19:23:32 +0900
Subject: [PATCH 2/7] code cleanup

---
 Objects/unicodeobject.c | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 2a7731153b7f8a..4aab9b5b9f0142 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4879,15 +4879,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
                     _Py_error_handler error_handler, const char *errors,
                     Py_ssize_t *consumed)
 {
-    const char *starts = s;
-    const char *end = s + size;
-
-    Py_ssize_t startinpos;
-    Py_ssize_t endinpos;
-    const char *errmsg = "";
-    PyObject *error_handler_obj = NULL;
-    PyObject *exc = NULL;
-
     if (size == 0) {
         if (consumed)
             *consumed = 0;
@@ -4901,7 +4892,10 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
         return get_latin1_char((unsigned char)s[0]);
     }
 
-    // Try simple ASCII case
+    const char *starts = s;
+    const char *end = s + size;
+
+    // fast path: try ASCII string.
     PyObject *u = PyUnicode_New(size, 127);
     if (u == NULL) {
         return NULL;
@@ -4911,12 +4905,18 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
         return u;
     }
 
+    // Use _PyUnicodeWriter after fast path is failed.
     _PyUnicodeWriter writer;
     _PyUnicodeWriter_Init(&writer);
     writer.buffer = u;
     writer.pos = s - starts;
     _PyUnicodeWriter_Update(&writer);
 
+    Py_ssize_t startinpos, endinpos;
+    const char *errmsg = "";
+    PyObject *error_handler_obj = NULL;
+    PyObject *exc = NULL;
+
     while (s < end) {
         Py_UCS4 ch;
         int kind = writer.kind;
@@ -6985,11 +6985,6 @@ PyUnicode_DecodeASCII(const char *s,
                       const char *errors)
 {
     const char *starts = s;
-    int kind;
-    void *data;
-    Py_ssize_t startinpos;
-    Py_ssize_t endinpos;
-    Py_ssize_t outpos;
     const char *e = s + size;
     PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
@@ -7007,7 +7002,7 @@ PyUnicode_DecodeASCII(const char *s,
     if (u == NULL) {
         return NULL;
     }
-    outpos = ascii_decode(s, e, PyUnicode_DATA(u));
+    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
     if (outpos == size) {
         return u;
     }
@@ -7018,9 +7013,11 @@ PyUnicode_DecodeASCII(const char *s,
     writer.pos = outpos;
     _PyUnicodeWriter_Update(&writer);
 
-    data = writer.data;
     s += outpos;
-    kind = writer.kind;
+    int kind = writer.kind;
+    void *data = writer.data;
+    Py_ssize_t startinpos, endinpos;
+
     while (s < e) {
         unsigned char c = (unsigned char)*s;
         if (c < 128) {

From 60be28321a53b013ca59906fc5d923c94d719666 Mon Sep 17 00:00:00 2001
From: Inada Naoki <songofacandy@gmail.com>
Date: Fri, 21 Jun 2019 21:55:39 +0900
Subject: [PATCH 3/7] fix error handler writing past the allocated buffer

---
 Objects/unicodeobject.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 4aab9b5b9f0142..ad916447b16d33 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4908,6 +4908,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
     // Use _PyUnicodeWriter after fast path is failed.
     _PyUnicodeWriter writer;
     _PyUnicodeWriter_Init(&writer);
+    writer.min_length = size;
     writer.buffer = u;
     writer.pos = s - starts;
     _PyUnicodeWriter_Update(&writer);
@@ -7009,6 +7010,7 @@ PyUnicode_DecodeASCII(const char *s,
 
     _PyUnicodeWriter writer;
     _PyUnicodeWriter_Init(&writer);
+    writer.min_length = size;
     writer.buffer = u;
     writer.pos = outpos;
     _PyUnicodeWriter_Update(&writer);

From 0c5d9d4e2cdea4b49e0cae2f9d1b37b452622bca Mon Sep 17 00:00:00 2001
From: Inada Naoki <songofacandy@gmail.com>
Date: Sun, 23 Jun 2019 00:11:46 +0900
Subject: [PATCH 4/7] add _PyUnicodeWriter_InitWithBuffer function

---
 Objects/unicodeobject.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index ad916447b16d33..af287bce37d909 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -265,8 +265,8 @@ unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
 /* Forward declaration */
 static inline int
 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
-static inline void
-_PyUnicodeWriter_Update(_PyUnicodeWriter *writer);
+static inline int
+_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
 static PyObject *
 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
                     const char *errors);
@@ -4907,11 +4907,8 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
 
     // Use _PyUnicodeWriter after fast path is failed.
     _PyUnicodeWriter writer;
-    _PyUnicodeWriter_Init(&writer);
-    writer.min_length = size;
-    writer.buffer = u;
+    _PyUnicodeWriter_InitWithBuffer(&writer, u);
     writer.pos = s - starts;
-    _PyUnicodeWriter_Update(&writer);
 
     Py_ssize_t startinpos, endinpos;
     const char *errmsg = "";
@@ -7009,11 +7006,8 @@ PyUnicode_DecodeASCII(const char *s,
     }
 
     _PyUnicodeWriter writer;
-    _PyUnicodeWriter_Init(&writer);
-    writer.min_length = size;
-    writer.buffer = u;
+    _PyUnicodeWriter_InitWithBuffer(&writer, u);
     writer.pos = outpos;
-    _PyUnicodeWriter_Update(&writer);
 
     s += outpos;
     int kind = writer.kind;
@@ -13519,6 +13513,16 @@ _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
     assert(writer->kind <= PyUnicode_1BYTE_KIND);
 }
 
+// Initialize _PyUnicodeWriter with initial buffer
+static inline int
+_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
+{
+    memset(writer, 0, sizeof(*writer));
+    writer->buffer = buffer;
+    _PyUnicodeWriter_Update(writer);
+    writer->min_length = writer->size;
+}
+
 int
 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                  Py_ssize_t length, Py_UCS4 maxchar)

From 3d34e179908a1ceac9e2a3dc7dfa9fc9f5350d3c Mon Sep 17 00:00:00 2001
From: Inada Naoki <songofacandy@gmail.com>
Date: Sun, 23 Jun 2019 00:13:29 +0900
Subject: [PATCH 5/7] fix indent

---
 Objects/unicodeobject.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index af287bce37d909..a6e623c6db0be2 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6459,7 +6459,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
        length after conversion to the true value. (But decoding error
        handler might have to resize the string) */
     _PyUnicodeWriter_Init(&writer);
-     writer.min_length = size;
+    writer.min_length = size;
     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
         goto onError;
     }

From 7f5698d06e81fb89be3964358fcb72f2ebdf98cd Mon Sep 17 00:00:00 2001
From: Inada Naoki <songofacandy@gmail.com>
Date: Sun, 23 Jun 2019 00:16:54 +0900
Subject: [PATCH 6/7] fix return type

---
 Objects/unicodeobject.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index a6e623c6db0be2..625be4b5594b15 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -265,7 +265,7 @@ unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
 /* Forward declaration */
 static inline int
 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
-static inline int
+static inline void
 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
 static PyObject *
 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
@@ -13514,7 +13514,7 @@ _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
 }
 
 // Initialize _PyUnicodeWriter with initial buffer
-static inline int
+static inline void
 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
 {
     memset(writer, 0, sizeof(*writer));

From 24c7b2aef1fff69eced42b4798a6b4e763229da3 Mon Sep 17 00:00:00 2001
From: Inada Naoki <songofacandy@gmail.com>
Date: Sun, 23 Jun 2019 00:26:32 +0900
Subject: [PATCH 7/7] add NEWS entry

---
 .../Core and Builtins/2019-06-23-00-26-30.bpo-37348.pp8P-x.rst  | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2019-06-23-00-26-30.bpo-37348.pp8P-x.rst

diff --git a/Misc/NEWS.d/next/Core and Builtins/2019-06-23-00-26-30.bpo-37348.pp8P-x.rst b/Misc/NEWS.d/next/Core and Builtins/2019-06-23-00-26-30.bpo-37348.pp8P-x.rst
new file mode 100644
index 00000000000000..5859837d236854
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2019-06-23-00-26-30.bpo-37348.pp8P-x.rst	
@@ -0,0 +1,2 @@
+Optimized decoding of short ASCII strings with the UTF-8 and ASCII codecs.
+``b"foo".decode()`` is about 15% faster.  Patch by Inada Naoki.