Skip to content

bpo-37348: optimize decoding ASCII string #14283

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jun 24, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Optimized decoding of short ASCII strings with the UTF-8 and ASCII codecs.
``b"foo".decode()`` is about 15% faster. Patch by Inada Naoki.
85 changes: 51 additions & 34 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,8 @@ unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
/* Forward declarations: prototypes for file-local (static) functions so
   that they can be referenced before their definitions appear. */
static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
/* Initializes `writer` with an initial `buffer` object. */
static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
static PyObject *
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
const char *errors);
Expand Down Expand Up @@ -4877,16 +4879,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed)
{
_PyUnicodeWriter writer;
const char *starts = s;
const char *end = s + size;

Py_ssize_t startinpos;
Py_ssize_t endinpos;
const char *errmsg = "";
PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;

if (size == 0) {
if (consumed)
*consumed = 0;
Expand All @@ -4900,13 +4892,29 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
return get_latin1_char((unsigned char)s[0]);
}

_PyUnicodeWriter_Init(&writer);
writer.min_length = size;
if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
goto onError;
const char *starts = s;
const char *end = s + size;

// fast path: try ASCII string.
PyObject *u = PyUnicode_New(size, 127);
if (u == NULL) {
return NULL;
}
s += ascii_decode(s, end, PyUnicode_DATA(u));
if (s == end) {
return u;
}

// Use _PyUnicodeWriter after fast path is failed.
_PyUnicodeWriter writer;
_PyUnicodeWriter_InitWithBuffer(&writer, u);
writer.pos = s - starts;

Py_ssize_t startinpos, endinpos;
const char *errmsg = "";
PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;

writer.pos = ascii_decode(s, end, writer.data);
s += writer.pos;
while (s < end) {
Py_UCS4 ch;
int kind = writer.kind;
Expand Down Expand Up @@ -6451,7 +6459,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
length after conversion to the true value. (But decoding error
handler might have to resize the string) */
_PyUnicodeWriter_Init(&writer);
writer.min_length = size;
writer.min_length = size;
if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
goto onError;
}
Expand Down Expand Up @@ -6975,13 +6983,7 @@ PyUnicode_DecodeASCII(const char *s,
const char *errors)
{
const char *starts = s;
_PyUnicodeWriter writer;
int kind;
void *data;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
Py_ssize_t outpos;
const char *e;
const char *e = s + size;
PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
_Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Expand All @@ -6993,20 +6995,25 @@ PyUnicode_DecodeASCII(const char *s,
if (size == 1 && (unsigned char)s[0] < 128)
return get_latin1_char((unsigned char)s[0]);

_PyUnicodeWriter_Init(&writer);
writer.min_length = size;
if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
// Shortcut for simple case
PyObject *u = PyUnicode_New(size, 127);
if (u == NULL) {
return NULL;
}
Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
if (outpos == size) {
return u;
}

e = s + size;
data = writer.data;
outpos = ascii_decode(s, e, (Py_UCS1 *)data);
_PyUnicodeWriter writer;
_PyUnicodeWriter_InitWithBuffer(&writer, u);
writer.pos = outpos;
if (writer.pos == size)
return _PyUnicodeWriter_Finish(&writer);

s += writer.pos;
kind = writer.kind;
s += outpos;
int kind = writer.kind;
void *data = writer.data;
Py_ssize_t startinpos, endinpos;

while (s < e) {
unsigned char c = (unsigned char)*s;
if (c < 128) {
Expand Down Expand Up @@ -13506,6 +13513,16 @@ _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
assert(writer->kind <= PyUnicode_1BYTE_KIND);
}

/* Initialize a _PyUnicodeWriter around an initial buffer object.
 *
 * writer: the writer structure to set up; all of its bytes are cleared
 *         before anything else is stored in it.
 * buffer: the object installed as writer->buffer.
 *
 * After the buffer is installed, _PyUnicodeWriter_Update() is called on the
 * writer (presumably to refresh its cached fields from the buffer -- confirm
 * against its definition), and min_length is then set to whatever
 * writer->size holds after that call.
 */
static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
{
    /* Start from an all-zero writer so no field is left uninitialized. */
    memset(writer, 0, sizeof *writer);

    writer->buffer = buffer;
    _PyUnicodeWriter_Update(writer);

    /* The minimum length mirrors the size recorded on the writer. */
    writer->min_length = writer->size;
}

int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
Py_ssize_t length, Py_UCS4 maxchar)
Expand Down