Skip to content

gh-95382: Use cache for indentations in the JSON encoder #118636

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
182 changes: 118 additions & 64 deletions Modules/_json.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,11 @@ encoder_dealloc(PyObject *self);
static int
encoder_clear(PyEncoderObject *self);
static int
encoder_listencode_list(PyEncoderObject *s, PyUnicodeWriter *writer, PyObject *seq, PyObject *newline_indent);
encoder_listencode_list(PyEncoderObject *s, PyUnicodeWriter *writer, PyObject *seq, Py_ssize_t indent_level, PyObject *indent_cache);
static int
encoder_listencode_obj(PyEncoderObject *s, PyUnicodeWriter *writer, PyObject *obj, PyObject *newline_indent);
encoder_listencode_obj(PyEncoderObject *s, PyUnicodeWriter *writer, PyObject *obj, Py_ssize_t indent_level, PyObject *indent_cache);
static int
encoder_listencode_dict(PyEncoderObject *s, PyUnicodeWriter *writer, PyObject *dct, PyObject *newline_indent);
encoder_listencode_dict(PyEncoderObject *s, PyUnicodeWriter *writer, PyObject *dct, Py_ssize_t indent_level, PyObject *indent_cache);
static PyObject *
_encoded_const(PyObject *obj);
static void
Expand Down Expand Up @@ -1252,17 +1252,92 @@ encoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
return (PyObject *)s;
}


/* indent_cache is a list that contains intermixed values at even and odd
* positions:
*
* 2*k : '\n' + indent * (k + initial_indent_level)
* strings written after opening and before closing brackets
* 2*k-1 : item_separator + '\n' + indent * (k + initial_indent_level)
* strings written between items
*
* Its size is always an odd number.
*/
static PyObject *
_create_newline_indent(PyObject *indent, Py_ssize_t indent_level)
create_indent_cache(PyEncoderObject *s, Py_ssize_t indent_level)
{
PyObject *newline_indent = PyUnicode_FromOrdinal('\n');
if (newline_indent != NULL && indent_level) {
PyUnicode_AppendAndDel(&newline_indent,
PySequence_Repeat(indent, indent_level));
PySequence_Repeat(s->indent, indent_level));
}
if (newline_indent == NULL) {
return NULL;
}
PyObject *indent_cache = PyList_New(1);
if (indent_cache == NULL) {
Py_DECREF(newline_indent);
return NULL;
}
return newline_indent;
PyList_SET_ITEM(indent_cache, 0, newline_indent);
return indent_cache;
}

/* Extend indent_cache by adding values for the next level.
* It should have values for the indent_level-1 level before the call.
*/
static int
update_indent_cache(PyEncoderObject *s,
Py_ssize_t indent_level, PyObject *indent_cache)
{
assert(indent_level * 2 == PyList_GET_SIZE(indent_cache) + 1);
assert(indent_level > 0);
PyObject *newline_indent = PyList_GET_ITEM(indent_cache, (indent_level - 1)*2);
newline_indent = PyUnicode_Concat(newline_indent, s->indent);
if (newline_indent == NULL) {
return -1;
}
PyObject *separator_indent = PyUnicode_Concat(s->item_separator, newline_indent);
if (separator_indent == NULL) {
Py_DECREF(newline_indent);
return -1;
}

if (PyList_Append(indent_cache, separator_indent) < 0 ||
PyList_Append(indent_cache, newline_indent) < 0)
{
Py_DECREF(separator_indent);
Py_DECREF(newline_indent);
return -1;
}
Py_DECREF(separator_indent);
Py_DECREF(newline_indent);
return 0;
}

static PyObject *
get_item_separator(PyEncoderObject *s,
Py_ssize_t indent_level, PyObject *indent_cache)
{
assert(indent_level > 0);
if (indent_level * 2 > PyList_GET_SIZE(indent_cache)) {
if (update_indent_cache(s, indent_level, indent_cache) < 0) {
return NULL;
}
}
assert(indent_level * 2 < PyList_GET_SIZE(indent_cache));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
assert(indent_level * 2 < PyList_GET_SIZE(indent_cache));
assert(indent_level * 2 - 1 < PyList_GET_SIZE(indent_cache));

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

indent_level * 2 < PyList_GET_SIZE(indent_cache) is more strong condition.

return PyList_GET_ITEM(indent_cache, indent_level * 2 - 1);
}

static int
write_newline_indent(PyUnicodeWriter *writer,
Py_ssize_t indent_level, PyObject *indent_cache)
{
PyObject *newline_indent = PyList_GET_ITEM(indent_cache, indent_level * 2);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
PyObject *newline_indent = PyList_GET_ITEM(indent_cache, indent_level * 2);
assert(indent_level * 2 < PyList_GET_SIZE(indent_cache));
PyObject *newline_indent = PyList_GET_ITEM(indent_cache, indent_level * 2);

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The assert above is placed after conditional resizing the cache. It is useful to check that we always get the right cache size, independently from conditions. But here we do not change the cache.

return PyUnicodeWriter_WriteStr(writer, newline_indent);
}


static PyObject *
encoder_call(PyEncoderObject *self, PyObject *args, PyObject *kwds)
{
Expand All @@ -1280,20 +1355,20 @@ encoder_call(PyEncoderObject *self, PyObject *args, PyObject *kwds)
return NULL;
}

PyObject *newline_indent = NULL;
PyObject *indent_cache = NULL;
if (self->indent != Py_None) {
newline_indent = _create_newline_indent(self->indent, indent_level);
if (newline_indent == NULL) {
indent_cache = create_indent_cache(self, indent_level);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When called via the public interface indent_level is always zero. By removing the argument indent_level in the internal implementation this call and some other bits of the code can be a bit simplified.

(this is probably not worth creating a separate PR for, I leave it up to you)

if (indent_cache == NULL) {
PyUnicodeWriter_Discard(writer);
return NULL;
}
}
if (encoder_listencode_obj(self, writer, obj, newline_indent)) {
if (encoder_listencode_obj(self, writer, obj, indent_level, indent_cache)) {
PyUnicodeWriter_Discard(writer);
Py_XDECREF(newline_indent);
Py_XDECREF(indent_cache);
return NULL;
}
Py_XDECREF(newline_indent);
Py_XDECREF(indent_cache);

PyObject *str = PyUnicodeWriter_Finish(writer);
if (str == NULL) {
Expand Down Expand Up @@ -1381,7 +1456,8 @@ _steal_accumulate(PyUnicodeWriter *writer, PyObject *stolen)

static int
encoder_listencode_obj(PyEncoderObject *s, PyUnicodeWriter *writer,
PyObject *obj, PyObject *newline_indent)
PyObject *obj,
Py_ssize_t indent_level, PyObject *indent_cache)
{
/* Encode Python object obj to a JSON term */
PyObject *newobj;
Expand Down Expand Up @@ -1421,14 +1497,14 @@ encoder_listencode_obj(PyEncoderObject *s, PyUnicodeWriter *writer,
else if (PyList_Check(obj) || PyTuple_Check(obj)) {
if (_Py_EnterRecursiveCall(" while encoding a JSON object"))
return -1;
rv = encoder_listencode_list(s, writer, obj, newline_indent);
rv = encoder_listencode_list(s, writer, obj, indent_level, indent_cache);
_Py_LeaveRecursiveCall();
return rv;
}
else if (PyDict_Check(obj)) {
if (_Py_EnterRecursiveCall(" while encoding a JSON object"))
return -1;
rv = encoder_listencode_dict(s, writer, obj, newline_indent);
rv = encoder_listencode_dict(s, writer, obj, indent_level, indent_cache);
_Py_LeaveRecursiveCall();
return rv;
}
Expand Down Expand Up @@ -1462,7 +1538,7 @@ encoder_listencode_obj(PyEncoderObject *s, PyUnicodeWriter *writer,
Py_XDECREF(ident);
return -1;
}
rv = encoder_listencode_obj(s, writer, newobj, newline_indent);
rv = encoder_listencode_obj(s, writer, newobj, indent_level, indent_cache);
_Py_LeaveRecursiveCall();

Py_DECREF(newobj);
Expand All @@ -1485,7 +1561,7 @@ encoder_listencode_obj(PyEncoderObject *s, PyUnicodeWriter *writer,
static int
encoder_encode_key_value(PyEncoderObject *s, PyUnicodeWriter *writer, bool *first,
PyObject *dct, PyObject *key, PyObject *value,
PyObject *newline_indent,
Py_ssize_t indent_level, PyObject *indent_cache,
PyObject *item_separator)
{
PyObject *keystr = NULL;
Expand Down Expand Up @@ -1541,7 +1617,7 @@ encoder_encode_key_value(PyEncoderObject *s, PyUnicodeWriter *writer, bool *firs
if (PyUnicodeWriter_WriteStr(writer, s->key_separator) < 0) {
return -1;
}
if (encoder_listencode_obj(s, writer, value, newline_indent) < 0) {
if (encoder_listencode_obj(s, writer, value, indent_level, indent_cache) < 0) {
_PyErr_FormatNote("when serializing %T item %R", dct, key);
return -1;
}
Expand All @@ -1550,15 +1626,14 @@ encoder_encode_key_value(PyEncoderObject *s, PyUnicodeWriter *writer, bool *firs

static int
encoder_listencode_dict(PyEncoderObject *s, PyUnicodeWriter *writer,
PyObject *dct, PyObject *newline_indent)
PyObject *dct,
Py_ssize_t indent_level, PyObject *indent_cache)
{
/* Encode Python dict dct a JSON term */
PyObject *ident = NULL;
PyObject *items = NULL;
PyObject *key, *value;
bool first = true;
PyObject *new_newline_indent = NULL;
PyObject *separator_indent = NULL;

if (PyDict_GET_SIZE(dct) == 0) {
/* Fast path */
Expand All @@ -1585,19 +1660,13 @@ encoder_listencode_dict(PyEncoderObject *s, PyUnicodeWriter *writer,
goto bail;
}

PyObject *current_item_separator = s->item_separator; // borrowed reference
PyObject *separator = s->item_separator; // borrowed reference
if (s->indent != Py_None) {
new_newline_indent = PyUnicode_Concat(newline_indent, s->indent);
if (new_newline_indent == NULL) {
goto bail;
}
separator_indent = PyUnicode_Concat(current_item_separator, new_newline_indent);
if (separator_indent == NULL) {
goto bail;
}
// update item separator with a borrowed reference
current_item_separator = separator_indent;
if (PyUnicodeWriter_WriteStr(writer, new_newline_indent) < 0) {
indent_level++;
separator = get_item_separator(s, indent_level, indent_cache);
if (separator == NULL ||
write_newline_indent(writer, indent_level, indent_cache) < 0)
{
goto bail;
}
}
Expand All @@ -1618,8 +1687,8 @@ encoder_listencode_dict(PyEncoderObject *s, PyUnicodeWriter *writer,
key = PyTuple_GET_ITEM(item, 0);
value = PyTuple_GET_ITEM(item, 1);
if (encoder_encode_key_value(s, writer, &first, dct, key, value,
new_newline_indent,
current_item_separator) < 0)
indent_level, indent_cache,
separator) < 0)
goto bail;
}
Py_CLEAR(items);
Expand All @@ -1628,8 +1697,8 @@ encoder_listencode_dict(PyEncoderObject *s, PyUnicodeWriter *writer,
Py_ssize_t pos = 0;
while (PyDict_Next(dct, &pos, &key, &value)) {
if (encoder_encode_key_value(s, writer, &first, dct, key, value,
new_newline_indent,
current_item_separator) < 0)
indent_level, indent_cache,
separator) < 0)
goto bail;
}
}
Expand All @@ -1640,10 +1709,8 @@ encoder_listencode_dict(PyEncoderObject *s, PyUnicodeWriter *writer,
Py_CLEAR(ident);
}
if (s->indent != Py_None) {
Py_CLEAR(new_newline_indent);
Py_CLEAR(separator_indent);

if (PyUnicodeWriter_WriteStr(writer, newline_indent) < 0) {
indent_level--;
if (write_newline_indent(writer, indent_level, indent_cache) < 0) {
goto bail;
}
}
Expand All @@ -1656,20 +1723,17 @@ encoder_listencode_dict(PyEncoderObject *s, PyUnicodeWriter *writer,
bail:
Py_XDECREF(items);
Py_XDECREF(ident);
Py_XDECREF(separator_indent);
Py_XDECREF(new_newline_indent);
return -1;
}

static int
encoder_listencode_list(PyEncoderObject *s, PyUnicodeWriter *writer,
PyObject *seq, PyObject *newline_indent)
PyObject *seq,
Py_ssize_t indent_level, PyObject *indent_cache)
{
PyObject *ident = NULL;
PyObject *s_fast = NULL;
Py_ssize_t i;
PyObject *new_newline_indent = NULL;
PyObject *separator_indent = NULL;

ident = NULL;
s_fast = PySequence_Fast(seq, "_iterencode_list needs a sequence");
Expand Down Expand Up @@ -1702,28 +1766,21 @@ encoder_listencode_list(PyEncoderObject *s, PyUnicodeWriter *writer,

PyObject *separator = s->item_separator; // borrowed reference
if (s->indent != Py_None) {
new_newline_indent = PyUnicode_Concat(newline_indent, s->indent);
if (new_newline_indent == NULL) {
goto bail;
}

if (PyUnicodeWriter_WriteStr(writer, new_newline_indent) < 0) {
goto bail;
}

separator_indent = PyUnicode_Concat(separator, new_newline_indent);
if (separator_indent == NULL) {
indent_level++;
separator = get_item_separator(s, indent_level, indent_cache);
if (separator == NULL ||
write_newline_indent(writer, indent_level, indent_cache) < 0)
{
goto bail;
}
separator = separator_indent; // assign separator with borrowed reference
}
for (i = 0; i < PySequence_Fast_GET_SIZE(s_fast); i++) {
PyObject *obj = PySequence_Fast_GET_ITEM(s_fast, i);
if (i) {
if (PyUnicodeWriter_WriteStr(writer, separator) < 0)
goto bail;
}
if (encoder_listencode_obj(s, writer, obj, new_newline_indent)) {
if (encoder_listencode_obj(s, writer, obj, indent_level, indent_cache)) {
_PyErr_FormatNote("when serializing %T item %zd", seq, i);
goto bail;
}
Expand All @@ -1735,9 +1792,8 @@ encoder_listencode_list(PyEncoderObject *s, PyUnicodeWriter *writer,
}

if (s->indent != Py_None) {
Py_CLEAR(new_newline_indent);
Py_CLEAR(separator_indent);
if (PyUnicodeWriter_WriteStr(writer, newline_indent) < 0) {
indent_level--;
if (write_newline_indent(writer, indent_level, indent_cache) < 0) {
goto bail;
}
}
Expand All @@ -1751,8 +1807,6 @@ encoder_listencode_list(PyEncoderObject *s, PyUnicodeWriter *writer,
bail:
Py_XDECREF(ident);
Py_DECREF(s_fast);
Py_XDECREF(separator_indent);
Py_XDECREF(new_newline_indent);
return -1;
}

Expand Down
Loading