From f26cf16f049520e77a9d966cfc6a8bdfa398ef7d Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 21 Dec 2022 11:34:02 -0800 Subject: [PATCH 1/7] BUG: Index with null value not serialized correctly to json --- pandas/_libs/src/ujson/python/objToJSON.c | 59 +++++++++++++++++------ pandas/tests/io/json/test_pandas.py | 12 ++--- 2 files changed, 50 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index a6f18e0aec4d9..6cacfc0a840d0 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1283,6 +1283,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, type_num = PyArray_TYPE(labels); for (i = 0; i < num; i++) { + int is_null = 0; // Whether current val is a null item = PyArray_GETITEM(labels, dataptr); if (!item) { NpyArr_freeLabels(ret, num); @@ -1320,9 +1321,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, if (is_datetimelike) { if (nanosecVal == get_nat()) { - len = 4; - cLabel = PyObject_Malloc(len + 1); - strncpy(cLabel, "null", len + 1); + is_null = 1; } else { if (enc->datetimeIso) { if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) { @@ -1348,17 +1347,38 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, len = strlen(cLabel); } } - } else { // Fallback to string representation - // Replace item with the string to keep it alive. - Py_SETREF(item, PyObject_Str(item)); - if (item == NULL) { - NpyArr_freeLabels(ret, num); - ret = 0; - break; + } else { + // NA values need special handling + if (PyFloat_Check(item)) { + double fval = PyFloat_AS_DOUBLE(item); + is_null = npy_isnan(fval); + } else if (item == Py_None || object_is_na_type(item)) { + is_null = 1; + } else if (object_is_decimal_type(item)) { + PyObject *is_null_obj = PyObject_CallMethod(item, + "is_nan", + NULL); + is_null = (is_null_obj == Py_True); + Py_DECREF(is_null_obj); + } else { + // Otherwise, fallback to string representation + // Replace item with the string to keep it alive. + Py_SETREF(item, PyObject_Str(item)); + if (item == NULL) { + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = (char *)PyUnicode_AsUTF8(item); + len = strlen(cLabel); } + } - cLabel = (char *)PyUnicode_AsUTF8(item); - len = strlen(cLabel); + if (is_null) { + len = 4; + cLabel = PyObject_Malloc(len + 1); + strncpy(cLabel, "null", len + 1); } // Add 1 to include NULL terminator @@ -1366,7 +1386,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, memcpy(ret[i], cLabel, len + 1); Py_DECREF(item); - if (is_datetimelike) { + if (is_datetimelike || is_null) { PyObject_Free(cLabel); } @@ -1512,8 +1532,17 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { tc->type = JT_UTF8; return; } else if (object_is_decimal_type(obj)) { - GET_TC(tc)->doubleValue = PyFloat_AsDouble(obj); - tc->type = JT_DOUBLE; + /* Check for null, since null can't go thru double path */ + PyObject *is_null_obj = PyObject_CallMethod(obj, + "is_nan", + NULL); + if (is_null_obj == Py_False) { + GET_TC(tc)->doubleValue = PyFloat_AsDouble(obj); + tc->type = JT_DOUBLE; + } else { + tc->type = JT_NULL; + } + Py_DECREF(is_null_obj); return; } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) { if (object_is_nat_type(obj)) { diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 4edd08014050e..f63f986f3c3b7 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1,6 +1,5 @@ import datetime from datetime import timedelta -from decimal import Decimal from io import StringIO import json import os @@ -1744,15 +1743,16 @@ def test_to_s3(self, s3_resource, s3so): timeout -= 0.1 assert timeout > 0, "Timed out waiting for file to appear on moto" - def test_json_pandas_nulls(self, nulls_fixture, request): + def test_json_pandas_nulls(self, nulls_fixture): # GH 31615 - if isinstance(nulls_fixture, Decimal): - mark = pytest.mark.xfail(reason="not implemented") - request.node.add_marker(mark) - result = DataFrame([[nulls_fixture]]).to_json() assert result == '{"0":{"0":null}}' + def test_json_pandas_index_nulls(self, nulls_fixture): + # GH 31801 + result = Series([1], index=[nulls_fixture]).to_json() + assert result == '{"null":1}' + def test_readjson_bool_series(self): # GH31464 result = read_json("[true, true, false]", typ="series") From 23cd66db9b07bdd46bdf38720aa2ade49b2f18d3 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 22 Dec 2022 09:12:35 -0800 Subject: [PATCH 2/7] add whatsnew --- doc/source/whatsnew/v2.0.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 208bbfa10b9b2..d9e5ec74b9a65 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -892,6 +892,8 @@ I/O - Fixed memory leak which stemmed from the initialization of the internal JSON module (:issue:`49222`) - Fixed issue where :func:`json_normalize` would incorrectly remove leading characters from column names that matched the ``sep`` argument (:issue:`49861`) - Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`) +- Bug in :meth:`DataFrame.to_json` where it would incorrectly use the string representations of NA-values instead of null when serializing an index (:issue:`31801`) +- Bug in :meth:`DataFrame.to_json` where it would error when serializing ``Decimal("NaN")`` (:issue:`50399`) Period ^^^^^^ From 8f5e6e5fa8330dbce6216437098a7d5acdb1b8ca Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 22 Dec 2022 09:13:39 -0800 Subject: [PATCH 3/7] Update v2.0.0.rst --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index d9e5ec74b9a65..8cf91fbbff5cc 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -893,7 +893,7 @@ I/O - Fixed issue where :func:`json_normalize` would incorrectly remove leading characters from column names that matched the ``sep`` argument (:issue:`49861`) - Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`) - Bug in :meth:`DataFrame.to_json` where it would incorrectly use the string representations of NA-values instead of null when serializing an index (:issue:`31801`) -- Bug in :meth:`DataFrame.to_json` where it would error when serializing ``Decimal("NaN")`` (:issue:`50399`) +- Bug in :meth:`DataFrame.to_json` where it would error when serializing ``Decimal("NaN")`` (:issue:`50400`) Period ^^^^^^ From 147cd887b3305b67dc4cf84eab3ff9312119ea2c Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 23 Dec 2022 10:21:31 -0800 Subject: [PATCH 4/7] Use Py_XDECREF --- pandas/_libs/src/ujson/python/objToJSON.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 6cacfc0a840d0..223565def10b5 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1359,7 +1359,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, "is_nan", NULL); is_null = (is_null_obj == Py_True); - Py_DECREF(is_null_obj); + Py_XDECREF(is_null_obj); } else { // Otherwise, fallback to string representation // Replace item with the string to keep it alive. @@ -1542,7 +1542,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } else { tc->type = JT_NULL; } - Py_DECREF(is_null_obj); + Py_XDECREF(is_null_obj); return; } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) { if (object_is_nat_type(obj)) { From 9afd12be5b2a7e432ad3d99b7ecf4af0794ba221 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 8 Jan 2023 11:09:11 -0800 Subject: [PATCH 5/7] address comments --- pandas/_libs/src/ujson/python/objToJSON.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 223565def10b5..8a0e69e4200b5 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1359,7 +1359,10 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, "is_nan", NULL); is_null = (is_null_obj == Py_True); - Py_XDECREF(is_null_obj); + if (!is_null_obj) { + goto INVALID; + } + Py_DECREF(is_null); } else { // Otherwise, fallback to string representation // Replace item with the string to keep it alive. @@ -1542,7 +1545,10 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } else { tc->type = JT_NULL; } - Py_XDECREF(is_null_obj); + if (!is_null_obj) { + goto INVALID; + } + Py_DECREF(is_null_obj); return; } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) { if (object_is_nat_type(obj)) { From 52e20ea5dc3e688002a828086f9c5f6a4e9bedb7 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 15 Jan 2023 12:09:39 -0800 Subject: [PATCH 6/7] address comments --- pandas/_libs/src/ujson/python/objToJSON.c | 44 +++++++++++++---------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 8a0e69e4200b5..a183aa38da040 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -276,6 +276,27 @@ static int is_simple_frame(PyObject *obj) { Py_DECREF(mgr); return ret; } +/* TODO: Consider unifying with checknull and co. + in missing.pyx */ +static int is_null_obj(PyObject* obj) { + int is_null = 0; + if (PyFloat_Check(obj)) { + double fval = PyFloat_AS_DOUBLE(obj); + is_null = npy_isnan(fval); + } else if (obj == Py_None || object_is_na_type(obj)) { + is_null = 1; + } else if (object_is_decimal_type(obj)) { + PyObject *is_null_obj = PyObject_CallMethod(item, + "is_nan", + NULL); + is_null = (is_null_obj == Py_True); + if (!is_null_obj) { + goto INVALID; + } + Py_DECREF(is_null_obj); + } + return is_null; +} static npy_int64 get_long_attr(PyObject *o, const char *attr) { // NB we are implicitly assuming that o is a Timedelta or Timestamp, or NaT @@ -1349,21 +1370,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, } } else { // NA values need special handling - if (PyFloat_Check(item)) { - double fval = PyFloat_AS_DOUBLE(item); - is_null = npy_isnan(fval); - } else if (item == Py_None || object_is_na_type(item)) { - is_null = 1; - } else if (object_is_decimal_type(item)) { - PyObject *is_null_obj = PyObject_CallMethod(item, - "is_nan", - NULL); - is_null = (is_null_obj == Py_True); - if (!is_null_obj) { - goto INVALID; - } - Py_DECREF(is_null); - } else { + is_null = is_null_obj(item); + if (!is_null) { // Otherwise, fallback to string representation // Replace item with the string to keep it alive. Py_SETREF(item, PyObject_Str(item)); @@ -1539,15 +1547,15 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PyObject *is_null_obj = PyObject_CallMethod(obj, "is_nan", NULL); + if (!is_null_obj) { + goto INVALID; + } if (is_null_obj == Py_False) { GET_TC(tc)->doubleValue = PyFloat_AsDouble(obj); tc->type = JT_DOUBLE; } else { tc->type = JT_NULL; } - if (!is_null_obj) { - goto INVALID; - } Py_DECREF(is_null_obj); return; } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) { From d3cecc57036eb3adfe257678b1a64663b3c12a49 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 15 Jan 2023 12:27:24 -0800 Subject: [PATCH 7/7] address comments --- pandas/_libs/src/ujson/python/objToJSON.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index a183aa38da040..513fa6abfc760 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -286,12 +286,12 @@ static int is_null_obj(PyObject* obj) { } else if (obj == Py_None || object_is_na_type(obj)) { is_null = 1; } else if (object_is_decimal_type(obj)) { - PyObject *is_null_obj = PyObject_CallMethod(item, + PyObject *is_null_obj = PyObject_CallMethod(obj, "is_nan", NULL); is_null = (is_null_obj == Py_True); if (!is_null_obj) { - goto INVALID; + return -1; } Py_DECREF(is_null_obj); } @@ -1371,6 +1371,11 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, } else { // NA values need special handling is_null = is_null_obj(item); + if (is_null == -1) { + // Something errored + // Return to let the error surface + return 0; + } if (!is_null) { // Otherwise, fallback to string representation // Replace item with the string to keep it alive.