From 48d2cf0a3bcf6d90d2334d7a6b50ec766142bc3d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 3 Jan 2020 12:36:52 -0500 Subject: [PATCH 01/31] Working C impl of timedelta ISO --- pandas/_libs/src/ujson/python/objToJSON.c | 19 ++++++++++++- .../tslibs/src/datetime/np_datetime_strings.c | 28 +++++++++++++++++++ .../tslibs/src/datetime/np_datetime_strings.h | 9 ++++++ 3 files changed, 55 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 2192539e24626..7b4db0f8a7b11 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1568,9 +1568,25 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, break; } - // TODO: vectorized timedelta solution if (enc->datetimeIso && (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { + if (type_num == NPY_TIMEDELTA) { + npy_int64 longVal; + PyArray_VectorUnaryFunc *castfunc = + PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64); + if (!castfunc) { + PyErr_Format(PyExc_ValueError, + "Cannot cast numpy dtype %d to long", + enc->npyType); + } + castfunc(dataptr, &longVal, 1, NULL, NULL); + pandas_timedeltastruct tds; + pandas_timedelta_to_timedeltastruct(longVal, NPY_FR_ns, &tds); + cLabel = make_iso_8601_timedelta(&tds, &len); + if (cLabel == NULL) { + // Error occurred + } + } else { // Timedelta-like objects PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item); if (td == NULL) { Py_DECREF(item); @@ -1591,6 +1607,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, cLabel = (char *)PyUnicode_AsUTF8(iso); Py_DECREF(iso); len = strlen(cLabel); + } } else if (PyTypeNum_ISDATETIME(type_num)) { NPY_DATETIMEUNIT base = enc->datetimeUnit; npy_int64 longVal; diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index 54ed6ecff21e2..6a12db029c3c4 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -905,3 +905,31 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, outlen); return -1; } + + +char *make_iso_8601_timedelta(pandas_timedeltastruct *tds, size_t *outlen) { + char *begin; + asprintf(&begin, "P%" NPY_INT64_FMT "DT%" NPY_INT32_FMT "H%" NPY_INT32_FMT "M%" NPY_INT32_FMT, + tds->days, tds->hrs, tds->min, tds->sec); + + char *append; + if (tds->ns != 0) { + asprintf(&append, ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "S", tds->ms, tds->us, tds->ns); + } else if (tds->us != 0) { + asprintf(&append, ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "S", tds->ms, tds->us); + } else if (tds->ms != 0) { + asprintf(&append, ".%03" NPY_INT32_FMT "S", tds->ms); + } else { + asprintf(&append, "%s", "S"); + } + + *outlen = strlen(begin) + strlen(append); + + // TODO: we are using builtin malloc calls here but freeing in JSON + // with PyObject_Free; that is not ideal + char *result = realloc(begin, *outlen); + strcat(result, append); + free(append); + + return result; +} diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index 880c34ea77638..cc0ea24422fc6 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -79,4 +79,13 @@ get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base); int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, NPY_DATETIMEUNIT base); + +/* + * Converts an pandas_timedeltastruct to an ISO 8601 string. + * + * Mutates outlen to provide size of (non-NULL terminated) string. + * + * Returns NULL on error. + */ +char *make_iso_8601_timedelta(pandas_timedeltastruct *tds, size_t *outlen); #endif // PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ From 9d0384b8162caf2c2ea3864866631e3e038098db Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 09:23:05 -0800 Subject: [PATCH 02/31] More consistent impl --- pandas/_libs/src/ujson/python/objToJSON.c | 8 +++--- .../tslibs/src/datetime/np_datetime_strings.c | 27 +++++++------------ .../tslibs/src/datetime/np_datetime_strings.h | 2 +- 3 files changed, 16 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index af0ee7bfb536b..705c4d7361bd9 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1503,7 +1503,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, npy_intp i, stride; char **ret; char *dataptr, *cLabel; - int type_num; + int type_num, ret_val; PRINTMARK(); if (!labels) { @@ -1555,8 +1555,10 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, castfunc(dataptr, &longVal, 1, NULL, NULL); pandas_timedeltastruct tds; pandas_timedelta_to_timedeltastruct(longVal, NPY_FR_ns, &tds); - cLabel = make_iso_8601_timedelta(&tds, &len); - if (cLabel == NULL) { + + cLabel = PyObject_Malloc(100); // TODO: Better bounds + ret_val = make_iso_8601_timedelta(&tds, cLabel, &len); + if (ret_val == -1) { // Error occurred } } else { // Timedelta-like objects diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index 6a12db029c3c4..cad6f734cbcae 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -907,29 +907,22 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, } -char *make_iso_8601_timedelta(pandas_timedeltastruct *tds, size_t *outlen) { - char *begin; - asprintf(&begin, "P%" NPY_INT64_FMT "DT%" NPY_INT32_FMT "H%" NPY_INT32_FMT "M%" NPY_INT32_FMT, +int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr, size_t *outlen) { + *outlen = 0; + // sprintf returns the number of characters required for formatting, so use that to move buffer + *outlen += sprintf(outstr, "P%" NPY_INT64_FMT "DT%" NPY_INT32_FMT "H%" NPY_INT32_FMT "M%" NPY_INT32_FMT, tds->days, tds->hrs, tds->min, tds->sec); + outstr += *outlen; - char *append; if (tds->ns != 0) { - asprintf(&append, ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "S", tds->ms, tds->us, tds->ns); + *outlen += sprintf(outstr, ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "S", tds->ms, tds->us, tds->ns); } else if (tds->us != 0) { - asprintf(&append, ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "S", tds->ms, tds->us); + *outlen += sprintf(outstr, ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "S", tds->ms, tds->us); } else if (tds->ms != 0) { - asprintf(&append, ".%03" NPY_INT32_FMT "S", tds->ms); + *outlen += sprintf(outstr, ".%03" NPY_INT32_FMT "S", tds->ms); } else { - asprintf(&append, "%s", "S"); + *outlen += sprintf(outstr, "%s", "S"); } - *outlen = strlen(begin) + strlen(append); - - // TODO: we are using builtin malloc calls here but freeing in JSON - // with PyObject_Free; that is not ideal - char *result = realloc(begin, *outlen); - strcat(result, append); - free(append); - - return result; + return 0; } diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index cc0ea24422fc6..268452e376021 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -87,5 +87,5 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, * * Returns NULL on error. */ -char *make_iso_8601_timedelta(pandas_timedeltastruct *tds, size_t *outlen); +int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr, size_t *outlen); #endif // PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ From fd9675e332e5575d0c1ded8855039b613144085e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 09:42:00 -0800 Subject: [PATCH 03/31] shared between python / numpy timedelta --- pandas/_libs/src/ujson/python/objToJSON.c | 47 ++++++++++------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 705c4d7361bd9..7a292c049c9b0 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1543,8 +1543,9 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, if (enc->datetimeIso && (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { + npy_int64 nanosecVal; if (type_num == NPY_TIMEDELTA) { - npy_int64 longVal; + PyArray_VectorUnaryFunc *castfunc = PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64); if (!castfunc) { @@ -1552,37 +1553,29 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, "Cannot cast numpy dtype %d to long", enc->npyType); } - castfunc(dataptr, &longVal, 1, NULL, NULL); - pandas_timedeltastruct tds; - pandas_timedelta_to_timedeltastruct(longVal, NPY_FR_ns, &tds); - - cLabel = PyObject_Malloc(100); // TODO: Better bounds - ret_val = make_iso_8601_timedelta(&tds, cLabel, &len); - if (ret_val == -1) { - // Error occurred - } - } else { // Timedelta-like objects - PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item); - if (td == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - PyObject *iso = PyObject_CallMethod(td, "isoformat", NULL); - Py_DECREF(td); - if (iso == NULL) { + castfunc(dataptr, &nanosecVal, 1, NULL, NULL); + } else { + PyObject *total_sec = PyObject_CallMethod(item, "total_seconds", NULL); + if (total_sec == NULL) { Py_DECREF(item); NpyArr_freeLabels(ret, num); ret = 0; break; } - - cLabel = (char *)PyUnicode_AsUTF8(iso); - Py_DECREF(iso); - len = strlen(cLabel); - } + double total_sec_c = PyFloat_AsDouble(total_sec); + nanosecVal = (npy_int64)(total_sec_c * 1000000000); + } + pandas_timedeltastruct tds; + pandas_timedelta_to_timedeltastruct(nanosecVal, NPY_FR_ns, &tds); + + cLabel = PyObject_Malloc(100); // TODO: Better bounds + ret_val = make_iso_8601_timedelta(&tds, cLabel, &len); + if (ret_val == -1) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } } else if (PyTypeNum_ISDATETIME(type_num)) { NPY_DATETIMEUNIT base = enc->datetimeUnit; npy_int64 longVal; From 33fc37b242ddfe406cc174f5f42128e3b3e5f215 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 09:47:36 -0800 Subject: [PATCH 04/31] Shared td handling code --- pandas/_libs/src/ujson/python/objToJSON.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 7a292c049c9b0..ddc213674a26c 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1554,16 +1554,12 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, enc->npyType); } castfunc(dataptr, &nanosecVal, 1, NULL, NULL); - } else { - PyObject *total_sec = PyObject_CallMethod(item, "total_seconds", NULL); - if (total_sec == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - double total_sec_c = PyFloat_AsDouble(total_sec); - nanosecVal = (npy_int64)(total_sec_c * 1000000000); + } else { // Python timedelta + if (PyObject_HasAttrString(item, "value")) { + nanosecVal = get_long_attr(item, "value"); + } else { + nanosecVal = total_seconds(item) * 1000000000LL; // nanoseconds per second + } } pandas_timedeltastruct tds; pandas_timedelta_to_timedeltastruct(nanosecVal, NPY_FR_ns, &tds); @@ -1574,7 +1570,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, Py_DECREF(item); NpyArr_freeLabels(ret, num); ret = 0; - break; + break; } } else if (PyTypeNum_ISDATETIME(type_num)) { NPY_DATETIMEUNIT base = enc->datetimeUnit; From e2e999515611d12e7cf1c91c4d726b50514786eb Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 09:54:47 -0800 Subject: [PATCH 05/31] added tests from cbertinato --- pandas/tests/io/json/test_pandas.py | 34 +++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index e909a4952948c..d95f358d5c6c9 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1032,6 +1032,40 @@ def test_mixed_timedelta_datetime(self): result = pd.read_json(frame.to_json(date_unit="ns"), dtype={"a": "int64"}) tm.assert_frame_equal(result, expected, check_index_type=False) + @pytest.mark.parametrize( + "date_format,expected", + [ + ("iso", '{"0":"P1DT0H0M0S","1":"P2DT0H0M0S"}'), + ("epoch", '{"0":86400000,"1":172800000}'), + ], + ) + def test_series_timedelta_to_json(self, date_format, expected): + # GH28156: to_json not correctly formatting Timedelta + s = Series(pd.timedelta_range(start="1D", periods=2)) + + result = s.to_json(date_format=date_format) + assert result == expected + + result = s.astype(object).to_json(date_format=date_format) + assert result == expected + + @pytest.mark.parametrize( + "date_format,expected", + [ + ("iso", '{"0":{"0":"P1DT0H0M0S","1":"P2DT0H0M0S"}}'), + ("epoch", '{"0":{"0":86400000,"1":172800000}}'), + ], + ) + def test_dataframe_timedelta_to_json(self, date_format, expected): + # GH28156: to_json not correctly formatting Timedelta + df = DataFrame(pd.timedelta_range(start="1D", periods=2)) + + result = df.to_json(date_format=date_format) + assert result == expected + + result = df.astype(object).to_json(date_format=date_format) + assert result == expected + def test_default_handler(self): value = object() frame = DataFrame({"a": [7, value]}) From a2cbd85bfde5e7733f1fb5beb933780061f93e39 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 10:18:52 -0800 Subject: [PATCH 06/31] shared code --- pandas/_libs/src/ujson/python/objToJSON.c | 49 +++++++++++++++++++---- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index ddc213674a26c..4bde35771198b 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -399,6 +399,7 @@ static char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) { PyErr_SetString(PyExc_ValueError, "Could not convert datetime value to string"); PyObject_Free(result); + return NULL; } // Note that get_datetime_iso_8601_strlen just gives a generic size @@ -407,6 +408,30 @@ static char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) { return result; } +/* Converts the int64_t representation of a duration to ISO; mutates len */ +static char *int64ToIsoDuration(int64_t value, size_t *len) { + pandas_timedeltastruct tds; + int ret_code; + + pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds); + + char *result = PyObject_Malloc(100); // TODO: Better bounds + if (result == NULL) { + PyErr_NoMemory(); + return NULL; + } + + ret_code = make_iso_8601_timedelta(&tds, result, len); + if (ret_code == -1) { + PyErr_SetString(PyExc_ValueError, + "Could not convert timedelta value to string"); + PyObject_Free(result); + return NULL; + } + + return result; +} + /* JSON callback. returns a char* and mutates the pointer to *len */ static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), JSONTypeContext *tc, size_t *len) { @@ -419,6 +444,13 @@ static npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) { return dt; } +/* JSON callback. returns a char* and mutates the pointer to *len */ +static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), + JSONTypeContext *tc, size_t *len) { + return int64ToIsoDuration(GET_TC(tc)->longValue, len); +} + + /* Convert PyDatetime To ISO C-string. mutates len */ static char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, size_t *len) { @@ -1503,7 +1535,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, npy_intp i, stride; char **ret; char *dataptr, *cLabel; - int type_num, ret_val; + int type_num; PRINTMARK(); if (!labels) { @@ -1561,12 +1593,9 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, nanosecVal = total_seconds(item) * 1000000000LL; // nanoseconds per second } } - pandas_timedeltastruct tds; - pandas_timedelta_to_timedeltastruct(nanosecVal, NPY_FR_ns, &tds); - - cLabel = PyObject_Malloc(100); // TODO: Better bounds - ret_val = make_iso_8601_timedelta(&tds, cLabel, &len); - if (ret_val == -1) { + + cLabel = int64ToIsoDuration(nanosecVal, &len); + if (cLabel == NULL) { Py_DECREF(item); NpyArr_freeLabels(ret, num); ret = 0; @@ -1711,7 +1740,11 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (enc->datetimeIso) { PRINTMARK(); - pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; + if (enc->npyType == NPY_TIMEDELTA) { + pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; + } else { + pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; + } // Currently no way to pass longVal to iso function, so use // state management GET_TC(tc)->longValue = longVal; From 88974b5cc721ec2b0fe35328f1575e4a6c2585b8 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 10:33:02 -0800 Subject: [PATCH 07/31] working tests --- pandas/_libs/src/ujson/python/objToJSON.c | 29 +++++++++++++---------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 4bde35771198b..2353ef6eb71cf 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1864,28 +1864,33 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { value = total_seconds(obj) * 1000000000LL; // nanoseconds per second } - unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - if (scaleNanosecToUnit(&value, unit) != 0) { + GET_TC(tc)->longValue = value; + + PRINTMARK(); + if (enc->datetimeIso) { + pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; + tc->type = JT_UTF8; + } else { + unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + if (scaleNanosecToUnit(&(GET_TC(tc)->longValue), unit) != 0) { // TODO: Add some kind of error handling here - } + } - exc = PyErr_Occurred(); + exc = PyErr_Occurred(); - if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { + if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { PRINTMARK(); goto INVALID; - } + } - if (value == get_nat()) { + if (value == get_nat()) { PRINTMARK(); tc->type = JT_NULL; return; + } + + tc->type = JT_LONG; } - - GET_TC(tc)->longValue = value; - - PRINTMARK(); - tc->type = JT_LONG; return; } else if (PyArray_IsScalar(obj, Integer)) { PRINTMARK(); From 65a727f7aa94a0428f721955d033ad4a61bf3a06 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 10:33:16 -0800 Subject: [PATCH 08/31] reformat --- pandas/_libs/src/ujson/python/objToJSON.c | 128 +++++++++++----------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 2353ef6eb71cf..f409fc47f2be0 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -410,26 +410,26 @@ static char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) { /* Converts the int64_t representation of a duration to ISO; mutates len */ static char *int64ToIsoDuration(int64_t value, size_t *len) { - pandas_timedeltastruct tds; - int ret_code; - - pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds); - - char *result = PyObject_Malloc(100); // TODO: Better bounds - if (result == NULL) { - PyErr_NoMemory(); - return NULL; - } + pandas_timedeltastruct tds; + int ret_code; + + pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds); + + char *result = PyObject_Malloc(100); // TODO: Better bounds + if (result == NULL) { + PyErr_NoMemory(); + return NULL; + } - ret_code = make_iso_8601_timedelta(&tds, result, len); - if (ret_code == -1) { - PyErr_SetString(PyExc_ValueError, + ret_code = make_iso_8601_timedelta(&tds, result, len); + if (ret_code == -1) { + PyErr_SetString(PyExc_ValueError, "Could not convert timedelta value to string"); - PyObject_Free(result); - return NULL; - } + PyObject_Free(result); + return NULL; + } - return result; + return result; } /* JSON callback. returns a char* and mutates the pointer to *len */ @@ -446,11 +446,10 @@ static npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) { /* JSON callback. returns a char* and mutates the pointer to *len */ static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), - JSONTypeContext *tc, size_t *len) { + JSONTypeContext *tc, size_t *len) { return int64ToIsoDuration(GET_TC(tc)->longValue, len); } - /* Convert PyDatetime To ISO C-string. mutates len */ static char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, size_t *len) { @@ -1575,32 +1574,33 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, if (enc->datetimeIso && (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { - npy_int64 nanosecVal; - if (type_num == NPY_TIMEDELTA) { + npy_int64 nanosecVal; + if (type_num == NPY_TIMEDELTA) { + + PyArray_VectorUnaryFunc *castfunc = PyArray_GetCastFunc( + PyArray_DescrFromType(type_num), NPY_INT64); + if (!castfunc) { + PyErr_Format(PyExc_ValueError, + "Cannot cast numpy dtype %d to long", + enc->npyType); + } + castfunc(dataptr, &nanosecVal, 1, NULL, NULL); + } else { // Python timedelta + if (PyObject_HasAttrString(item, "value")) { + nanosecVal = get_long_attr(item, "value"); + } else { + nanosecVal = total_seconds(item) * + 1000000000LL; // nanoseconds per second + } + } - PyArray_VectorUnaryFunc *castfunc = - PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64); - if (!castfunc) { - PyErr_Format(PyExc_ValueError, - "Cannot cast numpy dtype %d to long", - enc->npyType); + cLabel = int64ToIsoDuration(nanosecVal, &len); + if (cLabel == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; } - castfunc(dataptr, &nanosecVal, 1, NULL, NULL); - } else { // Python timedelta - if (PyObject_HasAttrString(item, "value")) { - nanosecVal = get_long_attr(item, "value"); - } else { - nanosecVal = total_seconds(item) * 1000000000LL; // nanoseconds per second - } - } - - cLabel = int64ToIsoDuration(nanosecVal, &len); - if (cLabel == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } } else if (PyTypeNum_ISDATETIME(type_num)) { NPY_DATETIMEUNIT base = enc->datetimeUnit; npy_int64 longVal; @@ -1741,9 +1741,9 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (enc->datetimeIso) { PRINTMARK(); if (enc->npyType == NPY_TIMEDELTA) { - pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; + pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; } else { - pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; + pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; } // Currently no way to pass longVal to iso function, so use // state management @@ -1864,32 +1864,32 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { value = total_seconds(obj) * 1000000000LL; // nanoseconds per second } - GET_TC(tc)->longValue = value; + GET_TC(tc)->longValue = value; PRINTMARK(); if (enc->datetimeIso) { - pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; - tc->type = JT_UTF8; + pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; + tc->type = JT_UTF8; } else { - unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - if (scaleNanosecToUnit(&(GET_TC(tc)->longValue), unit) != 0) { - // TODO: Add some kind of error handling here - } + unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + if (scaleNanosecToUnit(&(GET_TC(tc)->longValue), unit) != 0) { + // TODO: Add some kind of error handling here + } - exc = PyErr_Occurred(); + exc = PyErr_Occurred(); - if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { - PRINTMARK(); - goto INVALID; - } + if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { + PRINTMARK(); + goto INVALID; + } - if (value == get_nat()) { - PRINTMARK(); - tc->type = JT_NULL; - return; - } - - tc->type = JT_LONG; + if (value == get_nat()) { + PRINTMARK(); + tc->type = JT_NULL; + return; + } + + tc->type = JT_LONG; } return; } else if (PyArray_IsScalar(obj, Integer)) { From 5d84cc30f643d5512afe6d3bcf60c4b3c1ee36cf Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 10:47:33 -0800 Subject: [PATCH 09/31] Expanded test coverage --- pandas/tests/io/json/test_pandas.py | 33 +++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index d95f358d5c6c9..f5187192c64bc 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1032,21 +1032,36 @@ def test_mixed_timedelta_datetime(self): result = pd.read_json(frame.to_json(date_unit="ns"), dtype={"a": "int64"}) tm.assert_frame_equal(result, expected, check_index_type=False) + @pytest.mark.parametrize("as_index", [True, False]) + @pytest.mark.parametrize("as_object", [True, False]) @pytest.mark.parametrize( - "date_format,expected", + "date_format,exp_values", [ - ("iso", '{"0":"P1DT0H0M0S","1":"P2DT0H0M0S"}'), - ("epoch", '{"0":86400000,"1":172800000}'), + ("iso", {"x": "P1DT0H0M0S", "y":"P2DT0H0M0S"}), + ("epoch", {"x": 86400000, "y": 172800000}), ], ) - def test_series_timedelta_to_json(self, date_format, expected): + def test_series_timedelta_to_json(self, as_index, as_object, date_format, exp_values): # GH28156: to_json not correctly formatting Timedelta - s = Series(pd.timedelta_range(start="1D", periods=2)) - - result = s.to_json(date_format=date_format) - assert result == expected + if as_index: + s = Series(range(2), index=pd.timedelta_range(start="1D", periods=2)) + else: + s = Series(pd.timedelta_range(start="1D", periods=2)) - result = s.astype(object).to_json(date_format=date_format) + if as_index: + expected = '{{"{x}":0,"{y}":1}}'.format(**exp_values) + else: + # strings must be quoted as values, integers cannot be + if date_format == "iso": + expected = '{{"0":"{x}","1":"{y}"}}'.format(**exp_values) + else: + expected = '{{"0":{x},"1":{y}}}'.format(**exp_values) + + if as_object: + result = s.astype(object).to_json(date_format=date_format) + else: + result = s.to_json(date_format=date_format) + assert result == expected @pytest.mark.parametrize( From 5445333a335abfeac0d8a9a8233d7f17ec963624 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 11:01:33 -0800 Subject: [PATCH 10/31] fixed test --- pandas/tests/io/json/test_pandas.py | 32 ++++++++--------------------- 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index f5187192c64bc..c23c4179e6d71 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1037,25 +1037,26 @@ def test_mixed_timedelta_datetime(self): @pytest.mark.parametrize( "date_format,exp_values", [ - ("iso", {"x": "P1DT0H0M0S", "y":"P2DT0H0M0S"}), - ("epoch", {"x": 86400000, "y": 172800000}), + ("iso", {"x": "P1DT0H0M0S", "y": "P2DT0H0M0S", "z": "null"}), + ("epoch", {"x": 86400000, "y": 172800000, "z": "null"}), ], ) def test_series_timedelta_to_json(self, as_index, as_object, date_format, exp_values): # GH28156: to_json not correctly formatting Timedelta + data = [pd.Timedelta(days=1), pd.Timedelta(days=2), pd.NaT] if as_index: - s = Series(range(2), index=pd.timedelta_range(start="1D", periods=2)) + s = pd.Series(range(3), index=data) else: - s = Series(pd.timedelta_range(start="1D", periods=2)) + s = pd.Series(data) if as_index: - expected = '{{"{x}":0,"{y}":1}}'.format(**exp_values) + expected = '{{"{x}":0,"{y}":1,"{z}":2}}'.format(**exp_values) else: # strings must be quoted as values, integers cannot be if date_format == "iso": - expected = '{{"0":"{x}","1":"{y}"}}'.format(**exp_values) + expected = '{{"0":"{x}","1":"{y}","2":{z}}}'.format(**exp_values) else: - expected = '{{"0":{x},"1":{y}}}'.format(**exp_values) + expected = '{{"0":{x},"1":{y},"2":{z}}}'.format(**exp_values) if as_object: result = s.astype(object).to_json(date_format=date_format) @@ -1064,23 +1065,6 @@ def test_series_timedelta_to_json(self, as_index, as_object, date_format, exp_va assert result == expected - @pytest.mark.parametrize( - "date_format,expected", - [ - ("iso", '{"0":{"0":"P1DT0H0M0S","1":"P2DT0H0M0S"}}'), - ("epoch", '{"0":{"0":86400000,"1":172800000}}'), - ], - ) - def test_dataframe_timedelta_to_json(self, date_format, expected): - # GH28156: to_json not correctly formatting Timedelta - df = DataFrame(pd.timedelta_range(start="1D", periods=2)) - - result = df.to_json(date_format=date_format) - assert result == expected - - result = df.astype(object).to_json(date_format=date_format) - assert result == expected - def test_default_handler(self): value = object() frame = DataFrame({"a": [7, value]}) From 191d219fd3104d73d52a655ee8ce988c6a630ed0 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 11:34:55 -0800 Subject: [PATCH 11/31] better null handling --- pandas/_libs/src/ujson/python/objToJSON.c | 34 +++++++++++++++-------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index f409fc47f2be0..d9630ff5e1df1 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1593,13 +1593,20 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, 1000000000LL; // nanoseconds per second } } - - cLabel = int64ToIsoDuration(nanosecVal, &len); - if (cLabel == NULL) { + // JSON requires a string for the index so write "null" + // unclear if there is a standard for this + if (nanosecVal == get_nat()) { + len = 5; // TODO: shouldn't require extra space for terminator + cLabel = PyObject_Malloc(len); + strncpy(cLabel, "null", len); + } else { + cLabel = int64ToIsoDuration(nanosecVal, &len); + if (cLabel == NULL) { Py_DECREF(item); NpyArr_freeLabels(ret, num); ret = 0; break; + } } } else if (PyTypeNum_ISDATETIME(type_num)) { NPY_DATETIMEUNIT base = enc->datetimeUnit; @@ -1612,7 +1619,11 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, enc->npyType); } castfunc(dataptr, &longVal, 1, NULL, NULL); - if (enc->datetimeIso) { + if (longVal == get_nat()) { + len = 5; // TODO: shouldn't require extra space for terminator + cLabel = PyObject_Malloc(len); + strncpy(cLabel, "null", len); + } else if (enc->datetimeIso) { cLabel = int64ToIso(longVal, base, &len); } else { if (!scaleNanosecToUnit(&longVal, base)) { @@ -1625,6 +1636,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, } } else if (PyDateTime_Check(item) || PyDate_Check(item)) { NPY_DATETIMEUNIT base = enc->datetimeUnit; + + // TODO: null check here? if (enc->datetimeIso) { cLabel = PyDateTimeToIso((PyDateTime_Date *)item, base, &len); } else { @@ -1867,7 +1880,12 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { GET_TC(tc)->longValue = value; PRINTMARK(); - if (enc->datetimeIso) { + if (value == get_nat()) { + PRINTMARK(); + tc->type = JT_NULL; + return; + } + else if (enc->datetimeIso) { pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; tc->type = JT_UTF8; } else { @@ -1883,12 +1901,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } - if (value == get_nat()) { - PRINTMARK(); - tc->type = JT_NULL; - return; - } - tc->type = JT_LONG; } return; From 66a2a4314ad2875d6845b9d31a438d22889a4d6e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 11:38:39 -0800 Subject: [PATCH 12/31] expanded test --- pandas/tests/io/json/test_pandas.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index c23c4179e6d71..28b1ba5639909 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1045,9 +1045,13 @@ def test_series_timedelta_to_json(self, as_index, as_object, date_format, exp_va # GH28156: to_json not correctly formatting Timedelta data = [pd.Timedelta(days=1), pd.Timedelta(days=2), pd.NaT] if as_index: - s = pd.Series(range(3), index=data) + ser = pd.Series(range(3), index=data) + if as_object: + ser.index = ser.index.astype(object) else: - s = pd.Series(data) + ser = pd.Series(data) + if as_object: + ser = ser.astype(object) if as_index: expected = '{{"{x}":0,"{y}":1,"{z}":2}}'.format(**exp_values) @@ -1058,11 +1062,7 @@ def test_series_timedelta_to_json(self, as_index, as_object, date_format, exp_va else: expected = '{{"0":{x},"1":{y},"2":{z}}}'.format(**exp_values) - if as_object: - result = s.astype(object).to_json(date_format=date_format) - else: - result = s.to_json(date_format=date_format) - + result = ser.to_json(date_format=date_format) assert result == expected def test_default_handler(self): From 60b55370c299b9f5d5e7ccffed1d7ff8c72ce94b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 13:06:42 -0800 Subject: [PATCH 13/31] removed print --- pandas/_libs/src/ujson/python/objToJSON.c | 101 ++++++++++++++++------ 1 file changed, 75 insertions(+), 26 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index d9630ff5e1df1..86ff348e70df2 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1596,17 +1596,17 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, // JSON requires a string for the index so write "null" // unclear if there is a standard for this if (nanosecVal == get_nat()) { - len = 5; // TODO: shouldn't require extra space for terminator - cLabel = PyObject_Malloc(len); - strncpy(cLabel, "null", len); + len = 5; // TODO: shouldn't require extra space for terminator + cLabel = PyObject_Malloc(len); + strncpy(cLabel, "null", len); } else { - cLabel = int64ToIsoDuration(nanosecVal, &len); - if (cLabel == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } + cLabel = int64ToIsoDuration(nanosecVal, &len); + if (cLabel == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } } } else if (PyTypeNum_ISDATETIME(type_num)) { NPY_DATETIMEUNIT base = enc->datetimeUnit; @@ -1620,9 +1620,9 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, } castfunc(dataptr, &longVal, 1, NULL, NULL); if (longVal == get_nat()) { - len = 5; // TODO: shouldn't require extra space for terminator - cLabel = PyObject_Malloc(len); - strncpy(cLabel, "null", len); + len = 5; // TODO: shouldn't require extra space for terminator + cLabel = PyObject_Malloc(len); + strncpy(cLabel, "null", len); } else if (enc->datetimeIso) { cLabel = int64ToIso(longVal, base, &len); } else { @@ -1635,16 +1635,66 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, len = strlen(cLabel); } } else if (PyDateTime_Check(item) || PyDate_Check(item)) { - NPY_DATETIMEUNIT base = enc->datetimeUnit; + npy_int64 nanosecVal; + if (PyObject_HasAttrString(item, "value")) { + PRINTMARK(); + nanosecVal = get_long_attr(item, "value"); + } else { + PRINTMARK(); + nanosecVal = total_seconds(item) * + 1000000000LL; // nanoseconds per second + } - // TODO: null check here? - if (enc->datetimeIso) { - cLabel = PyDateTimeToIso((PyDateTime_Date *)item, base, &len); + if (nanosecVal == get_nat()) { + len = 5; // TODO: shouldn't require extra space for terminator + cLabel = PyObject_Malloc(len); + strncpy(cLabel, "null", len); } else { - cLabel = PyObject_Malloc(21); // 21 chars for int64 - sprintf(cLabel, "%" NPY_DATETIME_FMT, - PyDateTimeToEpoch(item, base)); - len = strlen(cLabel); + // TODO: null check here? + NPY_DATETIMEUNIT base = enc->datetimeUnit; + if (enc->datetimeIso) { + cLabel = + PyDateTimeToIso((PyDateTime_Date *)item, base, &len); + } else { + cLabel = PyObject_Malloc(21); // 21 chars for int64 + sprintf(cLabel, "%" NPY_DATETIME_FMT, + PyDateTimeToEpoch(item, base)); + len = strlen(cLabel); + } + } + } else if (PyDelta_Check(item)) { + npy_int64 nanosecVal; + if (PyObject_HasAttrString(item, "value")) { + PRINTMARK(); + nanosecVal = get_long_attr(item, "value"); + } else { + PRINTMARK(); + nanosecVal = total_seconds(item) * + 1000000000LL; // nanoseconds per second + } + + if (nanosecVal == get_nat()) { + len = 5; // TODO: shouldn't require extra space for terminator + cLabel = PyObject_Malloc(len); + strncpy(cLabel, "null", len); + } else { + if (enc->datetimeIso) { + cLabel = int64ToIsoDuration(nanosecVal, &len); + if (cLabel == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + } else { + NPY_DATETIMEUNIT base = enc->datetimeUnit; + cLabel = PyObject_Malloc(21); // 21 chars for int64 + if (!scaleNanosecToUnit(&nanosecVal, base)) { + // TODO: error handler + } + sprintf(cLabel, "%" NPY_DATETIME_FMT, nanosecVal); + len = strlen(cLabel); + } } } else { // Fallback to string representation PyObject *str = PyObject_Str(item); @@ -1881,11 +1931,10 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PRINTMARK(); if (value == get_nat()) { - PRINTMARK(); - tc->type = JT_NULL; - return; - } - else if (enc->datetimeIso) { + PRINTMARK(); + tc->type = JT_NULL; + return; + } else if (enc->datetimeIso) { pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; tc->type = JT_UTF8; } else { From dae5336c10954b569f2ca5b5e2e8e0ee07af3834 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 13:10:57 -0800 Subject: [PATCH 14/31] refactor --- pandas/_libs/src/ujson/python/objToJSON.c | 54 ++++++++--------------- 1 file changed, 19 insertions(+), 35 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 86ff348e70df2..1dcb30fb0f7e0 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1634,7 +1634,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, sprintf(cLabel, "%" NPY_INT64_FMT, longVal); len = strlen(cLabel); } - } else if (PyDateTime_Check(item) || PyDate_Check(item)) { + } else if (PyDateTime_Check(item) || PyDate_Check(item) || PyDelta_Check(item)) { npy_int64 nanosecVal; if (PyObject_HasAttrString(item, "value")) { PRINTMARK(); @@ -1650,49 +1650,33 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, cLabel = PyObject_Malloc(len); strncpy(cLabel, "null", len); } else { - // TODO: null check here? NPY_DATETIMEUNIT base = enc->datetimeUnit; if (enc->datetimeIso) { + if (PyDelta_Check(item)) { + cLabel = int64ToIsoDuration(nanosecVal, &len); + } else { cLabel = PyDateTimeToIso((PyDateTime_Date *)item, base, &len); - } else { - cLabel = PyObject_Malloc(21); // 21 chars for int64 - sprintf(cLabel, "%" NPY_DATETIME_FMT, - PyDateTimeToEpoch(item, base)); - len = strlen(cLabel); - } - } - } else if (PyDelta_Check(item)) { - npy_int64 nanosecVal; - if (PyObject_HasAttrString(item, "value")) { - PRINTMARK(); - nanosecVal = get_long_attr(item, "value"); - } else { - PRINTMARK(); - nanosecVal = total_seconds(item) * - 1000000000LL; // nanoseconds per second - } + } - if (nanosecVal == get_nat()) { - len = 5; // TODO: shouldn't require extra space for terminator - cLabel = PyObject_Malloc(len); - strncpy(cLabel, "null", len); - } else { - if (enc->datetimeIso) { - cLabel = int64ToIsoDuration(nanosecVal, &len); - if (cLabel == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } + if (cLabel == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } } else { - NPY_DATETIMEUNIT base = enc->datetimeUnit; cLabel = PyObject_Malloc(21); // 21 chars for int64 - if (!scaleNanosecToUnit(&nanosecVal, base)) { + + if (PyDelta_Check(item)) { + if (!scaleNanosecToUnit(&nanosecVal, base)) { // TODO: error handler + } + sprintf(cLabel, "%" NPY_DATETIME_FMT, nanosecVal); + } else { + sprintf(cLabel, "%" NPY_DATETIME_FMT, + PyDateTimeToEpoch(item, base)); } - sprintf(cLabel, "%" NPY_DATETIME_FMT, nanosecVal); len = strlen(cLabel); } } From 88df8bf58e9e6a0221c5a8378c16963cc659cd1d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 13:11:50 -0800 Subject: [PATCH 15/31] fix incorrect test --- pandas/tests/io/json/test_json_table_schema.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 2ac2acc6748d1..c0d40048a72fe 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -603,8 +603,7 @@ def test_timestamp_in_columns(self): result = df.to_json(orient="table") js = json.loads(result) assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000Z" - # TODO - below expectation is not correct; see GH 28256 - assert js["schema"]["fields"][2]["name"] == 10000 + assert js["schema"]["fields"][2]["name"] == "P0DT0H0M10S" @pytest.mark.parametrize( "case", From 4146d9f8402a94fd4de74372485394ca29f87de1 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 13:44:24 -0800 Subject: [PATCH 16/31] refactor --- pandas/_libs/src/ujson/python/objToJSON.c | 159 ++++++++-------------- 1 file changed, 55 insertions(+), 104 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 1dcb30fb0f7e0..3811a81a006be 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1535,6 +1535,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, char **ret; char *dataptr, *cLabel; int type_num; + NPY_DATETIMEUNIT base = enc->datetimeUnit; PRINTMARK(); if (!labels) { @@ -1572,114 +1573,59 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, break; } - if (enc->datetimeIso && - (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { - npy_int64 nanosecVal; - if (type_num == NPY_TIMEDELTA) { - - PyArray_VectorUnaryFunc *castfunc = PyArray_GetCastFunc( - PyArray_DescrFromType(type_num), NPY_INT64); - if (!castfunc) { - PyErr_Format(PyExc_ValueError, - "Cannot cast numpy dtype %d to long", - enc->npyType); - } - castfunc(dataptr, &nanosecVal, 1, NULL, NULL); - } else { // Python timedelta - if (PyObject_HasAttrString(item, "value")) { - nanosecVal = get_long_attr(item, "value"); - } else { - nanosecVal = total_seconds(item) * - 1000000000LL; // nanoseconds per second - } - } - // JSON requires a string for the index so write "null" - // unclear if there is a standard for this - if (nanosecVal == get_nat()) { - len = 5; // TODO: shouldn't require extra space for terminator - cLabel = PyObject_Malloc(len); - strncpy(cLabel, "null", len); - } else { + int is_datetimelike = 0; + npy_int64 nanosecVal; + if (PyTypeNum_ISDATETIME(type_num)) { + is_datetimelike = 1; + PyArray_VectorUnaryFunc *castfunc = PyArray_GetCastFunc( + PyArray_DescrFromType(type_num), NPY_INT64); + if (!castfunc) { + PyErr_Format(PyExc_ValueError, + "Cannot cast numpy dtype %d to long", + enc->npyType); + } + castfunc(dataptr, &nanosecVal, 1, NULL, NULL); + } else if (PyDate_Check(item) || PyDelta_Check(item)) { + is_datetimelike = 1; + if (PyObject_HasAttrString(item, "value")) { + nanosecVal = get_long_attr(item, "value"); + } else { + nanosecVal = total_seconds(item) * + 1000000000LL; // nanoseconds per second + } + } + + if (is_datetimelike) { + // JSON requires a string for the index so write "null" + // is there is a standard for this? + if (nanosecVal == get_nat()) { + len = 5; // TODO: shouldn't require extra space for terminator + cLabel = PyObject_Malloc(len); + strncpy(cLabel, "null", len); + } else { + if (enc->datetimeIso) { + if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) { cLabel = int64ToIsoDuration(nanosecVal, &len); - if (cLabel == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - } - } else if (PyTypeNum_ISDATETIME(type_num)) { - NPY_DATETIMEUNIT base = enc->datetimeUnit; - npy_int64 longVal; - PyArray_VectorUnaryFunc *castfunc = - PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64); - if (!castfunc) { - PyErr_Format(PyExc_ValueError, - "Cannot cast numpy dtype %d to long", - enc->npyType); - } - castfunc(dataptr, &longVal, 1, NULL, NULL); - if (longVal == get_nat()) { - len = 5; // TODO: shouldn't require extra space for terminator - cLabel = PyObject_Malloc(len); - strncpy(cLabel, "null", len); - } else if (enc->datetimeIso) { - cLabel = int64ToIso(longVal, base, &len); - } else { - if (!scaleNanosecToUnit(&longVal, base)) { - // TODO: This gets hit but somehow doesn't cause errors - // need to clean up (elsewhere in module as well) - } - cLabel = PyObject_Malloc(21); // 21 chars for int64 - sprintf(cLabel, "%" NPY_INT64_FMT, longVal); - len = strlen(cLabel); - } - } else if (PyDateTime_Check(item) || PyDate_Check(item) || PyDelta_Check(item)) { - npy_int64 nanosecVal; - if (PyObject_HasAttrString(item, "value")) { - PRINTMARK(); - nanosecVal = get_long_attr(item, "value"); - } else { - PRINTMARK(); - nanosecVal = total_seconds(item) * - 1000000000LL; // nanoseconds per second - } - - if (nanosecVal == get_nat()) { - len = 5; // TODO: shouldn't require extra space for terminator - cLabel = PyObject_Malloc(len); - strncpy(cLabel, "null", len); - } else { - NPY_DATETIMEUNIT base = enc->datetimeUnit; - if (enc->datetimeIso) { - if (PyDelta_Check(item)) { - cLabel = int64ToIsoDuration(nanosecVal, &len); - } else { - cLabel = - PyDateTimeToIso((PyDateTime_Date *)item, base, &len); - } - - if (cLabel == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } + } else { + if (type_num == NPY_DATETIME) { + cLabel = int64ToIso(nanosecVal, base, &len); } else { - cLabel = PyObject_Malloc(21); // 21 chars for int64 - - if (PyDelta_Check(item)) { - if (!scaleNanosecToUnit(&nanosecVal, base)) { - // TODO: error handler - } - sprintf(cLabel, "%" NPY_DATETIME_FMT, nanosecVal); - } else { - sprintf(cLabel, "%" NPY_DATETIME_FMT, - PyDateTimeToEpoch(item, base)); - } - len = strlen(cLabel); + cLabel = PyDateTimeToIso((PyDateTime_Date *)item, base, &len); } + } + if (cLabel == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + } else { + cLabel = PyObject_Malloc(21); // 21 chars for int64 + sprintf(cLabel, "%" NPY_DATETIME_FMT, + NpyDateTimeToEpoch(nanosecVal, base)); + len = strlen(cLabel); } + } } else { // Fallback to string representation PyObject *str = PyObject_Str(item); if (str == NULL) { @@ -1699,6 +1645,11 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, ret[i] = PyObject_Malloc(len + 1); memcpy(ret[i], cLabel, len + 1); + if (is_datetimelike) { + // these were created with PyObject_Malloc so free accordingly + PyObject_Free(cLabel); + } + if (PyErr_Occurred()) { NpyArr_freeLabels(ret, num); ret = 0; From 24a79107ea8af8cf030d8d4507f3278236690e28 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 13:44:43 -0800 Subject: [PATCH 17/31] reformat --- pandas/_libs/src/ujson/python/objToJSON.c | 93 ++++++++++++----------- 1 file changed, 47 insertions(+), 46 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 3811a81a006be..a1b8f251aaf59 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1576,56 +1576,57 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, int is_datetimelike = 0; npy_int64 nanosecVal; if (PyTypeNum_ISDATETIME(type_num)) { - is_datetimelike = 1; - PyArray_VectorUnaryFunc *castfunc = PyArray_GetCastFunc( - PyArray_DescrFromType(type_num), NPY_INT64); - if (!castfunc) { - PyErr_Format(PyExc_ValueError, - "Cannot cast numpy dtype %d to long", - enc->npyType); - } - castfunc(dataptr, &nanosecVal, 1, NULL, NULL); + is_datetimelike = 1; + PyArray_VectorUnaryFunc *castfunc = + PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64); + if (!castfunc) { + PyErr_Format(PyExc_ValueError, + "Cannot cast numpy dtype %d to long", + enc->npyType); + } + castfunc(dataptr, &nanosecVal, 1, NULL, NULL); } else if (PyDate_Check(item) || PyDelta_Check(item)) { - is_datetimelike = 1; - if (PyObject_HasAttrString(item, "value")) { - nanosecVal = get_long_attr(item, "value"); - } else { - nanosecVal = total_seconds(item) * - 1000000000LL; // nanoseconds per second - } - } + is_datetimelike = 1; + if (PyObject_HasAttrString(item, "value")) { + nanosecVal = get_long_attr(item, "value"); + } else { + nanosecVal = total_seconds(item) * + 1000000000LL; // nanoseconds per second + } + } if (is_datetimelike) { - // JSON requires a string for the index so write "null" - // is there is a standard for this? - if (nanosecVal == get_nat()) { - len = 5; // TODO: shouldn't require extra space for terminator - cLabel = PyObject_Malloc(len); - strncpy(cLabel, "null", len); - } else { - if (enc->datetimeIso) { - if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) { - cLabel = int64ToIsoDuration(nanosecVal, &len); - } else { - if (type_num == NPY_DATETIME) { - cLabel = int64ToIso(nanosecVal, base, &len); + // JSON requires a string for the index so write "null" + // is there is a standard for this? + if (nanosecVal == get_nat()) { + len = 5; // TODO: shouldn't require extra space for terminator + cLabel = PyObject_Malloc(len); + strncpy(cLabel, "null", len); + } else { + if (enc->datetimeIso) { + if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) { + cLabel = int64ToIsoDuration(nanosecVal, &len); + } else { + if (type_num == NPY_DATETIME) { + cLabel = int64ToIso(nanosecVal, base, &len); + } else { + cLabel = PyDateTimeToIso((PyDateTime_Date *)item, + base, &len); + } + } + if (cLabel == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } } else { - cLabel = PyDateTimeToIso((PyDateTime_Date *)item, base, &len); + cLabel = PyObject_Malloc(21); // 21 chars for int64 + sprintf(cLabel, "%" NPY_DATETIME_FMT, + NpyDateTimeToEpoch(nanosecVal, base)); + len = strlen(cLabel); } - } - if (cLabel == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - } else { - cLabel = PyObject_Malloc(21); // 21 chars for int64 - sprintf(cLabel, "%" NPY_DATETIME_FMT, - NpyDateTimeToEpoch(nanosecVal, base)); - len = strlen(cLabel); } - } } else { // Fallback to string representation PyObject *str = PyObject_Str(item); if (str == NULL) { @@ -1646,8 +1647,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, memcpy(ret[i], cLabel, len + 1); if (is_datetimelike) { - // these were created with PyObject_Malloc so free accordingly - PyObject_Free(cLabel); + // these were created with PyObject_Malloc so free accordingly + PyObject_Free(cLabel); } if (PyErr_Occurred()) { From b1e7da0873e234593e13a1a74e189625c5eb7876 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 14:06:37 -0800 Subject: [PATCH 18/31] more date testing --- pandas/tests/io/json/test_pandas.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 28b1ba5639909..85c0ac343e583 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1,4 +1,5 @@ from collections import OrderedDict +import datetime from datetime import timedelta from io import StringIO import json @@ -810,6 +811,29 @@ def test_convert_dates(self): result = read_json(json, typ="series") tm.assert_series_equal(result, ts) + @pytest.mark.parametrize("date_format", ["epoch", "iso"]) + @pytest.mark.parametrize("as_object", [True, False]) + @pytest.mark.parametrize("date_typ", [ + datetime.datetime, pd.Timestamp]) + def test_date_index_and_values(self, date_format, as_object, date_typ): + data = [date_typ(year=2020, month=1, day=1), pd.NaT] + if as_object: + data.append("a") + + ser = pd.Series(data, index=data) + result = ser.to_json(date_format=date_format) + + if date_format == "epoch": + expected = '{"1577836800000":1577836800000,"null":null}' + else: + expected = ('{"2020-01-01T00:00:00.000Z":"2020-01-01T00:00:00.000Z"' + ',"null":null}') + + if as_object: + expected = expected.replace("}", ',"a":"a"}') + + assert result == expected + @pytest.mark.parametrize( "infer_word", [ From 0046c3c83feaef791bbc9eabf4d522a42cece3b8 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 14:24:17 -0800 Subject: [PATCH 19/31] refactored with bug fix --- pandas/_libs/src/ujson/python/objToJSON.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index a1b8f251aaf59..7185631ca5d25 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1590,8 +1590,13 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, if (PyObject_HasAttrString(item, "value")) { nanosecVal = get_long_attr(item, "value"); } else { + if (PyDelta_Check(item)) { nanosecVal = total_seconds(item) * 1000000000LL; // nanoseconds per second + } else { + // datetime.* objects don't follow above rules + nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns); + } } } From 77b7bae7f4156b30a7875594654a6de804c66dd9 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 14:29:51 -0800 Subject: [PATCH 20/31] simplified timedelta test --- pandas/tests/io/json/test_pandas.py | 35 +++++++++-------------------- 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 85c0ac343e583..7731188345426 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1056,35 +1056,22 @@ def test_mixed_timedelta_datetime(self): result = pd.read_json(frame.to_json(date_unit="ns"), dtype={"a": "int64"}) tm.assert_frame_equal(result, expected, check_index_type=False) - @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize("as_object", [True, False]) - @pytest.mark.parametrize( - "date_format,exp_values", - [ - ("iso", {"x": "P1DT0H0M0S", "y": "P2DT0H0M0S", "z": "null"}), - ("epoch", {"x": 86400000, "y": 172800000, "z": "null"}), - ], - ) - def test_series_timedelta_to_json(self, as_index, as_object, date_format, exp_values): + @pytest.mark.parametrize("date_format", ["iso", "epoch"]) + def test_series_timedelta_to_json(self, as_object, date_format): # GH28156: to_json not correctly formatting Timedelta data = [pd.Timedelta(days=1), pd.Timedelta(days=2), pd.NaT] - if as_index: - ser = pd.Series(range(3), index=data) - if as_object: - ser.index = ser.index.astype(object) - else: - ser = pd.Series(data) - if as_object: - ser = ser.astype(object) + if as_object: + data.append("a") - if as_index: - expected = '{{"{x}":0,"{y}":1,"{z}":2}}'.format(**exp_values) + ser = pd.Series(data, index=data) + if date_format == "iso": + expected = '{"P1DT0H0M0S":"P1DT0H0M0S","P2DT0H0M0S":"P2DT0H0M0S","null":null}' else: - # strings must be quoted as values, integers cannot be - if date_format == "iso": - expected = '{{"0":"{x}","1":"{y}","2":{z}}}'.format(**exp_values) - else: - expected = '{{"0":{x},"1":{y},"2":{z}}}'.format(**exp_values) + expected = '{"86400000":86400000,"172800000":172800000,"null":null}' + + if as_object: + expected = expected.replace("}", ',"a":"a"}') result = ser.to_json(date_format=date_format) assert result == expected From 3ef4affc1f4cf5551fc929d0ef6101ed86bcd379 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 14:30:59 -0800 Subject: [PATCH 21/31] Added timedelta coverage --- pandas/tests/io/json/test_pandas.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 7731188345426..43972b9c6586d 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1058,9 +1058,10 @@ def test_mixed_timedelta_datetime(self): @pytest.mark.parametrize("as_object", [True, False]) @pytest.mark.parametrize("date_format", ["iso", "epoch"]) - def test_series_timedelta_to_json(self, as_object, date_format): + @pytest.mark.parametrize("timedelta_typ", [pd.Timedelta, timedelta]) + def test_timedelta_to_json(self, as_object, date_format, timedelta_typ): # GH28156: to_json not correctly formatting Timedelta - data = [pd.Timedelta(days=1), pd.Timedelta(days=2), pd.NaT] + data = [timedelta_typ(days=1), timedelta_typ(days=2), pd.NaT] if as_object: data.append("a") From 960dce69db92df7109ef2f80e7dafd5ab44a68c9 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 14:32:03 -0800 Subject: [PATCH 22/31] stylistic updates --- pandas/_libs/src/ujson/python/objToJSON.c | 14 +++++++------- pandas/tests/io/json/test_pandas.py | 12 +++++++----- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 7185631ca5d25..b3c4646a3d6a3 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1590,13 +1590,13 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, if (PyObject_HasAttrString(item, "value")) { nanosecVal = get_long_attr(item, "value"); } else { - if (PyDelta_Check(item)) { - nanosecVal = total_seconds(item) * - 1000000000LL; // nanoseconds per second - } else { - // datetime.* objects don't follow above rules - nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns); - } + if (PyDelta_Check(item)) { + nanosecVal = total_seconds(item) * + 1000000000LL; // nanoseconds per second + } else { + // datetime.* objects don't follow above rules + nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns); + } } } diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 43972b9c6586d..eee2a933928db 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -813,8 +813,7 @@ def test_convert_dates(self): @pytest.mark.parametrize("date_format", ["epoch", "iso"]) @pytest.mark.parametrize("as_object", [True, False]) - @pytest.mark.parametrize("date_typ", [ - datetime.datetime, pd.Timestamp]) + @pytest.mark.parametrize("date_typ", [datetime.datetime, pd.Timestamp]) def test_date_index_and_values(self, date_format, as_object, date_typ): data = [date_typ(year=2020, month=1, day=1), pd.NaT] if as_object: @@ -826,8 +825,9 @@ def test_date_index_and_values(self, date_format, as_object, date_typ): if date_format == "epoch": expected = '{"1577836800000":1577836800000,"null":null}' else: - expected = ('{"2020-01-01T00:00:00.000Z":"2020-01-01T00:00:00.000Z"' - ',"null":null}') + expected = ( + '{"2020-01-01T00:00:00.000Z":"2020-01-01T00:00:00.000Z"' ',"null":null}' + ) if as_object: expected = expected.replace("}", ',"a":"a"}') @@ -1067,7 +1067,9 @@ def test_timedelta_to_json(self, as_object, date_format, timedelta_typ): ser = pd.Series(data, index=data) if date_format == "iso": - expected = '{"P1DT0H0M0S":"P1DT0H0M0S","P2DT0H0M0S":"P2DT0H0M0S","null":null}' + expected = ( + '{"P1DT0H0M0S":"P1DT0H0M0S","P2DT0H0M0S":"P2DT0H0M0S","null":null}' + ) else: expected = '{"86400000":86400000,"172800000":172800000,"null":null}' From 6d2c8dabdcc2eb52102322f472ecc39ef1520e55 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 14:53:59 -0800 Subject: [PATCH 23/31] Removed unneeded timedelta import --- pandas/_libs/src/ujson/python/objToJSON.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index b3c4646a3d6a3..548f64a48850a 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -54,7 +54,6 @@ static PyTypeObject *cls_dataframe; static PyTypeObject *cls_series; static PyTypeObject *cls_index; static PyTypeObject *cls_nat; -PyObject *cls_timedelta; npy_int64 get_nat(void) { return NPY_MIN_INT64; } @@ -165,7 +164,6 @@ void *initObjToJSON(void) { cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index"); cls_series = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series"); - cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta"); Py_DECREF(mod_pandas); } From 4a94f15939cdfda29ecdbc0ba3b5871479eac065 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 15:38:42 -0800 Subject: [PATCH 24/31] style updates --- .../tslibs/src/datetime/np_datetime_strings.c | 23 ++++++++++++++----- .../tslibs/src/datetime/np_datetime_strings.h | 3 ++- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index cad6f734cbcae..39768104b8959 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -907,17 +907,28 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, } -int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr, size_t *outlen) { +int make_iso_8601_timedelta(pandas_timedeltastruct *tds, + char *outstr, size_t *outlen) { *outlen = 0; - // sprintf returns the number of characters required for formatting, so use that to move buffer - *outlen += sprintf(outstr, "P%" NPY_INT64_FMT "DT%" NPY_INT32_FMT "H%" NPY_INT32_FMT "M%" NPY_INT32_FMT, - tds->days, tds->hrs, tds->min, tds->sec); + *outlen += sprintf(outstr, + "P%" NPY_INT64_FMT + "DT%" NPY_INT32_FMT + "H%" NPY_INT32_FMT + "M%" NPY_INT32_FMT, + tds->days, tds->hrs, tds->min, tds->sec); outstr += *outlen; if (tds->ns != 0) { - *outlen += sprintf(outstr, ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "S", tds->ms, tds->us, tds->ns); + *outlen += sprintf(outstr, + ".%03" NPY_INT32_FMT + "%03" NPY_INT32_FMT + "%03" NPY_INT32_FMT + "S", tds->ms, tds->us, tds->ns); } else if (tds->us != 0) { - *outlen += sprintf(outstr, ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "S", tds->ms, tds->us); + *outlen += sprintf(outstr, + ".%03" NPY_INT32_FMT + "%03" NPY_INT32_FMT + "S", tds->ms, tds->us); } else if (tds->ms != 0) { *outlen += sprintf(outstr, ".%03" NPY_INT32_FMT "S", tds->ms); } else { diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index 268452e376021..0ac203cf027f4 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -87,5 +87,6 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, * * Returns NULL on error. */ -int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr, size_t *outlen); +int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr, + size_t *outlen); #endif // PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ From 40468bf14aae29dc169bc344d2a0667db6cedc76 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 16:29:04 -0800 Subject: [PATCH 25/31] replace sprintf with snprintf --- pandas/_libs/src/ujson/python/objToJSON.c | 4 +++- pandas/_libs/tslibs/src/datetime/np_datetime_strings.c | 10 +++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 548f64a48850a..cf41454481d6d 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -413,7 +413,9 @@ static char *int64ToIsoDuration(int64_t value, size_t *len) { pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds); - char *result = PyObject_Malloc(100); // TODO: Better bounds + // Max theoretical length of ISO Duration with 64 bit day + // as the largest unit is 70 characters + 1 for a null terminator + char *result = PyObject_Malloc(71); if (result == NULL) { PyErr_NoMemory(); return NULL; diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index 39768104b8959..be6d23c41051e 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -910,7 +910,7 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr, size_t *outlen) { *outlen = 0; - *outlen += sprintf(outstr, + *outlen += snprintf(outstr, 60, // max length for first part of str "P%" NPY_INT64_FMT "DT%" NPY_INT32_FMT "H%" NPY_INT32_FMT @@ -919,20 +919,20 @@ int make_iso_8601_timedelta(pandas_timedeltastruct *tds, outstr += *outlen; if (tds->ns != 0) { - *outlen += sprintf(outstr, + *outlen += snprintf(outstr, 12, ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "S", tds->ms, tds->us, tds->ns); } else if (tds->us != 0) { - *outlen += sprintf(outstr, + *outlen += snprintf(outstr, 9, ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "S", tds->ms, tds->us); } else if (tds->ms != 0) { - *outlen += sprintf(outstr, ".%03" NPY_INT32_FMT "S", tds->ms); + *outlen += snprintf(outstr, 6, ".%03" NPY_INT32_FMT "S", tds->ms); } else { - *outlen += sprintf(outstr, "%s", "S"); + *outlen += snprintf(outstr, 2, "%s", "S"); } return 0; From 0259370d757bcc646e8ec79f7d14222d93926428 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Jan 2020 17:11:28 -0800 Subject: [PATCH 26/31] ignore lint errors --- .../_libs/tslibs/src/datetime/np_datetime_strings.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index be6d23c41051e..b245ae5880ecb 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -910,7 +910,7 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr, size_t *outlen) { *outlen = 0; - *outlen += snprintf(outstr, 60, // max length for first part of str + *outlen += snprintf(outstr, 60, // NOLINT "P%" NPY_INT64_FMT "DT%" NPY_INT32_FMT "H%" NPY_INT32_FMT @@ -919,20 +919,22 @@ int make_iso_8601_timedelta(pandas_timedeltastruct *tds, outstr += *outlen; if (tds->ns != 0) { - *outlen += snprintf(outstr, 12, + *outlen += snprintf(outstr, 12, // NOLINT ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "S", tds->ms, tds->us, tds->ns); } else if (tds->us != 0) { - *outlen += snprintf(outstr, 9, + *outlen += snprintf(outstr, 9, // NOLINT ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "S", tds->ms, tds->us); } else if (tds->ms != 0) { - *outlen += snprintf(outstr, 6, ".%03" NPY_INT32_FMT "S", tds->ms); + *outlen += snprintf(outstr, 6, // NOLINT + ".%03" NPY_INT32_FMT "S", tds->ms); } else { - *outlen += snprintf(outstr, 2, "%s", "S"); + *outlen += snprintf(outstr, 2, // NOLINT + "%s", "S"); } return 0; From d1c00e501b96466e085f864ed1c1959f75ae3fe2 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 11 Jan 2020 15:25:56 -0800 Subject: [PATCH 27/31] Update test_pandas.py --- pandas/tests/io/json/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index eee2a933928db..53d0c9b0346b6 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -826,7 +826,7 @@ def test_date_index_and_values(self, date_format, as_object, date_typ): expected = '{"1577836800000":1577836800000,"null":null}' else: expected = ( - '{"2020-01-01T00:00:00.000Z":"2020-01-01T00:00:00.000Z"' ',"null":null}' + '{"2020-01-01T00:00:00.000Z":"2020-01-01T00:00:00.000Z","null":null}' ) if as_object: From 29f497f69f189a59d9e9411f706a9bb2a878bb2e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 20 Jan 2020 16:02:43 -0800 Subject: [PATCH 28/31] moved conversion func --- .../_libs/src/ujson/python/date_conversions.c | 26 +++++++++++ .../_libs/src/ujson/python/date_conversions.h | 2 + pandas/_libs/src/ujson/python/objToJSON.c | 46 ++++++------------- 3 files changed, 42 insertions(+), 32 deletions(-) diff --git a/pandas/_libs/src/ujson/python/date_conversions.c b/pandas/_libs/src/ujson/python/date_conversions.c index fc4bdef8463af..bcb1334d978ef 100644 --- a/pandas/_libs/src/ujson/python/date_conversions.c +++ b/pandas/_libs/src/ujson/python/date_conversions.c @@ -116,3 +116,29 @@ npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base) { npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts); return NpyDateTimeToEpoch(npy_dt, base); } + +/* Converts the int64_t representation of a duration to ISO; mutates len */ +char *int64ToIsoDuration(int64_t value, size_t *len) { + pandas_timedeltastruct tds; + int ret_code; + + pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds); + + // Max theoretical length of ISO Duration with 64 bit day + // as the largest unit is 70 characters + 1 for a null terminator + char *result = PyObject_Malloc(71); + if (result == NULL) { + PyErr_NoMemory(); + return NULL; + } + + ret_code = make_iso_8601_timedelta(&tds, result, len); + if (ret_code == -1) { + PyErr_SetString(PyExc_ValueError, + "Could not convert timedelta value to string"); + PyObject_Free(result); + return NULL; + } + + return result; +} diff --git a/pandas/_libs/src/ujson/python/date_conversions.h b/pandas/_libs/src/ujson/python/date_conversions.h index 45455f4d6128b..1b5cbf2a7e307 100644 --- a/pandas/_libs/src/ujson/python/date_conversions.h +++ b/pandas/_libs/src/ujson/python/date_conversions.h @@ -28,4 +28,6 @@ char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, size_t *len); // Convert a Python Date/Datetime to Unix epoch with resolution base npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base); +char *int64ToIsoDuration(int64_t value, size_t *len); + #endif diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index d082fa30b9c1f..c540cb5235954 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -348,32 +348,6 @@ static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), return (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen); } -/* Converts the int64_t representation of a duration to ISO; mutates len */ -static char *int64ToIsoDuration(int64_t value, size_t *len) { - pandas_timedeltastruct tds; - int ret_code; - - pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds); - - // Max theoretical length of ISO Duration with 64 bit day - // as the largest unit is 70 characters + 1 for a null terminator - char *result = PyObject_Malloc(71); - if (result == NULL) { - PyErr_NoMemory(); - return NULL; - } - - ret_code = make_iso_8601_timedelta(&tds, result, len); - if (ret_code == -1) { - PyErr_SetString(PyExc_ValueError, - "Could not convert timedelta value to string"); - PyObject_Free(result); - return NULL; - } - - return result; -} - /* JSON callback. returns a char* and mutates the pointer to *len */ static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), JSONTypeContext *tc, size_t *len) { @@ -381,6 +355,12 @@ static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), return int64ToIso(GET_TC(tc)->longValue, base, len); } +/* JSON callback. returns a char* and mutates the pointer to *len */ +static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), + JSONTypeContext *tc, size_t *len) { + return int64ToIsoDuration(GET_TC(tc)->longValue, len); +} + /* JSON callback */ static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, size_t *len) { @@ -1469,7 +1449,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, 1000000000LL; // nanoseconds per second } else { // datetime.* objects don't follow above rules - nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns); + nanosecVal = + PyDateTimeToEpoch((PyDateTime_Date *)item, NPY_FR_ns); } } } @@ -1700,7 +1681,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PRINTMARK(); NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - GET_TC(tc)->longValue = PyDateTimeToEpoch((PyDateTime_Date *)obj, base); + GET_TC(tc)->longValue = + PyDateTimeToEpoch((PyDateTime_Date *)obj, base); tc->type = JT_LONG; } return; @@ -1726,7 +1708,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PRINTMARK(); NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - GET_TC(tc)->longValue = PyDateTimeToEpoch((PyDateTime_Date *)obj, base); + GET_TC(tc)->longValue = + PyDateTimeToEpoch((PyDateTime_Date *)obj, base); tc->type = JT_LONG; } return; @@ -1739,8 +1722,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { value = total_seconds(obj) * 1000000000LL; // nanoseconds per second } - GET_TC(tc)->longValue = value; - PRINTMARK(); if (value == get_nat()) { PRINTMARK(); @@ -1751,7 +1732,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { tc->type = JT_UTF8; } else { unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - if (scaleNanosecToUnit(&(GET_TC(tc)->longValue), unit) != 0) { + if (scaleNanosecToUnit(&value, unit) != 0) { // TODO: Add some kind of error handling here } @@ -1764,6 +1745,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { tc->type = JT_LONG; } + GET_TC(tc)->longValue = value; return; } else if (PyArray_IsScalar(obj, Integer)) { PRINTMARK(); From 35d4a4bf36153d0bf0cb9cab76b89e5f28c9cc2c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 20 Jan 2020 16:09:42 -0800 Subject: [PATCH 29/31] fix note --- pandas/_libs/tslibs/src/datetime/np_datetime_strings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index 0ac203cf027f4..200a71ff0c2b7 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -85,7 +85,7 @@ make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, * * Mutates outlen to provide size of (non-NULL terminated) string. * - * Returns NULL on error. + * Currently has no error handling */ int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr, size_t *outlen); From ebe58c76cc3cbaaf5942c4627b6e3b968534a176 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 11 Feb 2020 16:45:08 -0800 Subject: [PATCH 30/31] Whatsnew --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 381578ad13bdd..cf2e6d331a334 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -187,6 +187,7 @@ I/O ``coerce_timestamps``; following pyarrow's default allows writing nanosecond timestamps with ``version="2.0"`` (:issue:`31652`). - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`) +- Bug in :meth:`DataFrame.to_json` where ``Timedelta`` objects would not be serialized correctly with ``date_format="iso"`` (:issue:`28256`) Plotting ^^^^^^^^ From ef08ad6c0bb4b1ded4d6f00d21bbb77f0a6055a3 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 16 Mar 2020 19:43:34 -0700 Subject: [PATCH 31/31] more comprehensive testing --- pandas/tests/io/json/test_ujson.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index e86667626deda..34dd9ba9bc7b6 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -16,7 +16,7 @@ from pandas._libs.tslib import Timestamp import pandas.compat as compat -from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, date_range +from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, Timedelta, date_range import pandas._testing as tm @@ -1103,3 +1103,24 @@ def test_encode_set(self): for v in dec: assert v in s + + @pytest.mark.parametrize( + "td", + [ + Timedelta(days=366), + Timedelta(days=-1), + Timedelta(hours=13, minutes=5, seconds=5), + Timedelta(hours=13, minutes=20, seconds=30), + Timedelta(days=-1, nanoseconds=5), + Timedelta(nanoseconds=1), + Timedelta(microseconds=1, nanoseconds=1), + Timedelta(milliseconds=1, microseconds=1, nanoseconds=1), + Timedelta(milliseconds=999, microseconds=999, nanoseconds=999), + ], + ) + def test_encode_timedelta_iso(self, td): + # GH 28256 + result = ujson.encode(td, iso_dates=True) + expected = f'"{td.isoformat()}"' + + assert result == expected