diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index f2bf446c3bb6d..d38f4991759a1 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -530,6 +530,7 @@ I/O
 - Bug in :func:`read_sql` when reading multiple timezone aware columns with the same column name (:issue:`44421`)
 - Bug in :func:`read_xml` stripping whitespace in string data (:issue:`53811`)
 - Bug in :meth:`DataFrame.to_html` where ``colspace`` was incorrectly applied in case of multi index columns (:issue:`53885`)
+- Bug in :meth:`DataFrame.to_json` where :class:`DatetimeArray`/:class:`DatetimeIndex` with non-nanosecond precision could not be serialized correctly (:issue:`53686`)
 - Bug when writing and reading empty Stata dta files where dtype information was lost (:issue:`46240`)
 - Bug where ``bz2`` was treated as a hard requirement (:issue:`53857`)
 
diff --git a/pandas/_libs/include/pandas/datetime/date_conversions.h b/pandas/_libs/include/pandas/datetime/date_conversions.h
index 3f9dad918938e..a5ad926924dc5 100644
--- a/pandas/_libs/include/pandas/datetime/date_conversions.h
+++ b/pandas/_libs/include/pandas/datetime/date_conversions.h
@@ -18,7 +18,10 @@ int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit);
 // up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z
 // while base="ns" yields "2020-01-01T00:00:00.000000000Z"
 // len is mutated to save the length of the returned string
-char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len);
+char *int64ToIso(int64_t value,
+                 NPY_DATETIMEUNIT valueUnit,
+                 NPY_DATETIMEUNIT base,
+                 size_t *len);
 
 // TODO(username): this function doesn't do a lot; should augment or
 // replace with scaleNanosecToUnit
diff --git a/pandas/_libs/include/pandas/datetime/pd_datetime.h b/pandas/_libs/include/pandas/datetime/pd_datetime.h
index 55aa046cf076b..3e362deb87807 100644
--- a/pandas/_libs/include/pandas/datetime/pd_datetime.h
+++ b/pandas/_libs/include/pandas/datetime/pd_datetime.h
@@ -34,7 +34,7 @@ typedef struct {
   npy_datetime (*npy_datetimestruct_to_datetime)(NPY_DATETIMEUNIT,
                                                  const npy_datetimestruct *);
   int (*scaleNanosecToUnit)(npy_int64 *, NPY_DATETIMEUNIT);
-  char *(*int64ToIso)(int64_t, NPY_DATETIMEUNIT, size_t *);
+  char *(*int64ToIso)(int64_t, NPY_DATETIMEUNIT, NPY_DATETIMEUNIT, size_t *);
   npy_datetime (*NpyDateTimeToEpoch)(npy_datetime, NPY_DATETIMEUNIT);
   char *(*PyDateTimeToIso)(PyObject *, NPY_DATETIMEUNIT, size_t *);
   npy_datetime (*PyDateTimeToEpoch)(PyObject *, NPY_DATETIMEUNIT);
@@ -73,8 +73,8 @@ static PandasDateTime_CAPI *PandasDateTimeAPI = NULL;
                                                  (npy_datetimestruct))
 #define scaleNanosecToUnit(value, unit)                                        \
   PandasDateTimeAPI->scaleNanosecToUnit((value), (unit))
-#define int64ToIso(value, base, len)                                           \
-  PandasDateTimeAPI->int64ToIso((value), (base), (len))
+#define int64ToIso(value, valueUnit, base, len)                                \
+  PandasDateTimeAPI->int64ToIso((value), (valueUnit), (base), (len))
 #define NpyDateTimeToEpoch(dt, base)                                           \
   PandasDateTimeAPI->NpyDateTimeToEpoch((dt), (base))
 #define PyDateTimeToIso(obj, base, len)                                        \
diff --git a/pandas/_libs/src/datetime/date_conversions.c b/pandas/_libs/src/datetime/date_conversions.c
index 84fc5507010ed..3bc3275be1cfe 100644
--- a/pandas/_libs/src/datetime/date_conversions.c
+++ b/pandas/_libs/src/datetime/date_conversions.c
@@ -41,11 +41,14 @@ int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) {
 }
 
 /* Converts the int64_t representation of a datetime to ISO; mutates len */
-char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
+char *int64ToIso(int64_t value,
+                 NPY_DATETIMEUNIT valueUnit,
+                 NPY_DATETIMEUNIT base,
+                 size_t *len) {
   npy_datetimestruct dts;
   int ret_code;
 
-  pandas_datetime_to_datetimestruct(value, NPY_FR_ns, &dts);
+  pandas_datetime_to_datetimestruct(value, valueUnit, &dts);
 
   *len = (size_t)get_datetime_iso_8601_strlen(0, base);
   char *result = PyObject_Malloc(*len);
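For context on the hunks above: `int64ToIso` previously always interpreted its int64 input as nanoseconds, so any non-nano `datetime64` value came out wrong. A minimal sketch of the user-facing symptom (GH 53686), assuming pandas 2.x with `as_unit` available; the exact output string is illustrative, not taken from the patch:

```python
import pandas as pd

# A second-resolution datetime column: the underlying int64 is
# 1672531200 (seconds since the epoch), not nanoseconds.
ser = pd.Series(pd.to_datetime(["2023-01-01"]).as_unit("s"))

# Before this fix, the raw int64 was formatted as if it were nanoseconds,
# yielding a timestamp ~1.67 seconds after 1970-01-01 instead of 2023-01-01.
print(ser.to_json(date_format="iso", date_unit="s"))
# expected after the fix: {"0":"2023-01-01T00:00:00Z"}
```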
diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c
index 65b468f268d75..1fa82215179a8 100644
--- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c
+++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c
@@ -131,6 +131,7 @@ typedef struct __PyObjectEncoder {
 
   int datetimeIso;
   NPY_DATETIMEUNIT datetimeUnit;
+  NPY_DATETIMEUNIT valueUnit;
 
   // output format style for pandas data types
   int outputFormat;
@@ -350,7 +351,8 @@ static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc,
 static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused),
                                       JSONTypeContext *tc, size_t *len) {
   NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
-  GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, base, len);
+  NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit;
+  GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len);
   return GET_TC(tc)->cStr;
 }
 
@@ -364,8 +366,9 @@ static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused),
 
 /* JSON callback */
 static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc,
                                      size_t *len) {
-  if (!PyDate_Check(obj)) {
-    PyErr_SetString(PyExc_TypeError, "Expected date object");
+  if (!PyDate_Check(obj) && !PyDateTime_Check(obj)) {
+    PyErr_SetString(PyExc_TypeError, "Expected date or datetime object");
+    ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
     return NULL;
   }
@@ -502,6 +505,10 @@ int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) {
       GET_TC(tc)->itemValue = obj;
       Py_INCREF(obj);
       ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array);
+      // Also write the resolution (unit) of the ndarray
+      PyArray_Descr *dtype = PyArray_DESCR(npyarr->array);
+      ((PyObjectEncoder *)tc->encoder)->valueUnit =
+          get_datetime_metadata_from_dtype(dtype).base;
       ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr;
       ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr;
     } else {
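The hunks above record the ndarray's storage resolution in the new `valueUnit` field so that `NpyDateTimeToIsoCallback` can hand it to `int64ToIso`. A rough Python analogue of the unit-aware conversion (the function name here is made up for illustration, not part of the patch):

```python
import numpy as np

def int64_to_iso(value: int, value_unit: str) -> str:
    # Interpret the raw int64 in its storage unit before formatting,
    # rather than unconditionally assuming nanoseconds as before.
    return np.datetime_as_string(np.datetime64(value, value_unit))

print(int64_to_iso(1672531200, "s"))   # 2023-01-01T00:00:00
print(int64_to_iso(1672531200, "ns"))  # 1970-01-01T00:00:01.672531200 (the old bug)
```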
@@ -1255,6 +1262,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
   char **ret;
   char *dataptr, *cLabel;
   int type_num;
+  PyArray_Descr *dtype;
   NPY_DATETIMEUNIT base = enc->datetimeUnit;
 
   if (!labels) {
@@ -1283,6 +1291,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
   stride = PyArray_STRIDE(labels, 0);
   dataptr = PyArray_DATA(labels);
   type_num = PyArray_TYPE(labels);
+  dtype = PyArray_DESCR(labels);
 
   for (i = 0; i < num; i++) {
     item = PyArray_GETITEM(labels, dataptr);
@@ -1293,7 +1302,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
     }
 
     int is_datetimelike = 0;
-    npy_int64 nanosecVal;
+    npy_int64 i8date;
+    NPY_DATETIMEUNIT dateUnit = NPY_FR_ns;
     if (PyTypeNum_ISDATETIME(type_num)) {
       is_datetimelike = 1;
       PyArray_VectorUnaryFunc *castfunc =
@@ -1303,35 +1313,37 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
                    "Cannot cast numpy dtype %d to long",
                    enc->npyType);
       }
-      castfunc(dataptr, &nanosecVal, 1, NULL, NULL);
+      castfunc(dataptr, &i8date, 1, NULL, NULL);
+      dateUnit = get_datetime_metadata_from_dtype(dtype).base;
     } else if (PyDate_Check(item) || PyDelta_Check(item)) {
       is_datetimelike = 1;
       if (PyObject_HasAttrString(item, "_value")) {
         // see test_date_index_and_values for case with non-nano
-        nanosecVal = get_long_attr(item, "_value");
+        i8date = get_long_attr(item, "_value");
       } else {
         if (PyDelta_Check(item)) {
-          nanosecVal = total_seconds(item) *
+          i8date = total_seconds(item) *
                        1000000000LL; // nanoseconds per second
         } else {
           // datetime.* objects don't follow above rules
-          nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns);
+          i8date = PyDateTimeToEpoch(item, NPY_FR_ns);
         }
       }
     }
 
     if (is_datetimelike) {
-      if (nanosecVal == get_nat()) {
+      if (i8date == get_nat()) {
         len = 4;
         cLabel = PyObject_Malloc(len + 1);
         strncpy(cLabel, "null", len + 1);
       } else {
         if (enc->datetimeIso) {
           if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) {
-            cLabel = int64ToIsoDuration(nanosecVal, &len);
+            // TODO(username): non-nano timedelta support?
+            cLabel = int64ToIsoDuration(i8date, &len);
           } else {
             if (type_num == NPY_DATETIME) {
-              cLabel = int64ToIso(nanosecVal, base, &len);
+              cLabel = int64ToIso(i8date, dateUnit, base, &len);
             } else {
               cLabel = PyDateTimeToIso(item, base, &len);
             }
@@ -1346,7 +1358,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
           int size_of_cLabel = 21; // 21 chars for int 64
           cLabel = PyObject_Malloc(size_of_cLabel);
           snprintf(cLabel, size_of_cLabel, "%" NPY_DATETIME_FMT,
-                   NpyDateTimeToEpoch(nanosecVal, base));
+                   NpyDateTimeToEpoch(i8date, base));
           len = strlen(cLabel);
         }
       }
@@ -1538,13 +1550,25 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
       tc->type = JT_UTF8;
       return;
     } else if (PyArray_IsScalar(obj, Datetime)) {
+      npy_int64 longVal;
       if (((PyDatetimeScalarObject *)obj)->obval == get_nat()) {
         tc->type = JT_NULL;
         return;
       }
+      PyArray_Descr *dtype = PyArray_DescrFromScalar(obj);
+      if (!PyTypeNum_ISDATETIME(dtype->type_num)) {
+        PyErr_Format(PyExc_ValueError, "Could not get resolution of datetime");
+        return;
+      }
+
+      PyArray_Descr *outcode = PyArray_DescrFromType(NPY_INT64);
+      PyArray_CastScalarToCtype(obj, &longVal, outcode);
+      Py_DECREF(outcode);
 
       if (enc->datetimeIso) {
-        pc->PyTypeToUTF8 = PyDateTimeToIsoCallback;
+        GET_TC(tc)->longValue = longVal;
+        pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback;
+        enc->valueUnit = get_datetime_metadata_from_dtype(dtype).base;
         tc->type = JT_UTF8;
       } else {
         NPY_DATETIMEUNIT base =
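The `Object_beginTypeContext` change covers the scalar path: a bare `np.datetime64` (for example, inside an object-dtype column) is now cast to int64 once, and its resolution is stored in `enc->valueUnit` so it takes the same unit-aware `NpyDateTimeToIsoCallback` route as array values. A hedged sketch of what this enables, mirroring the `date_obj` column in the test below; the printed output is illustrative:

```python
import numpy as np
import pandas as pd

# Object-dtype column holding non-nano np.datetime64 scalars; these hit
# the PyArray_IsScalar(obj, Datetime) branch patched above.
df = pd.DataFrame(
    {"when": pd.Series([np.datetime64("2023-01-01T11:22:33", "s")], dtype=object)}
)
print(df.to_json(date_format="iso", date_unit="s"))
# expected: {"when":{"0":"2023-01-01T11:22:33Z"}}
```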
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 11909bf56f05c..3596e8128f883 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -954,6 +954,44 @@ def test_date_unit(self, unit, datetime_frame):
         result = read_json(StringIO(json), date_unit=None)
         tm.assert_frame_equal(result, df)
 
+    @pytest.mark.parametrize("unit", ["s", "ms", "us"])
+    def test_iso_non_nano_datetimes(self, unit):
+        # Test that numpy datetimes in an Index or a column with non-nano
+        # resolution can be serialized correctly
+        # GH53686
+        index = DatetimeIndex(
+            [np.datetime64("2023-01-01T11:22:33.123456", unit)],
+            dtype=f"datetime64[{unit}]",
+        )
+        df = DataFrame(
+            {
+                "date": Series(
+                    [np.datetime64("2022-01-01T11:22:33.123456", unit)],
+                    dtype=f"datetime64[{unit}]",
+                    index=index,
+                ),
+                "date_obj": Series(
+                    [np.datetime64("2023-01-01T11:22:33.123456", unit)],
+                    dtype=object,
+                    index=index,
+                ),
+            },
+        )
+
+        buf = StringIO()
+        df.to_json(buf, date_format="iso", date_unit=unit)
+        buf.seek(0)
+
+        # read_json always reads datetimes in nanosecond resolution
+        # TODO: check_dtype/check_index_type should be removable
+        # once read_json gets non-nano support
+        tm.assert_frame_equal(
+            read_json(buf, convert_dates=["date", "date_obj"]),
+            df,
+            check_index_type=False,
+            check_dtype=False,
+        )
+
     def test_weird_nested_json(self):
         # this used to core dump the parser
         s = r"""{
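As the test's TODO notes, `read_json` still parses datetimes back at nanosecond resolution, which is why the round trip above needs `check_dtype=False`/`check_index_type=False`. The non-nano index itself exercises the `NpyArr_encodeLabels` path patched earlier; a small illustrative sketch (expected output is an assumption, and may vary by pandas version):

```python
import numpy as np
import pandas as pd

# Non-nano datetime labels are formatted through NpyArr_encodeLabels,
# so the JSON keys below should come out as correct ISO strings.
df = pd.DataFrame(
    {"x": [1]},
    index=pd.DatetimeIndex([np.datetime64("2023-01-01", "s")], dtype="datetime64[s]"),
)
print(df.to_json(orient="columns", date_format="iso", date_unit="s"))
# expected: {"x":{"2023-01-01T00:00:00Z":1}}
```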