Skip to content

Improve ISO Date Performance for JSON #30496

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jan 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions asv_bench/benchmarks/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,30 @@ def peakmem_to_json_wide(self, orient, frame):
df.to_json(self.fname, orient=orient)


class ToJSONISO(BaseIO):
    """Time ``DataFrame.to_json`` with ``date_format="iso"`` for each ``orient``.

    The frame built in ``setup`` has an hourly datetime index plus two
    timedelta columns and two datetime columns.
    """

    fname = "__test__.json"
    params = [["split", "columns", "index", "values", "records"]]
    param_names = ["orient"]

    def setup(self, orient):
        # Same row count for the index and every column.
        n_rows = 10 ** 5
        hourly_index = date_range("20000101", periods=n_rows, freq="H")
        td_values = timedelta_range(start=1, periods=n_rows, freq="s")
        ts_values = date_range(start=1, periods=n_rows, freq="s")

        # Each range is reused for two columns; key order is kept as-is.
        columns = {
            "td_1": td_values,
            "td_2": td_values,
            "ts_1": ts_values,
            "ts_2": ts_values,
        }
        self.df = DataFrame(columns, index=hourly_index)

    def time_iso_format(self, orient):
        # Serialises to a string (no path is passed), using ISO date output.
        self.df.to_json(orient=orient, date_format="iso")


class ToJSONLines(BaseIO):

fname = "__test__.json"
Expand Down
124 changes: 57 additions & 67 deletions pandas/_libs/src/ujson/python/objToJSON.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ static PyTypeObject *cls_dataframe;
static PyTypeObject *cls_series;
static PyTypeObject *cls_index;
static PyTypeObject *cls_nat;
PyObject *cls_timestamp;
PyObject *cls_timedelta;

npy_int64 get_nat(void) { return NPY_MIN_INT64; }
Expand Down Expand Up @@ -166,7 +165,6 @@ void *initObjToJSON(void) {
cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index");
cls_series =
(PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series");
cls_timestamp = PyObject_GetAttrString(mod_pandas, "Timestamp");
cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta");
Py_DECREF(mod_pandas);
}
Expand Down Expand Up @@ -408,30 +406,25 @@ static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc),
return (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen);
}

/* returns a char* and mutates the pointer to *len */
static char *NpyDateTimeToIso(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
size_t *len) {
/* Converts the int64_t representation of a datetime to ISO; mutates len */
static char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
npy_datetimestruct dts;
int ret_code;
int64_t longVal = GET_TC(tc)->longValue;

pandas_datetime_to_datetimestruct(longVal, NPY_FR_ns, &dts);
pandas_datetime_to_datetimestruct(value, NPY_FR_ns, &dts);

NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
*len = (size_t)get_datetime_iso_8601_strlen(0, base);
char *result = PyObject_Malloc(*len);

if (result == NULL) {
PyErr_NoMemory();
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
return NULL;
}

ret_code = make_iso_8601_datetime(&dts, result, *len, base);
if (ret_code != 0) {
PyErr_SetString(PyExc_ValueError,
"Could not convert datetime value to string");
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
PyObject_Free(result);
}

Expand All @@ -441,30 +434,33 @@ static char *NpyDateTimeToIso(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
return result;
}

/* JSON callback: formats the int64 held in the type context (longValue) as an
 * ISO string at the encoder's datetimeUnit. Returns the char* produced by
 * int64ToIso, which also writes the string length through len. */
static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused),
                                      JSONTypeContext *tc, size_t *len) {
    PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
    const int64_t value = GET_TC(tc)->longValue;

    return int64ToIso(value, enc->datetimeUnit, len);
}

/* Runs dt through scaleNanosecToUnit for the requested base and returns the
 * scaled value. The status returned by scaleNanosecToUnit is not checked. */
static npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) {
    npy_datetime scaled = dt;

    scaleNanosecToUnit(&scaled, base);
    return scaled;
}

static char *PyDateTimeToIso(JSOBJ obj, JSONTypeContext *tc, size_t *len) {
/* Convert PyDatetime To ISO C-string. mutates len */
static char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base,
size_t *len) {
npy_datetimestruct dts;
int ret;

if (!PyDateTime_Check(obj)) {
// TODO: raise TypeError
}

ret = convert_pydatetime_to_datetimestruct(obj, &dts);
if (ret != 0) {
if (!PyErr_Occurred()) {
PyErr_SetString(PyExc_ValueError,
"Could not convert PyDateTime to numpy datetime");
}
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
return NULL;
}

NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
*len = (size_t)get_datetime_iso_8601_strlen(0, base);
char *result = PyObject_Malloc(*len);
ret = make_iso_8601_datetime(&dts, result, *len, base);
Expand All @@ -473,7 +469,6 @@ static char *PyDateTimeToIso(JSOBJ obj, JSONTypeContext *tc, size_t *len) {
PRINTMARK();
PyErr_SetString(PyExc_ValueError,
"Could not convert datetime value to string");
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
PyObject_Free(result);
return NULL;
}
Expand All @@ -484,6 +479,19 @@ static char *PyDateTimeToIso(JSOBJ obj, JSONTypeContext *tc, size_t *len) {
return result;
}

/* JSON callback */
static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In another follow-up I think I'll move the conversion routines to another file and keep only the core JSON serialization functionality within this one; that makes it a little easier to grok the difference between functions used as callbacks and those used to convert values into various formats

size_t *len) {

if (!PyDateTime_Check(obj)) {
PyErr_SetString(PyExc_TypeError, "Expected datetime object");
return NULL;
}

NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
return PyDateTimeToIso(obj, base, len);
}

static npy_datetime PyDateTimeToEpoch(PyObject *obj, NPY_DATETIMEUNIT base) {
npy_datetimestruct dts;
int ret;
Expand Down Expand Up @@ -1518,7 +1526,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
npy_intp num) {
// NOTE this function steals a reference to labels.
PyObject *item = NULL;
npy_intp i, stride, len;
size_t len;
npy_intp i, stride;
char **ret;
char *dataptr, *cLabel;
int type_num;
Expand Down Expand Up @@ -1559,8 +1568,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
break;
}

// TODO: for any matches on type_num (date and timedeltas) should use a
// vectorized solution to convert to epoch or iso formats
// TODO: vectorized timedelta solution
if (enc->datetimeIso &&
(type_num == NPY_TIMEDELTA || PyDelta_Check(item))) {
PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item);
Expand All @@ -1583,54 +1591,36 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
cLabel = (char *)PyUnicode_AsUTF8(iso);
Py_DECREF(iso);
len = strlen(cLabel);
} else if (PyTypeNum_ISDATETIME(type_num) || PyDateTime_Check(item) ||
PyDate_Check(item)) {
PyObject *ts = PyObject_CallFunction(cls_timestamp, "(O)", item);
if (ts == NULL) {
Py_DECREF(item);
NpyArr_freeLabels(ret, num);
ret = 0;
break;
} else if (PyTypeNum_ISDATETIME(type_num)) {
NPY_DATETIMEUNIT base = enc->datetimeUnit;
npy_int64 longVal;
PyArray_VectorUnaryFunc *castfunc =
PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64);
if (!castfunc) {
PyErr_Format(PyExc_ValueError,
"Cannot cast numpy dtype %d to long",
enc->npyType);
}

castfunc(dataptr, &longVal, 1, NULL, NULL);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you need a Py_DECREF anywhere here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't think so - here's a snippet in NumPy where there is no DECREF

https://github.com/numpy/numpy/blob/5ce770ae3de63861c768229573397cadd052f712/numpy/core/src/multiarray/scalarapi.c#L212

And testing locally did segfault when I tried adding a DECREF

if (enc->datetimeIso) {
PyObject *iso = PyObject_CallMethod(ts, "isoformat", NULL);
Py_DECREF(ts);
if (iso == NULL) {
Py_DECREF(item);
NpyArr_freeLabels(ret, num);
ret = 0;
break;
cLabel = int64ToIso(longVal, base, &len);
} else {
if (!scaleNanosecToUnit(&longVal, base)) {
// TODO: This gets hit but somehow doesn't cause errors
// need to clean up (elsewhere in module as well)
}

cLabel = (char *)PyUnicode_AsUTF8(iso);
Py_DECREF(iso);
cLabel = PyObject_Malloc(21); // 21 chars for int64
sprintf(cLabel, "%" NPY_INT64_FMT, longVal);
len = strlen(cLabel);
}
} else if (PyDateTime_Check(item) || PyDate_Check(item)) {
NPY_DATETIMEUNIT base = enc->datetimeUnit;
if (enc->datetimeIso) {
cLabel = PyDateTimeToIso((PyDateTime_Date *)item, base, &len);
} else {
npy_int64 value;
// TODO: refactor to not duplicate what goes on in
// beginTypeContext
if (PyObject_HasAttrString(ts, "value")) {
PRINTMARK();
value = get_long_attr(ts, "value");
} else {
PRINTMARK();
value = total_seconds(ts) *
1000000000LL; // nanoseconds per second
}
Py_DECREF(ts);

NPY_DATETIMEUNIT unit = enc->datetimeUnit;
if (scaleNanosecToUnit(&value, unit) != 0) {
Py_DECREF(item);
NpyArr_freeLabels(ret, num);
ret = 0;
break;
}

char buf[21] = {0}; // 21 chars for 2**63 as string
cLabel = buf;
sprintf(buf, "%" NPY_INT64_FMT, value);
cLabel = PyObject_Malloc(21); // 21 chars for int64
sprintf(cLabel, "%" NPY_DATETIME_FMT,
PyDateTimeToEpoch(item, base));
len = strlen(cLabel);
}
} else { // Fallback to string representation
Expand Down Expand Up @@ -1740,7 +1730,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {

if (enc->datetimeIso) {
PRINTMARK();
pc->PyTypeToUTF8 = NpyDateTimeToIso;
pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback;
// Currently no way to pass longVal to iso function, so use
// state management
GET_TC(tc)->longValue = longVal;
Expand Down Expand Up @@ -1815,7 +1805,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
PRINTMARK();
if (enc->datetimeIso) {
PRINTMARK();
pc->PyTypeToUTF8 = PyDateTimeToIso;
pc->PyTypeToUTF8 = PyDateTimeToIsoCallback;
tc->type = JT_UTF8;
} else {
PRINTMARK();
Expand All @@ -1841,7 +1831,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
PRINTMARK();
if (enc->datetimeIso) {
PRINTMARK();
pc->PyTypeToUTF8 = PyDateTimeToIso;
pc->PyTypeToUTF8 = PyDateTimeToIsoCallback;
tc->type = JT_UTF8;
} else {
PRINTMARK();
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -854,7 +854,7 @@ def test_date_format_frame(self, date, date_unit):
json = df.to_json(date_format="iso")
result = read_json(json)
expected = df.copy()
# expected.index = expected.index.tz_localize("UTC")
expected.index = expected.index.tz_localize("UTC")
expected["date"] = expected["date"].dt.tz_localize("UTC")
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -884,7 +884,7 @@ def test_date_format_series(self, date, date_unit):
json = ts.to_json(date_format="iso")
result = read_json(json, typ="series")
expected = ts.copy()
# expected.index = expected.index.tz_localize("UTC")
expected.index = expected.index.tz_localize("UTC")
expected = expected.dt.tz_localize("UTC")
tm.assert_series_equal(result, expected)

Expand Down