
Commit 31f7201

Merge remote-tracking branch 'upstream/main' into bisect
2 parents: 8853ad2 + 11462d6

24 files changed: +481 / -102 lines

doc/source/whatsnew/v1.5.0.rst

Lines changed: 83 additions & 4 deletions
@@ -120,7 +120,7 @@ Other enhancements
 - :meth:`DataFrame.reset_index` now accepts a ``names`` argument which renames the index names (:issue:`6878`)
 - :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`)
 - :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`)
-- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, :meth:`DataFrame.cov`, :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax`, :meth:`.GroupBy.idxmin`, :meth:`.GroupBy.idxmax`, :meth:`.GroupBy.var`, :meth:`.GroupBy.std`, :meth:`.GroupBy.sem`, and :meth:`.GroupBy.quantile` (:issue:`46560`)
+- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, :meth:`DataFrame.cov`, :meth:`DataFrame.idxmin`, :meth:`DataFrame.idxmax`, :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.GroupBy.var`, :meth:`.GroupBy.std`, :meth:`.GroupBy.sem`, and :meth:`.DataFrameGroupBy.quantile` (:issue:`46560`)
 - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`, :issue:`46725`)
 - Added ``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`)
 - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`)
@@ -194,10 +194,47 @@ did not have the same index as the input.
     df.groupby('a', dropna=True).transform('ffill')
     df.groupby('a', dropna=True).transform(lambda x: x)
 
-.. _whatsnew_150.notable_bug_fixes.notable_bug_fix2:
+.. _whatsnew_150.notable_bug_fixes.to_json_incorrectly_localizing_naive_timestamps:
 
-notable_bug_fix2
-^^^^^^^^^^^^^^^^
+Serializing tz-naive Timestamps with to_json() with ``iso_dates=True``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:meth:`DataFrame.to_json`, :meth:`Series.to_json`, and :meth:`Index.to_json`
+would incorrectly localize DatetimeArrays/DatetimeIndexes with tz-naive Timestamps
+to UTC. (:issue:`38760`)
+
+Note that this patch does not fix the localization of tz-aware Timestamps to UTC
+upon serialization. (Related issue :issue:`12997`)
+
+*Old Behavior*
+
+.. ipython:: python
+
+    index = pd.date_range(
+        start='2020-12-28 00:00:00',
+        end='2020-12-28 02:00:00',
+        freq='1H',
+    )
+    a = pd.Series(
+        data=range(3),
+        index=index,
+    )
+
+.. code-block:: ipython
+
+    In [4]: a.to_json(date_format='iso')
+    Out[4]: '{"2020-12-28T00:00:00.000Z":0,"2020-12-28T01:00:00.000Z":1,"2020-12-28T02:00:00.000Z":2}'
+
+    In [5]: pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index
+    Out[5]: array([False, False, False])
+
+*New Behavior*
+
+.. ipython:: python
+
+    a.to_json(date_format='iso')
+    # Roundtripping now works
+    pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.api_breaking:
@@ -426,6 +463,48 @@ As ``group_keys=True`` is the default value of :meth:`DataFrame.groupby` and
 raise a ``FutureWarning``. This can be silenced and the previous behavior
 retained by specifying ``group_keys=False``.
 
+.. _whatsnew_150.deprecations.numeric_only_default:
+
+``numeric_only`` default value
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Across DataFrame operations such as ``min``, ``sum``, and ``idxmax``, the default
+value of the ``numeric_only`` argument, if it exists at all, was inconsistent.
+Furthermore, operations with the default value ``None`` can lead to surprising
+results. (:issue:`46560`)
+
+.. code-block:: ipython
+
+    In [1]: df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
+
+    In [2]: # Reading the next line without knowing the contents of df, one would
+            # expect the result to contain the products for both columns a and b.
+            df[["a", "b"]].prod()
+    Out[2]:
+    a    2
+    dtype: int64
+
+To avoid this behavior, specifying the value ``numeric_only=None`` has been
+deprecated, and will be removed in a future version of pandas. In the future,
+all operations with a ``numeric_only`` argument will default to ``False``. Users
+should either call the operation only with columns that can be operated on, or
+specify ``numeric_only=True`` to operate only on Boolean, integer, and float columns.
+
+In order to support the transition to the new behavior, the following methods have
+gained the ``numeric_only`` argument.
+
+- :meth:`DataFrame.corr`
+- :meth:`DataFrame.corrwith`
+- :meth:`DataFrame.cov`
+- :meth:`DataFrame.idxmin`
+- :meth:`DataFrame.idxmax`
+- :meth:`.DataFrameGroupBy.idxmin`
+- :meth:`.DataFrameGroupBy.idxmax`
+- :meth:`.GroupBy.var`
+- :meth:`.GroupBy.std`
+- :meth:`.GroupBy.sem`
+- :meth:`.DataFrameGroupBy.quantile`
+
 .. _whatsnew_150.deprecations.other:
 
 Other Deprecations
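
To illustrate the ``numeric_only`` transition described in the hunk above: a minimal sketch of the deprecated implicit default versus the explicit argument (illustrative, not part of the diff):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})

    # Deprecated: numeric_only=None silently drops column "b"
    df.prod()

    # Explicit opt-in: operate only on Boolean, integer, and float columns
    df.prod(numeric_only=True)

    # Or restrict the call to columns the operation supports
    df[["a"]].prod()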

pandas/_libs/algos.pyi

Lines changed: 1 addition & 0 deletions
@@ -109,6 +109,7 @@ def rank_1d(
     ascending: bool = ...,
     pct: bool = ...,
     na_option=...,
+    mask: npt.NDArray[np.bool_] | None = ...,
 ) -> np.ndarray: ...  # np.ndarray[float64_t, ndim=1]
 def rank_2d(
     in_arr: np.ndarray,  # ndarray[numeric_object_t, ndim=2]

pandas/_libs/algos.pyx

Lines changed: 7 additions & 2 deletions
@@ -889,6 +889,7 @@ def rank_1d(
     bint ascending=True,
     bint pct=False,
     na_option="keep",
+    const uint8_t[:] mask=None,
 ):
     """
     Fast NaN-friendly version of ``scipy.stats.rankdata``.
@@ -918,6 +919,8 @@ def rank_1d(
         * keep: leave NA values where they are
         * top: smallest rank if ascending
         * bottom: smallest rank if descending
+    mask : np.ndarray[bool], optional, default None
+        Specify locations to be treated as NA, for e.g. Categorical.
     """
     cdef:
         TiebreakEnumType tiebreak
@@ -927,7 +930,6 @@
         float64_t[::1] out
         ndarray[numeric_object_t, ndim=1] masked_vals
         numeric_object_t[:] masked_vals_memview
-        uint8_t[:] mask
         bint keep_na, nans_rank_highest, check_labels, check_mask
         numeric_object_t nan_fill_val
 
@@ -956,6 +958,7 @@
         or numeric_object_t is object
        or (numeric_object_t is int64_t and is_datetimelike)
     )
+    check_mask = check_mask or mask is not None
 
     # Copy values into new array in order to fill missing data
     # with mask, without obfuscating location of missing data
@@ -965,7 +968,9 @@
     else:
         masked_vals = values.copy()
 
-    if numeric_object_t is object:
+    if mask is not None:
+        pass
+    elif numeric_object_t is object:
         mask = missing.isnaobj(masked_vals)
     elif numeric_object_t is int64_t and is_datetimelike:
         mask = (masked_vals == NPY_NAT).astype(np.uint8)
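
The new ``mask`` argument lets a caller flag NA positions that are not encoded in ``values`` itself (e.g. Categorical codes or masked arrays). A minimal sketch against this private function, assuming ``labels`` is accepted as a keyword here and that a ``uint8`` view of a boolean array satisfies the ``const uint8_t[:]`` memoryview:

    import numpy as np
    from pandas._libs import algos

    vals = np.array([3.0, 1.0, 2.0, 4.0])
    labels = np.zeros(len(vals), dtype=np.intp)  # one group covering the array
    # Mark the last position as NA without mutating vals
    mask = np.array([False, False, False, True]).view(np.uint8)
    algos.rank_1d(vals, labels=labels, mask=mask)  # expected: array([3., 1., 2., nan])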

pandas/_libs/groupby.pyi

Lines changed: 1 addition & 0 deletions
@@ -128,6 +128,7 @@ def group_rank(
     ascending: bool = ...,
     pct: bool = ...,
     na_option: Literal["keep", "top", "bottom"] = ...,
+    mask: npt.NDArray[np.bool_] | None = ...,
 ) -> None: ...
 def group_max(
     out: np.ndarray,  # groupby_t[:, ::1]

pandas/_libs/groupby.pyx

Lines changed: 10 additions & 1 deletion
@@ -1262,6 +1262,7 @@ def group_rank(
     bint ascending=True,
     bint pct=False,
     str na_option="keep",
+    const uint8_t[:, :] mask=None,
 ) -> None:
     """
     Provides the rank of values within each group.
@@ -1294,6 +1295,7 @@ def group_rank(
         * keep: leave NA values where they are
        * top: smallest rank if ascending
         * bottom: smallest rank if descending
+    mask : np.ndarray[bool] or None, default None
 
     Notes
     -----
@@ -1302,18 +1304,25 @@
     cdef:
         Py_ssize_t i, k, N
         ndarray[float64_t, ndim=1] result
+        const uint8_t[:] sub_mask
 
     N = values.shape[1]
 
     for k in range(N):
+        if mask is None:
+            sub_mask = None
+        else:
+            sub_mask = mask[:, k]
+
         result = rank_1d(
             values=values[:, k],
             labels=labels,
             is_datetimelike=is_datetimelike,
             ties_method=ties_method,
             ascending=ascending,
             pct=pct,
-            na_option=na_option
+            na_option=na_option,
+            mask=sub_mask,
         )
         for i in range(len(result)):
             # TODO: why can't we do out[:, k] = result?
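
Slicing a per-column ``sub_mask`` means ``group_rank`` can receive NA locations for dtypes that keep missing values in a separate mask (e.g. nullable integers) rather than in the values array. A user-level sketch of the behavior this plumbing serves (assumed, not shown in the diff):

    import pandas as pd

    df = pd.DataFrame(
        {
            "key": ["a", "a", "b", "b"],
            "val": pd.array([2, None, 1, 3], dtype="Int64"),  # NA lives in a mask
        }
    )
    # With the default na_option="keep", the masked NA stays NA within its group
    df.groupby("key")["val"].rank()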

pandas/_libs/src/ujson/python/date_conversions.c

Lines changed: 15 additions & 3 deletions
@@ -54,8 +54,8 @@ char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
         PyErr_NoMemory();
         return NULL;
     }
-
-    ret_code = make_iso_8601_datetime(&dts, result, *len, base);
+    // datetime64 is always naive
+    ret_code = make_iso_8601_datetime(&dts, result, *len, 0, base);
     if (ret_code != 0) {
         PyErr_SetString(PyExc_ValueError,
                         "Could not convert datetime value to string");
@@ -90,7 +90,19 @@ char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base,
 
     *len = (size_t)get_datetime_iso_8601_strlen(0, base);
     char *result = PyObject_Malloc(*len);
-    ret = make_iso_8601_datetime(&dts, result, *len, base);
+    // Check to see if PyDateTime has a timezone.
+    // Don't convert to UTC if it doesn't.
+    int is_tz_aware = 0;
+    if (PyObject_HasAttrString(obj, "tzinfo")) {
+        PyObject *offset = extract_utc_offset(obj);
+        if (offset == NULL) {
+            PyObject_Free(result);
+            return NULL;
+        }
+        is_tz_aware = offset != Py_None;
+        Py_DECREF(offset);
+    }
+    ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base);
 
     if (ret != 0) {
         PyErr_SetString(PyExc_ValueError,
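
With the ``utc`` flag threaded through ``make_iso_8601_datetime``, only tz-aware datetimes should pick up the trailing ``Z``. Roughly, at the Python level (a sketch; the exact output format follows the whatsnew entry above):

    import pandas as pd

    naive = pd.Series([0], index=[pd.Timestamp("2020-12-28")])
    aware = pd.Series([0], index=[pd.Timestamp("2020-12-28", tz="UTC")])

    naive.to_json(date_format="iso")  # no "Z" suffix after this change
    aware.to_json(date_format="iso")  # still UTC-converted and "Z"-suffixed (GH 12997)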

pandas/_libs/src/ujson/python/objToJSON.c

Lines changed: 11 additions & 1 deletion
@@ -221,8 +221,18 @@ static PyObject *get_values(PyObject *obj) {
         // The special cases to worry about are dt64tz and category[dt64tz].
         // In both cases we want the UTC-localized datetime64 ndarray,
         // without going through and object array of Timestamps.
+        if (PyObject_HasAttrString(obj, "tz")) {
+            PyObject *tz = PyObject_GetAttrString(obj, "tz");
+            if (tz != Py_None) {
+                // Go through object array if we have dt64tz, since tz info will
+                // be lost if values is used directly.
+                Py_DECREF(tz);
+                values = PyObject_CallMethod(obj, "__array__", NULL);
+                return values;
+            }
+            Py_DECREF(tz);
+        }
         values = PyObject_GetAttrString(obj, "values");
-
         if (values == NULL) {
             // Clear so we can subsequently try another method
             PyErr_Clear();
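
Context for the new branch: on a tz-aware container, ``__array__`` yields an object array of tz-aware Timestamps, while the ``values`` path hands back a bare ``datetime64[ns]`` array with the tz dropped, which is exactly the information loss the comment describes. A quick check (sketch):

    import numpy as np
    import pandas as pd

    idx = pd.date_range("2020-12-28", periods=2, tz="US/Eastern")
    np.asarray(idx)  # object array of tz-aware Timestamps
    idx.values       # datetime64[ns] array; tz info is lost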

pandas/_libs/tslibs/src/datetime/np_datetime.c

Lines changed: 34 additions & 19 deletions
@@ -331,6 +331,31 @@ int cmp_npy_datetimestruct(const npy_datetimestruct *a,
 
     return 0;
 }
+/*
+ * Returns the offset from utc of the timezone as a timedelta.
+ * The caller is responsible for ensuring that the tzinfo
+ * attribute exists on the datetime object.
+ *
+ * If the passed object is timezone naive, Py_None is returned.
+ * If extraction of the offset fails, NULL is returned.
+ *
+ * NOTE: This function is not vendored from numpy.
+ */
+PyObject *extract_utc_offset(PyObject *obj) {
+    PyObject *tmp = PyObject_GetAttrString(obj, "tzinfo");
+    if (tmp == NULL) {
+        return NULL;
+    }
+    if (tmp != Py_None) {
+        PyObject *offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj);
+        if (offset == NULL) {
+            Py_DECREF(tmp);
+            return NULL;
+        }
+        return offset;
+    }
+    return tmp;
+}
 
 /*
  *
@@ -376,32 +401,22 @@ int convert_pydatetime_to_datetimestruct(PyObject *dtobj,
     out->sec = PyLong_AsLong(PyObject_GetAttrString(obj, "second"));
     out->us = PyLong_AsLong(PyObject_GetAttrString(obj, "microsecond"));
 
-    /* Apply the time zone offset if datetime obj is tz-aware */
-    if (PyObject_HasAttrString((PyObject*)obj, "tzinfo")) {
-        tmp = PyObject_GetAttrString(obj, "tzinfo");
-        if (tmp == NULL) {
-            return -1;
-        }
-        if (tmp == Py_None) {
-            Py_DECREF(tmp);
-        } else {
-            PyObject *offset;
+    if (PyObject_HasAttrString(obj, "tzinfo")) {
+        PyObject *offset = extract_utc_offset(obj);
+        /* Apply the time zone offset if datetime obj is tz-aware */
+        if (offset != NULL) {
+            if (offset == Py_None) {
+                Py_DECREF(offset);
+                return 0;
+            }
             PyObject *tmp_int;
             int seconds_offset, minutes_offset;
-
-            /* The utcoffset function should return a timedelta */
-            offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj);
-            if (offset == NULL) {
-                Py_DECREF(tmp);
-                return -1;
-            }
-            Py_DECREF(tmp);
-
             /*
              * The timedelta should have a function "total_seconds"
              * which contains the value we want.
             */
             tmp = PyObject_CallMethod(offset, "total_seconds", "");
+            Py_DECREF(offset);
             if (tmp == NULL) {
                 return -1;
             }
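
A Python rendering of the new helper's contract (illustrative only; the implementation is the C above):

    from datetime import datetime, timezone

    def extract_utc_offset_py(dt):
        # Caller guarantees tzinfo exists: naive -> None,
        # aware -> the timedelta returned by dt.utcoffset()
        if dt.tzinfo is None:
            return None
        return dt.utcoffset()

    extract_utc_offset_py(datetime(2020, 12, 28))                       # None
    extract_utc_offset_py(datetime(2020, 12, 28, tzinfo=timezone.utc))  # timedelta(0)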

pandas/_libs/tslibs/src/datetime/np_datetime.h

Lines changed: 2 additions & 0 deletions
@@ -48,6 +48,8 @@ extern const npy_datetimestruct _M_MAX_DTS;
 // stuff pandas needs
 // ----------------------------------------------------------------------------
 
+PyObject *extract_utc_offset(PyObject *obj);
+
 int convert_pydatetime_to_datetimestruct(PyObject *dtobj,
                                          npy_datetimestruct *out);
pandas/_libs/tslibs/src/datetime/np_datetime_strings.c

Lines changed: 8 additions & 7 deletions
@@ -632,7 +632,7 @@ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) {
  *  string was too short).
  */
 int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
-                           NPY_DATETIMEUNIT base) {
+                           int utc, NPY_DATETIMEUNIT base) {
     char *substr = outstr;
     int sublen = outlen;
     int tmplen;
@@ -911,13 +911,14 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
 
 add_time_zone:
     /* UTC "Zulu" time */
-    if (sublen < 1) {
-        goto string_too_short;
+    if (utc) {
+        if (sublen < 1) {
+            goto string_too_short;
+        }
+        substr[0] = 'Z';
+        substr += 1;
+        sublen -= 1;
     }
-    substr[0] = 'Z';
-    substr += 1;
-    sublen -= 1;
-
     /* Add a NULL terminator, and return */
     if (sublen > 0) {
         substr[0] = '\0';
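
The suffix logic is now gated on the new ``utc`` parameter; in pseudocode terms (a sketch of the control flow only):

    def add_time_zone(iso, utc):
        # Only UTC ("Zulu") timestamps get the trailing "Z";
        # tz-naive output is left unsuffixed
        return iso + "Z" if utc else iso

    add_time_zone("2020-12-28T00:00:00.000", utc=False)  # unchanged
    add_time_zone("2020-12-28T00:00:00.000", utc=True)   # gets "...Z"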
