Skip to content

Commit c28fd8a

Browse files
committed
Merge branch 'main' into bug_46673
2 parents 3d9e157 + bedd8f0 commit c28fd8a

36 files changed

+744
-116
lines changed

doc/source/whatsnew/v1.4.4.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ including other versions of pandas.
1515
Fixed regressions
1616
~~~~~~~~~~~~~~~~~
1717
- Fixed regression in :func:`concat` materializing :class:`Index` during sorting even if :class:`Index` was already sorted (:issue:`47501`)
18+
- Fixed regression in setting ``None`` or non-string value into a ``string``-dtype Series using a mask (:issue:`47628`)
1819
-
1920

2021
.. ---------------------------------------------------------------------------

doc/source/whatsnew/v1.5.0.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -915,6 +915,7 @@ Missing
915915
^^^^^^^
916916
- Bug in :meth:`Series.fillna` and :meth:`DataFrame.fillna` with ``downcast`` keyword not being respected in some cases where there are no NA values present (:issue:`45423`)
917917
- Bug in :meth:`Series.fillna` and :meth:`DataFrame.fillna` with :class:`IntervalDtype` and incompatible value raising instead of casting to a common (usually object) dtype (:issue:`45796`)
918+
- Bug in :meth:`Series.map` not respecting ``na_action`` argument if mapper is a ``dict`` or :class:`Series` (:issue:`47527`)
918919
- Bug in :meth:`DataFrame.interpolate` with object-dtype column not returning a copy with ``inplace=False`` (:issue:`45791`)
919920
- Bug in :meth:`DataFrame.dropna` allowing the incompatible arguments ``how`` and ``thresh`` to both be set (:issue:`46575`)
920921
- Bug in :meth:`DataFrame.fillna` ignoring ``axis`` when the :class:`DataFrame` is a single block (:issue:`47713`)
@@ -957,6 +958,7 @@ I/O
957958
- Bug in :func:`read_sas` that scrambled column names (:issue:`31243`)
958959
- Bug in :func:`read_sas` with RLE-compressed SAS7BDAT files that contain 0x00 control bytes (:issue:`47099`)
959960
- Bug in :func:`read_parquet` with ``use_nullable_dtypes=True`` where ``float64`` dtype was returned instead of nullable ``Float64`` dtype (:issue:`45694`)
961+
- Bug in :meth:`DataFrame.to_json` where ``PeriodDtype`` would not make the serialization roundtrip when read back with :func:`read_json` (:issue:`44720`)
960962

961963
Period
962964
^^^^^^
@@ -1021,6 +1023,7 @@ Reshaping
10211023
- Bug in :meth:`DataFrame.join` with a list when using suffixes to join DataFrames with duplicate column names (:issue:`46396`)
10221024
- Bug in :meth:`DataFrame.pivot_table` with ``sort=False`` resulting in a sorted index (:issue:`17041`)
10231025
- Bug in :func:`concat` when ``axis=1`` and ``sort=False`` where the resulting Index was an :class:`Int64Index` instead of a :class:`RangeIndex` (:issue:`46675`)
1026+
- Bug in :func:`wide_to_long` raising when ``stubnames`` is missing in columns and ``i`` contains a string dtype column (:issue:`46044`)
10241027

10251028
Sparse
10261029
^^^^^^

environment.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,4 +127,4 @@ dependencies:
127127
# build the interactive terminal
128128
- jupyterlab >=3.4,<4
129129
- pip:
130-
- jupyterlite==0.1.0b9
130+
- jupyterlite==0.1.0b10

pandas/_libs/tslibs/dtypes.pxd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ from numpy cimport int64_t
33
from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT
44

55

6-
cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit)
6+
cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit)
77
cdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev)
88
cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) nogil
99
cpdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=*) except? -1

pandas/_libs/tslibs/dtypes.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ _period_code_map: dict[str, int]
88
def periods_per_day(reso: int) -> int: ...
99
def periods_per_second(reso: int) -> int: ...
1010
def is_supported_unit(reso: int) -> bool: ...
11+
def npy_unit_to_abbrev(reso: int) -> str: ...
1112

1213
class PeriodDtypeBase:
1314
_dtype_code: int # PeriodDtypeCode

pandas/_libs/tslibs/dtypes.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ def is_supported_unit(NPY_DATETIMEUNIT reso):
289289
)
290290

291291

292-
cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit):
292+
cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit):
293293
if unit == NPY_DATETIMEUNIT.NPY_FR_ns or unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
294294
# generic -> default to nanoseconds
295295
return "ns"

pandas/_libs/tslibs/np_datetime.pxd

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,18 @@ cpdef cnp.ndarray astype_overflowsafe(
101101
cnp.ndarray values, # ndarray[datetime64[anyunit]]
102102
cnp.dtype dtype, # ndarray[datetime64[anyunit]]
103103
bint copy=*,
104+
bint round_ok=*,
104105
)
105106
cdef int64_t get_conversion_factor(NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit) except? -1
106107

107108
cdef bint cmp_dtstructs(npy_datetimestruct* left, npy_datetimestruct* right, int op)
108109
cdef get_implementation_bounds(
109110
NPY_DATETIMEUNIT reso, npy_datetimestruct *lower, npy_datetimestruct *upper
110111
)
112+
113+
cdef int64_t convert_reso(
114+
int64_t value,
115+
NPY_DATETIMEUNIT from_reso,
116+
NPY_DATETIMEUNIT to_reso,
117+
bint round_ok,
118+
) except? -1

pandas/_libs/tslibs/np_datetime.pyi

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,10 @@ class OutOfBoundsTimedelta(ValueError): ...
99
def py_get_unit_from_dtype(dtype: np.dtype): ...
1010
def py_td64_to_tdstruct(td64: int, unit: int) -> dict: ...
1111
def astype_overflowsafe(
12-
arr: np.ndarray, dtype: np.dtype, copy: bool = ...
12+
arr: np.ndarray,
13+
dtype: np.dtype,
14+
copy: bool = ...,
15+
round_ok: bool = ...,
1316
) -> np.ndarray: ...
1417
def is_unitless(dtype: np.dtype) -> bool: ...
1518
def compare_mismatched_resolutions(

pandas/_libs/tslibs/np_datetime.pyx

Lines changed: 125 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
cimport cython
22
from cpython.datetime cimport (
3+
PyDateTime_CheckExact,
34
PyDateTime_DATE_GET_HOUR,
45
PyDateTime_DATE_GET_MICROSECOND,
56
PyDateTime_DATE_GET_MINUTE,
@@ -229,7 +230,13 @@ def py_td64_to_tdstruct(int64_t td64, NPY_DATETIMEUNIT unit):
229230

230231

231232
cdef inline void pydatetime_to_dtstruct(datetime dt, npy_datetimestruct *dts):
232-
dts.year = PyDateTime_GET_YEAR(dt)
233+
if PyDateTime_CheckExact(dt):
234+
dts.year = PyDateTime_GET_YEAR(dt)
235+
else:
236+
# We use dt.year instead of PyDateTime_GET_YEAR because with Timestamp
237+
# we override year such that PyDateTime_GET_YEAR is incorrect.
238+
dts.year = dt.year
239+
233240
dts.month = PyDateTime_GET_MONTH(dt)
234241
dts.day = PyDateTime_GET_DAY(dt)
235242
dts.hour = PyDateTime_DATE_GET_HOUR(dt)
@@ -282,6 +289,7 @@ cpdef ndarray astype_overflowsafe(
282289
ndarray values,
283290
cnp.dtype dtype,
284291
bint copy=True,
292+
bint round_ok=True,
285293
):
286294
"""
287295
Convert an ndarray with datetime64[X] to datetime64[Y]
@@ -314,20 +322,24 @@ cpdef ndarray astype_overflowsafe(
314322
"datetime64/timedelta64 values and dtype must have a unit specified"
315323
)
316324

317-
if (<object>values).dtype.byteorder == ">":
318-
# GH#29684 we incorrectly get OutOfBoundsDatetime if we don't swap
319-
values = values.astype(values.dtype.newbyteorder("<"))
320-
321325
if from_unit == to_unit:
322326
# Check this before allocating result for perf, might save some memory
323327
if copy:
324328
return values.copy()
325329
return values
326330

327331
elif from_unit > to_unit:
328-
# e.g. ns -> us, so there is no risk of overflow, so we can use
329-
# numpy's astype safely. Note there _is_ risk of truncation.
330-
return values.astype(dtype)
332+
if round_ok:
333+
# e.g. ns -> us, so there is no risk of overflow, so we can use
334+
# numpy's astype safely. Note there _is_ risk of truncation.
335+
return values.astype(dtype)
336+
else:
337+
iresult2 = astype_round_check(values.view("i8"), from_unit, to_unit)
338+
return iresult2.view(dtype)
339+
340+
if (<object>values).dtype.byteorder == ">":
341+
# GH#29684 we incorrectly get OutOfBoundsDatetime if we don't swap
342+
values = values.astype(values.dtype.newbyteorder("<"))
331343

332344
cdef:
333345
ndarray i8values = values.view("i8")
@@ -356,10 +368,11 @@ cpdef ndarray astype_overflowsafe(
356368
check_dts_bounds(&dts, to_unit)
357369
except OutOfBoundsDatetime as err:
358370
if is_td:
359-
tdval = np.timedelta64(value).view(values.dtype)
371+
from_abbrev = np.datetime_data(values.dtype)[0]
372+
np_val = np.timedelta64(value, from_abbrev)
360373
msg = (
361-
"Cannot convert {tdval} to {dtype} without overflow"
362-
.format(tdval=str(tdval), dtype=str(dtype))
374+
"Cannot convert {np_val} to {dtype} without overflow"
375+
.format(np_val=str(np_val), dtype=str(dtype))
363376
)
364377
raise OutOfBoundsTimedelta(msg) from err
365378
else:
@@ -453,6 +466,52 @@ cdef int op_to_op_code(op):
453466
return Py_GT
454467

455468

469+
cdef ndarray astype_round_check(
470+
ndarray i8values,
471+
NPY_DATETIMEUNIT from_unit,
472+
NPY_DATETIMEUNIT to_unit
473+
):
474+
# cases with from_unit > to_unit, e.g. ns->us, raise if the conversion
475+
# involves truncation, e.g. 1500ns->1us
476+
cdef:
477+
Py_ssize_t i, N = i8values.size
478+
479+
# equiv: iresult = np.empty((<object>i8values).shape, dtype="i8")
480+
ndarray iresult = cnp.PyArray_EMPTY(
481+
i8values.ndim, i8values.shape, cnp.NPY_INT64, 0
482+
)
483+
cnp.broadcast mi = cnp.PyArray_MultiIterNew2(iresult, i8values)
484+
485+
# Note the arguments to_unit, from_unit are swapped vs how they
486+
# are passed when going to a higher-frequency reso.
487+
int64_t mult = get_conversion_factor(to_unit, from_unit)
488+
int64_t value, mod
489+
490+
for i in range(N):
491+
# Analogous to: item = i8values[i]
492+
value = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
493+
494+
if value == NPY_DATETIME_NAT:
495+
new_value = NPY_DATETIME_NAT
496+
else:
497+
new_value, mod = divmod(value, mult)
498+
if mod != 0:
499+
# TODO: avoid runtime import
500+
from pandas._libs.tslibs.dtypes import npy_unit_to_abbrev
501+
from_abbrev = npy_unit_to_abbrev(from_unit)
502+
to_abbrev = npy_unit_to_abbrev(to_unit)
503+
raise ValueError(
504+
f"Cannot losslessly cast '{value} {from_abbrev}' to {to_abbrev}"
505+
)
506+
507+
# Analogous to: iresult[i] = new_value
508+
(<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = new_value
509+
510+
cnp.PyArray_MultiIter_NEXT(mi)
511+
512+
return iresult
513+
514+
456515
@cython.overflowcheck(True)
457516
cdef int64_t get_conversion_factor(NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit) except? -1:
458517
"""
@@ -489,5 +548,59 @@ cdef int64_t get_conversion_factor(NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT
489548
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit)
490549
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_fs:
491550
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit)
551+
552+
553+
cdef int64_t convert_reso(
554+
int64_t value,
555+
NPY_DATETIMEUNIT from_reso,
556+
NPY_DATETIMEUNIT to_reso,
557+
bint round_ok,
558+
) except? -1:
559+
cdef:
560+
int64_t res_value, mult, div, mod
561+
562+
if from_reso == to_reso:
563+
return value
564+
565+
elif to_reso < from_reso:
566+
# e.g. ns -> us, no risk of overflow, but can be lossy rounding
567+
mult = get_conversion_factor(to_reso, from_reso)
568+
div, mod = divmod(value, mult)
569+
if mod > 0 and not round_ok:
570+
raise ValueError("Cannot losslessly convert units")
571+
572+
# Note that when mod > 0, we follow np.timedelta64 in always
573+
# rounding down.
574+
res_value = div
575+
576+
elif (
577+
from_reso == NPY_FR_Y
578+
or from_reso == NPY_FR_M
579+
or to_reso == NPY_FR_Y
580+
or to_reso == NPY_FR_M
581+
):
582+
# Converting by multiplying isn't _quite_ right bc the number of
583+
# seconds in a month/year isn't fixed.
584+
res_value = _convert_reso_with_dtstruct(value, from_reso, to_reso)
585+
492586
else:
493-
raise ValueError(from_unit, to_unit)
587+
# e.g. us -> ns, risk of overflow, but no risk of lossy rounding
588+
mult = get_conversion_factor(from_reso, to_reso)
589+
with cython.overflowcheck(True):
590+
# Note: caller is responsible for re-raising as OutOfBoundsTimedelta
591+
res_value = value * mult
592+
593+
return res_value
594+
595+
596+
cdef int64_t _convert_reso_with_dtstruct(
597+
int64_t value,
598+
NPY_DATETIMEUNIT from_unit,
599+
NPY_DATETIMEUNIT to_unit,
600+
) except? -1:
601+
cdef:
602+
npy_datetimestruct dts
603+
604+
pandas_datetime_to_datetimestruct(value, from_unit, &dts)
605+
check_dts_bounds(&dts, to_unit)
606+
return npy_datetimestruct_to_datetime(to_unit, &dts)

pandas/_libs/tslibs/offsets.pyi

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,8 @@ def to_offset(freq: timedelta | str) -> BaseOffset: ...
111111

112112
class Tick(SingleConstructorOffset):
113113
_reso: int
114+
_prefix: str
115+
_td64_unit: str
114116
def __init__(self, n: int = ..., normalize: bool = ...) -> None: ...
115117
@property
116118
def delta(self) -> Timedelta: ...

0 commit comments

Comments
 (0)