Skip to content

Commit 353b95e

Browse files
authored
Merge pull request #182 from pandas-dev/master
Sync Fork from Upstream Repo
2 parents 435f606 + 04f9a4b commit 353b95e

40 files changed

+484
-288
lines changed

doc/source/whatsnew/v1.3.0.rst

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,38 @@ Notable bug fixes
230230

231231
These are bug fixes that might have notable behavior changes.
232232

233+
``Categorical.unique`` now always maintains same dtype as original
234+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
235+
236+
Previously, when calling :meth:`~Categorical.unique` with categorical data, unused categories in the new array
237+
would be removed, meaning that the dtype of the new array would be different than the
238+
original, if some categories are not present in the unique array (:issue:`18291`)
239+
240+
As an example of this, given:
241+
242+
.. ipython:: python
243+
244+
dtype = pd.CategoricalDtype(['bad', 'neutral', 'good'], ordered=True)
245+
cat = pd.Categorical(['good', 'good', 'bad', 'bad'], dtype=dtype)
246+
original = pd.Series(cat)
247+
unique = original.unique()
248+
249+
*pandas < 1.3.0*:
250+
251+
.. code-block:: ipython
252+
253+
In [1]: unique
254+
['good', 'bad']
255+
Categories (2, object): ['bad' < 'good']
256+
In [2]: original.dtype == unique.dtype
257+
False
258+
259+
*pandas >= 1.3.0*:
260+
261+
.. ipython:: python
262+
263+
unique
264+
original.dtype == unique.dtype
233265
234266
Preserve dtypes in :meth:`~pandas.DataFrame.combine_first`
235267
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -790,6 +822,9 @@ Groupby/resample/rolling
790822
- Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would incorrectly raise a ``ValueError`` when providing ``times`` (:issue:`40164`)
791823
- Bug in :class:`core.window.ewm.ExponentialMovingWindow` when calling ``__getitem__`` would not retain ``com``, ``span``, ``alpha`` or ``halflife`` attributes (:issue:`40164`)
792824
- :class:`core.window.ewm.ExponentialMovingWindow` now raises a ``NotImplementedError`` when specifying ``times`` with ``adjust=False`` due to an incorrect calculation (:issue:`40098`)
825+
- Bug in :meth:`core.window.ewm.ExponentialMovingWindowGroupby.mean` where the times argument was ignored when ``engine='numba'`` (:issue:`40951`)
826+
- Bug in :meth:`core.window.ewm.ExponentialMovingWindowGroupby.mean` where the wrong times were used in case of multiple groups (:issue:`40951`)
827+
- Bug in :class:`core.window.ewm.ExponentialMovingWindowGroupby` where the times vector and values became out of sync for non-trivial groups (:issue:`40951`)
793828
- Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`)
794829
- Bug in aggregation functions for :class:`DataFrame` not respecting ``numeric_only`` argument when ``level`` keyword was given (:issue:`40660`)
795830
- Bug in :meth:`SeriesGroupBy.aggregate` where using a user-defined function to aggregate a ``Series`` with an object-typed :class:`Index` causes an incorrect :class:`Index` shape (:issue:`40014`)
@@ -830,7 +865,7 @@ ExtensionArray
830865
- Bug in :meth:`DataFrame.where` when ``other`` is a :class:`Series` with :class:`ExtensionArray` dtype (:issue:`38729`)
831866
- Fixed bug where :meth:`Series.idxmax`, :meth:`Series.idxmin` and ``argmax/min`` fail when the underlying data is :class:`ExtensionArray` (:issue:`32749`, :issue:`33719`, :issue:`36566`)
832867
- Fixed a bug where some properties of subclasses of :class:`PandasExtensionDtype` where improperly cached (:issue:`40329`)
833-
-
868+
- Bug in :meth:`DataFrame.mask` where masking a :class:`DataFrame` with an :class:`ExtensionArray` dtype raises ``ValueError`` (:issue:`40941`)
834869

835870
Styler
836871
^^^^^^

pandas/_libs/hashtable_class_helper.pxi.in

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -791,7 +791,8 @@ cdef class StringHashTable(HashTable):
791791
raise KeyError(key)
792792

793793
@cython.boundscheck(False)
794-
def get_indexer(self, ndarray[object] values):
794+
def get_indexer(self, ndarray[object] values) -> ndarray:
795+
# -> np.ndarray[np.intp]
795796
cdef:
796797
Py_ssize_t i, n = len(values)
797798
ndarray[intp_t] labels = np.empty(n, dtype=np.intp)

pandas/_libs/lib.pyi

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from typing import (
55
Any,
66
Callable,
7+
Generator,
78
)
89

910
import numpy as np
@@ -52,8 +53,7 @@ def is_bool_array(values: np.ndarray, skipna: bool = False): ...
5253

5354
def fast_multiget(mapping: dict, keys: np.ndarray, default=np.nan) -> ArrayLike: ...
5455

55-
# TODO: gen: Generator?
56-
def fast_unique_multiple_list_gen(gen: object, sort: bool = True) -> list: ...
56+
def fast_unique_multiple_list_gen(gen: Generator, sort: bool = True) -> list: ...
5757
def fast_unique_multiple_list(lists: list, sort: bool = True) -> list: ...
5858
def fast_unique_multiple(arrays: list, sort: bool = True) -> list: ...
5959

@@ -90,10 +90,9 @@ def infer_datetimelike_array(
9090
arr: np.ndarray # np.ndarray[object]
9191
) -> str: ...
9292

93-
# TODO: new_dtype -> np.dtype?
9493
def astype_intsafe(
9594
arr: np.ndarray, # np.ndarray[object]
96-
new_dtype,
95+
new_dtype: np.dtype,
9796
) -> np.ndarray: ...
9897

9998
def fast_zip(ndarrays: list) -> np.ndarray: ... # np.ndarray[object]
@@ -134,15 +133,13 @@ def memory_usage_of_objects(
134133
) -> int: ... # np.int64
135134

136135

137-
# TODO: f: Callable?
138-
# TODO: dtype -> DtypeObj?
139136
def map_infer_mask(
140137
arr: np.ndarray,
141138
f: Callable[[Any], Any],
142139
mask: np.ndarray, # const uint8_t[:]
143140
convert: bool = ...,
144141
na_value: Any = ...,
145-
dtype: Any = ...,
142+
dtype: np.dtype = ...,
146143
) -> ArrayLike: ...
147144

148145
def indices_fast(

pandas/_libs/lib.pyx

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -633,7 +633,7 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bool:
633633

634634
@cython.wraparound(False)
635635
@cython.boundscheck(False)
636-
def astype_intsafe(ndarray[object] arr, new_dtype) -> ndarray:
636+
def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray:
637637
cdef:
638638
Py_ssize_t i, n = len(arr)
639639
object val
@@ -661,7 +661,8 @@ cpdef ndarray[object] ensure_string_array(
661661
bint copy=True,
662662
bint skipna=True,
663663
):
664-
"""Returns a new numpy array with object dtype and only strings and na values.
664+
"""
665+
Returns a new numpy array with object dtype and only strings and na values.
665666
666667
Parameters
667668
----------
@@ -679,7 +680,7 @@ cpdef ndarray[object] ensure_string_array(
679680
680681
Returns
681682
-------
682-
ndarray
683+
np.ndarray[object]
683684
An array with the input array's elements casted to str or nan-like.
684685
"""
685686
cdef:
@@ -2452,7 +2453,8 @@ no_default = NoDefault.no_default # Sentinel indicating the default value.
24522453
@cython.boundscheck(False)
24532454
@cython.wraparound(False)
24542455
def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=True,
2455-
object na_value=no_default, object dtype=object) -> "ArrayLike":
2456+
object na_value=no_default, cnp.dtype dtype=np.dtype(object)
2457+
) -> "ArrayLike":
24562458
"""
24572459
Substitute for np.vectorize with pandas-friendly dtype inference.
24582460

@@ -2472,7 +2474,7 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=Tr
24722474

24732475
Returns
24742476
-------
2475-
ndarray
2477+
np.ndarray or ExtensionArray
24762478
"""
24772479
cdef:
24782480
Py_ssize_t i, n

pandas/_libs/tslibs/fields.pyx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ def build_field_sarray(const int64_t[:] dtindex):
9393
return out
9494

9595

96-
def month_position_check(fields, weekdays):
96+
def month_position_check(fields, weekdays) -> str | None:
9797
cdef:
9898
int32_t daysinmonth, y, m, d
9999
bint calendar_end = True
@@ -755,7 +755,7 @@ cdef inline ndarray[int64_t] _roundup_int64(values, int64_t unit):
755755
return _floor_int64(values + unit // 2, unit)
756756

757757

758-
def round_nsint64(values: np.ndarray, mode: RoundTo, nanos) -> np.ndarray:
758+
def round_nsint64(values: np.ndarray, mode: RoundTo, nanos: int) -> np.ndarray:
759759
"""
760760
Applies rounding mode at given frequency
761761

pandas/_libs/window/aggregations.pyx

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1485,8 +1485,7 @@ def ewma(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end,
14851485
com : float64
14861486
adjust : bool
14871487
ignore_na : bool
1488-
times : ndarray (float64 type)
1489-
halflife : float64
1488+
deltas : ndarray (float64 type)
14901489
14911490
Returns
14921491
-------
@@ -1495,7 +1494,7 @@ def ewma(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end,
14951494

14961495
cdef:
14971496
Py_ssize_t i, j, s, e, nobs, win_size, N = len(vals), M = len(start)
1498-
const float64_t[:] sub_vals
1497+
const float64_t[:] sub_deltas, sub_vals
14991498
ndarray[float64_t] sub_output, output = np.empty(N, dtype=float)
15001499
float64_t alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur
15011500
bint is_observation
@@ -1511,6 +1510,9 @@ def ewma(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end,
15111510
s = start[j]
15121511
e = end[j]
15131512
sub_vals = vals[s:e]
1513+
# note that len(deltas) = len(vals) - 1 and deltas[i] is to be used in
1514+
# conjunction with vals[i+1]
1515+
sub_deltas = deltas[s:e - 1]
15141516
win_size = len(sub_vals)
15151517
sub_output = np.empty(win_size, dtype=float)
15161518

@@ -1528,7 +1530,7 @@ def ewma(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end,
15281530
if weighted_avg == weighted_avg:
15291531

15301532
if is_observation or not ignore_na:
1531-
old_wt *= old_wt_factor ** deltas[i - 1]
1533+
old_wt *= old_wt_factor ** sub_deltas[i - 1]
15321534
if is_observation:
15331535

15341536
# avoid numerical errors on constant series

pandas/core/arrays/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -609,7 +609,7 @@ def argsort(
609609
610610
Returns
611611
-------
612-
ndarray
612+
np.ndarray[np.intp]
613613
Array of indices that sort ``self``. If NaN values are contained,
614614
NaN values are placed at the end.
615615

pandas/core/arrays/categorical.py

Lines changed: 10 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1599,7 +1599,7 @@ def argsort(self, ascending=True, kind="quicksort", **kwargs):
15991599
16001600
Returns
16011601
-------
1602-
numpy.array
1602+
np.ndarray[np.intp]
16031603
16041604
See Also
16051605
--------
@@ -2127,16 +2127,15 @@ def mode(self, dropna=True):
21272127
def unique(self):
21282128
"""
21292129
Return the ``Categorical`` which ``categories`` and ``codes`` are
2130-
unique. Unused categories are NOT returned.
2130+
unique.
2131+
2132+
.. versionchanged:: 1.3.0
21312133
2132-
- unordered category: values and categories are sorted by appearance
2133-
order.
2134-
- ordered category: values are sorted by appearance order, categories
2135-
keeps existing order.
2134+
Previously, unused categories were dropped from the new categories.
21362135
21372136
Returns
21382137
-------
2139-
unique values : ``Categorical``
2138+
Categorical
21402139
21412140
See Also
21422141
--------
@@ -2146,37 +2145,15 @@ def unique(self):
21462145
21472146
Examples
21482147
--------
2149-
An unordered Categorical will return categories in the
2150-
order of appearance.
2151-
21522148
>>> pd.Categorical(list("baabc")).unique()
21532149
['b', 'a', 'c']
2154-
Categories (3, object): ['b', 'a', 'c']
2155-
2156-
>>> pd.Categorical(list("baabc"), categories=list("abc")).unique()
2157-
['b', 'a', 'c']
2158-
Categories (3, object): ['b', 'a', 'c']
2159-
2160-
An ordered Categorical preserves the category ordering.
2161-
2162-
>>> pd.Categorical(
2163-
... list("baabc"), categories=list("abc"), ordered=True
2164-
... ).unique()
2165-
['b', 'a', 'c']
2150+
Categories (3, object): ['a', 'b', 'c']
2151+
>>> pd.Categorical(list("baab"), categories=list("abc"), ordered=True).unique()
2152+
['b', 'a']
21662153
Categories (3, object): ['a' < 'b' < 'c']
21672154
"""
2168-
# unlike np.unique, unique1d does not sort
21692155
unique_codes = unique1d(self.codes)
2170-
cat = self.copy()
2171-
2172-
# keep nan in codes
2173-
cat._ndarray = unique_codes
2174-
2175-
# exclude nan from indexer for categories
2176-
take_codes = unique_codes[unique_codes != -1]
2177-
if self.ordered:
2178-
take_codes = np.sort(take_codes)
2179-
return cat.set_categories(cat.categories.take(take_codes))
2156+
return self._from_backing_data(unique_codes)
21802157

21812158
def _values_for_factorize(self):
21822159
return self._ndarray, -1

pandas/core/arrays/datetimes.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -855,8 +855,9 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise") -> DatetimeArr
855855
This method takes a time zone (tz) naive Datetime Array/Index object
856856
and makes this time zone aware. It does not move the time to another
857857
time zone.
858-
Time zone localization helps to switch from time zone aware to time
859-
zone unaware objects.
858+
859+
This method can also be used to do the inverse -- to create a time
860+
zone unaware object from an aware object. To that end, pass `tz=None`.
860861
861862
Parameters
862863
----------

pandas/core/arrays/timedeltas.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -135,10 +135,10 @@ class TimedeltaArray(dtl.TimelikeOps):
135135
# define my properties & methods for delegation
136136
_other_ops: list[str] = []
137137
_bool_ops: list[str] = []
138-
_object_ops = ["freq"]
139-
_field_ops = ["days", "seconds", "microseconds", "nanoseconds"]
140-
_datetimelike_ops = _field_ops + _object_ops + _bool_ops
141-
_datetimelike_methods = [
138+
_object_ops: list[str] = ["freq"]
139+
_field_ops: list[str] = ["days", "seconds", "microseconds", "nanoseconds"]
140+
_datetimelike_ops: list[str] = _field_ops + _object_ops + _bool_ops
141+
_datetimelike_methods: list[str] = [
142142
"to_pytimedelta",
143143
"total_seconds",
144144
"round",

pandas/core/dtypes/cast.py

Lines changed: 4 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
from __future__ import annotations
66

7-
from contextlib import suppress
87
from datetime import (
98
date,
109
datetime,
@@ -29,7 +28,6 @@
2928
NaT,
3029
OutOfBoundsDatetime,
3130
OutOfBoundsTimedelta,
32-
Period,
3331
Timedelta,
3432
Timestamp,
3533
conversion,
@@ -87,7 +85,6 @@
8785
PeriodDtype,
8886
)
8987
from pandas.core.dtypes.generic import (
90-
ABCDataFrame,
9188
ABCExtensionArray,
9289
ABCSeries,
9390
)
@@ -249,9 +246,6 @@ def maybe_downcast_to_dtype(result: ArrayLike, dtype: str | np.dtype) -> ArrayLi
249246
try to cast to the specified dtype (e.g. convert back to bool/int
250247
or could be an astype of float64->float32
251248
"""
252-
if isinstance(result, ABCDataFrame):
253-
# see test_pivot_table_doctest_case
254-
return result
255249
do_round = False
256250

257251
if isinstance(dtype, str):
@@ -278,15 +272,9 @@ def maybe_downcast_to_dtype(result: ArrayLike, dtype: str | np.dtype) -> ArrayLi
278272

279273
dtype = np.dtype(dtype)
280274

281-
elif dtype.type is Period:
282-
from pandas.core.arrays import PeriodArray
283-
284-
with suppress(TypeError):
285-
# e.g. TypeError: int() argument must be a string, a
286-
# bytes-like object or a number, not 'Period
287-
288-
# error: "dtype[Any]" has no attribute "freq"
289-
return PeriodArray(result, freq=dtype.freq) # type: ignore[attr-defined]
275+
if not isinstance(dtype, np.dtype):
276+
# enforce our signature annotation
277+
raise TypeError(dtype) # pragma: no cover
290278

291279
converted = maybe_downcast_numeric(result, dtype, do_round)
292280
if converted is not result:
@@ -295,15 +283,7 @@ def maybe_downcast_to_dtype(result: ArrayLike, dtype: str | np.dtype) -> ArrayLi
295283
# a datetimelike
296284
# GH12821, iNaT is cast to float
297285
if dtype.kind in ["M", "m"] and result.dtype.kind in ["i", "f"]:
298-
if isinstance(dtype, DatetimeTZDtype):
299-
# convert to datetime and change timezone
300-
i8values = result.astype("i8", copy=False)
301-
cls = dtype.construct_array_type()
302-
# equiv: DatetimeArray(i8values).tz_localize("UTC").tz_convert(dtype.tz)
303-
dt64values = i8values.view("M8[ns]")
304-
result = cls._simple_new(dt64values, dtype=dtype)
305-
else:
306-
result = result.astype(dtype)
286+
result = result.astype(dtype)
307287

308288
return result
309289

0 commit comments

Comments
 (0)