From f936c85aa775d54406a36017f94a490180aec4ee Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 20 Apr 2021 10:59:49 -0700 Subject: [PATCH 01/10] TYP: libinterval --- pandas/_libs/interval.pyi | 92 +++++++++++++++++++++++++++ pandas/_libs/tslibs/nattype.pyi | 14 +++- pandas/core/arrays/interval.py | 40 +++++++++--- pandas/core/arrays/period.py | 4 +- pandas/core/arrays/string_.py | 4 +- pandas/core/base.py | 3 +- pandas/core/computation/pytables.py | 6 ++ pandas/core/indexes/datetimes.py | 9 ++- pandas/core/indexes/extension.py | 9 ++- pandas/core/indexes/interval.py | 10 ++- pandas/core/indexes/multi.py | 3 +- pandas/core/indexes/period.py | 12 ++-- pandas/core/indexes/range.py | 3 +- pandas/core/indexes/timedeltas.py | 3 +- pandas/core/reshape/merge.py | 16 +++-- pandas/core/series.py | 3 +- pandas/io/parsers/base_parser.py | 55 ++++++++++++---- pandas/io/parsers/c_parser_wrapper.py | 22 ++----- pandas/io/parsers/python_parser.py | 46 ++++++-------- 19 files changed, 263 insertions(+), 91 deletions(-) create mode 100644 pandas/_libs/interval.pyi diff --git a/pandas/_libs/interval.pyi b/pandas/_libs/interval.pyi new file mode 100644 index 0000000000000..b54016ad8bec6 --- /dev/null +++ b/pandas/_libs/interval.pyi @@ -0,0 +1,92 @@ +import numpy as np + +from pandas import ( + Timedelta, + Timestamp, +) + +VALID_CLOSED: frozenset[str] + + +class IntervalMixin: + closed: str + + @property + def closed_left(self) -> bool: ... + + @property + def closed_right(self) -> bool: ... + + @property + def open_left(self) -> bool: ... + + @property + def open_right(self) -> bool: ... + + @property + def mid(self): ... + + @property + def length(self): ... + + @property + def is_empty(self): ... + + def _check_closed_matches(self, other, name: str = ...) -> None: ... + + +class Interval(IntervalMixin): + left: int | float | Timestamp | Timedelta + right: int | float | Timestamp | Timedelta + + def __init__(self, left, right, closed: str = ...): ... + + def __contains__(self, key) -> bool: ... + def __repr__(self) -> str: ... + def __str__(self) -> str: ... + def __add__(self, y): ... + def __sub__(self, y): ... + def __mul__(self, y): ... + def __truediv__(self, y): ... + def __floordiv__(self, y): ... + + def overlaps(self, other: Interval) -> bool: ... + + +def intervals_to_interval_bounds( + intervals: np.ndarray, + validate_closed: bool = ..., +) -> tuple[np.ndarray, np.ndarray, str]: ... + + +class IntervalTree(IntervalMixin): + def __init__(self, left, right, closed='right', leaf_size=100): ... + + @property + def left_sorter(self) -> np.ndarray: ... # np.ndarray[np.intp] + + @property + def right_sorter(self) -> np.ndarray: ... # np.ndarray[np.intp] + + @property + def is_overlapping(self) -> bool: ... + + @property + def is_monotonic_increasing(self) -> bool: ... + + def get_indexer( + self, + target: np.ndarray, # scalar_t[:] + ) -> np.ndarray: ... # np.ndarray[np.intp] + + def get_indexer_non_unique( + self, + target: np.ndarray, # scalar_t[:] + ) -> tuple[ + np.ndarray, # np.ndarray[np.intp] + np.ndarray, # np.ndarray[np.intp] + ]: ... + + def __repr__(self) -> str: ... + + def clear_mapping(self) -> None: ... 
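For readers skimming the stub above, a brief usage sketch of the runtime ``Interval`` behaviour these annotations describe (illustrative only, not part of the patch; values assume the default ``closed="right"``):

    import pandas as pd

    iv = pd.Interval(0, 5)           # closed="right" by default
    iv.closed_left, iv.closed_right  # (False, True)
    0 in iv                          # False: left endpoint is open
    5 in iv                          # True: right endpoint is closed
    iv.mid, iv.length                # (2.5, 5)
    iv.overlaps(pd.Interval(4, 8))   # True: (0, 5] and (4, 8] share (4, 5]
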
diff --git a/pandas/_libs/tslibs/nattype.pyi b/pandas/_libs/tslibs/nattype.pyi index 0f81dcb4b2df1..fbeeb1f6070fe 100644 --- a/pandas/_libs/tslibs/nattype.pyi +++ b/pandas/_libs/tslibs/nattype.pyi @@ -1,8 +1,13 @@ -from datetime import datetime +from datetime import ( + datetime, + timedelta, +) import numpy as np +from pandas._libs.tslibs.period import Period + NaT: NaTType iNaT: int nat_strings: set[str] @@ -133,3 +138,10 @@ class NaTType(datetime): # inject Period properties @property def qyear(self) -> float: ... + + def __eq__(self, other: NaTType | datetime | timedelta | Period) -> bool: ... + def __ne__(self, other: NaTType | datetime | timedelta | Period) -> bool: ... + def __lt__(self, other: NaTType | datetime | timedelta | Period) -> bool: ... + def __le__(self, other: NaTType | datetime | timedelta | Period) -> bool: ... + def __gt__(self, other: NaTType | datetime | timedelta | Period) -> bool: ... + def __ge__(self, other: NaTType | datetime | timedelta | Period) -> bool: ... diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 8d3a8feb89d67..1f7c5ebb0452b 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -7,6 +7,7 @@ ) import textwrap from typing import ( + TYPE_CHECKING, Sequence, TypeVar, cast, @@ -85,6 +86,12 @@ unpack_zerodim_and_defer, ) +if TYPE_CHECKING: + from pandas.core.arrays import ( + DatetimeArray, + TimedeltaArray, + ) + IntervalArrayT = TypeVar("IntervalArrayT", bound="IntervalArray") _interval_shared_docs: dict[str, str] = {} @@ -192,6 +199,10 @@ class IntervalArray(IntervalMixin, ExtensionArray): can_hold_na = True _na_value = _fill_value = np.nan + _dtype: IntervalDtype + _left: np.ndarray | DatetimeArray | TimedeltaArray + _right: np.ndarray | DatetimeArray | TimedeltaArray + # --------------------------------------------------------------------- # Constructors @@ -587,7 +598,10 @@ def _validate(self): "location both left and right sides" ) raise ValueError(msg) - if not (self._left[left_mask] <= self._right[left_mask]).all(): + # error: Item "bool" of "Union[Any, bool]" has no attribute "all" + if not ( + self._left[left_mask] <= self._right[left_mask] + ).all(): # type: ignore[union-attr] msg = "left side of interval must be <= right side" raise ValueError(msg) @@ -931,9 +945,9 @@ def shift( from pandas import Index fill_value = Index(self._left, copy=False)._na_value - empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1)) + empty = type(self).from_breaks([fill_value] * (empty_len + 1)) else: - empty = self._from_sequence([fill_value] * empty_len) + empty = type(self)._from_sequence([fill_value] * empty_len) if periods > 0: a = empty @@ -1362,15 +1376,19 @@ def is_non_overlapping_monotonic(self) -> bool: # at a point when both sides of intervals are included if self.closed == "both": return bool( - (self._right[:-1] < self._left[1:]).all() - or (self._left[:-1] > self._right[1:]).all() + # error: Item "bool" of "Union[Any, bool]" has no attribute "all" + (self._right[:-1] < self._left[1:]).all() # type: ignore[union-attr] + # error: Item "bool" of "Union[Any, bool]" has no attribute "all" + or (self._left[:-1] > self._right[1:]).all() # type: ignore[union-attr] ) # non-strict inequality when closed != 'both'; at least one side is # not included in the intervals, so equality does not imply overlapping return bool( - (self._right[:-1] <= self._left[1:]).all() - or (self._left[:-1] >= self._right[1:]).all() + # error: Item "bool" of "Union[Any, bool]" has no attribute "all" + 
(self._right[:-1] <= self._left[1:]).all() # type: ignore[union-attr] + # error: Item "bool" of "Union[Any, bool]" has no attribute "all" + or (self._left[:-1] >= self._right[1:]).all() # type: ignore[union-attr] ) # --------------------------------------------------------------------- @@ -1483,7 +1501,9 @@ def putmask(self, mask: np.ndarray, value) -> None: np.putmask(self._right, mask, value_right) else: self._left.putmask(mask, value_left) - self._right.putmask(mask, value_right) + # error: Item "ndarray" of "Union[ndarray, DatetimeArray, TimedeltaArray]" + # has no attribute "putmask" + self._right.putmask(mask, value_right) # type: ignore[union-attr] def insert(self: IntervalArrayT, loc: int, item: Interval) -> IntervalArrayT: """ @@ -1513,7 +1533,9 @@ def delete(self: IntervalArrayT, loc) -> IntervalArrayT: new_right = np.delete(self._right, loc) else: new_left = self._left.delete(loc) - new_right = self._right.delete(loc) + # error: Item "ndarray" of "Union[ndarray, DatetimeArray, TimedeltaArray]" + # has no attribute "delete" + new_right = self._right.delete(loc) # type: ignore[union-attr] return self._shallow_copy(left=new_left, right=new_right) @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 5a9dd0e89bd65..a9c94b615f49c 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -178,8 +178,8 @@ class PeriodArray(dtl.DatelikeOps): "days_in_month", "daysinmonth", ] - _datetimelike_ops = _field_ops + _object_ops + _bool_ops - _datetimelike_methods = ["strftime", "to_timestamp", "asfreq"] + _datetimelike_ops: list[str] = _field_ops + _object_ops + _bool_ops + _datetimelike_methods: list[str] = ["strftime", "to_timestamp", "asfreq"] # -------------------------------------------------------------------- # Constructors diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 600aacec9c87a..deee70ff64b61 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -204,9 +204,7 @@ def __init__(self, values, copy=False): values = extract_array(values) super().__init__(values, copy=copy) - # error: Incompatible types in assignment (expression has type "StringDtype", - # variable has type "PandasDtype") - self._dtype = StringDtype() # type: ignore[assignment] + self._dtype = StringDtype() if not isinstance(values, type(self)): self._validate() diff --git a/pandas/core/base.py b/pandas/core/base.py index 42f52618eb07b..5747cb68fde33 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1125,7 +1125,8 @@ def _memory_usage(self, deep: bool = False) -> int: return v @doc( - algorithms.factorize, + # error: Cannot determine type of 'factorize' + algorithms.factorize, # type: ignore[has-type] values="", order="", size_hint="", diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 0e6a7551ab399..59c1628d6daaf 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -605,6 +605,12 @@ def __repr__(self) -> str: def evaluate(self): """ create and return the numexpr condition and filter """ + if self.terms is None: + raise ValueError( + f"cannot process expression [{self.expr}], [{self}] " + "is not a valid condition" + ) + try: self.condition = self.terms.prune(ConditionBinOp) except AttributeError as err: diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index f77f28deecf57..ceb9a96366adc 100644 --- 
a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -268,7 +268,8 @@ class DatetimeIndex(DatetimeTimedeltaMixin): # -------------------------------------------------------------------- # methods that dispatch to DatetimeArray and wrap result - @doc(DatetimeArray.strftime) + # error: Cannot determine type of 'strftime' + @doc(DatetimeArray.strftime) # type: ignore[has-type] def strftime(self, date_format) -> Index: arr = self._data.strftime(date_format) return Index(arr, name=self.name) @@ -278,12 +279,14 @@ def tz_convert(self, tz) -> DatetimeIndex: arr = self._data.tz_convert(tz) return type(self)._simple_new(arr, name=self.name) - @doc(DatetimeArray.tz_localize) + # error: Cannot determine type of 'tz_localize' + @doc(DatetimeArray.tz_localize) # type: ignore[has-type] def tz_localize(self, tz, ambiguous="raise", nonexistent="raise") -> DatetimeIndex: arr = self._data.tz_localize(tz, ambiguous, nonexistent) return type(self)._simple_new(arr, name=self.name) - @doc(DatetimeArray.to_period) + # error: Cannot determine type of 'to_period' + @doc(DatetimeArray.to_period) # type: ignore[has-type] def to_period(self, freq=None) -> PeriodIndex: from pandas.core.indexes.api import PeriodIndex diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 83998a2792a8a..19d64613821e5 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -286,7 +286,9 @@ def __getitem__(self, key): return type(self)(result, name=self._name) # Unpack to ndarray for MPL compat - result = result._ndarray + # error: Item "IntervalArray" of "Union[Any, IntervalArray, + # NDArrayBackedExtensionArray]" has no attribute "_ndarray" + result = result._ndarray # type: ignore[union-attr] # Includes cases where we get a 2D ndarray back for MPL compat deprecate_ndim_indexing(result) @@ -400,8 +402,11 @@ class NDArrayBackedExtensionIndex(ExtensionIndex): _data: NDArrayBackedExtensionArray + # Argument 1 of "_simple_new" is incompatible with supertype "ExtensionIndex"; + # supertype defines the argument type as + # "Union[IntervalArray, NDArrayBackedExtensionArray]" @classmethod - def _simple_new( + def _simple_new( # type: ignore[override] cls, values: NDArrayBackedExtensionArray, name: Hashable = None, diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index d7b5f66bd385f..47379896f103b 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -263,6 +263,8 @@ class IntervalIndex(ExtensionIndex): is_non_overlapping_monotonic: bool closed_left: bool closed_right: bool + open_left: bool + open_right: bool # we would like our indexing holder to defer to us _defer_to_indexing = True @@ -442,7 +444,8 @@ def inferred_type(self) -> str: """Return a string of the type inferred from the values""" return "interval" - @Appender(Index.memory_usage.__doc__) + # error: Cannot determine type of 'memory_usage' + @Appender(Index.memory_usage.__doc__) # type: ignore[has-type] def memory_usage(self, deep: bool = False) -> int: # we don't use an explicit engine # so return the bytes here @@ -587,8 +590,9 @@ def _maybe_convert_i8(self, key): # convert left/right and reconstruct left = self._maybe_convert_i8(key.left) right = self._maybe_convert_i8(key.right) - constructor = Interval if scalar else IntervalIndex.from_arrays - return constructor(left, right, closed=self.closed) + if scalar: + return Interval(left, right, closed=self.closed) + return IntervalIndex.from_arrays(left, right, closed=self.closed) if 
scalar: # Timestamp/Timedelta diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 794f13bbfb6b1..0d85d891d2309 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1242,7 +1242,8 @@ def f(level): return any(f(level) for level in self._inferred_type_levels) - @doc(Index.memory_usage) + # error: Cannot determine type of 'memory_usage' + @doc(Index.memory_usage) # type: ignore[has-type] def memory_usage(self, deep: bool = False) -> int: # we are overwriting our base class to avoid # computing .values here which could materialize diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 18e441ef165c9..6f34d512e0510 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -173,8 +173,9 @@ class PeriodIndex(DatetimeIndexOpsMixin): # methods that dispatch to array and wrap result in Index # These are defined here instead of via inherit_names for mypy + # error: Cannot determine type of 'asfreq' @doc( - PeriodArray.asfreq, + PeriodArray.asfreq, # type: ignore[has-type] other="pandas.arrays.PeriodArray", other_name="PeriodArray", **_shared_doc_kwargs, @@ -191,21 +192,24 @@ def to_timestamp(self, freq=None, how="start") -> DatetimeIndex: # https://github.com/python/mypy/issues/1362 # error: Decorated property not supported @property # type:ignore[misc] - @doc(PeriodArray.hour.fget) + # error: Cannot determine type of 'hour' + @doc(PeriodArray.hour.fget) # type: ignore[has-type] def hour(self) -> Int64Index: return Int64Index(self._data.hour, name=self.name) # https://github.com/python/mypy/issues/1362 # error: Decorated property not supported @property # type:ignore[misc] - @doc(PeriodArray.minute.fget) + # error: Cannot determine type of 'minute' + @doc(PeriodArray.minute.fget) # type: ignore[has-type] def minute(self) -> Int64Index: return Int64Index(self._data.minute, name=self.name) # https://github.com/python/mypy/issues/1362 # error: Decorated property not supported @property # type:ignore[misc] - @doc(PeriodArray.second.fget) + # error: Cannot determine type of 'second' + @doc(PeriodArray.second.fget) # type: ignore[has-type] def second(self) -> Int64Index: return Int64Index(self._data.second, name=self.name) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 1e974063bd839..ae7babf7fe94f 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -461,7 +461,8 @@ def tolist(self) -> list[int]: def __iter__(self): yield from self._range - @doc(Int64Index._shallow_copy) + # error: Cannot determine type of '_shallow_copy' + @doc(Int64Index._shallow_copy) # type: ignore[has-type] def _shallow_copy(self, values, name: Hashable = no_default): name = self.name if name is no_default else name diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index a23dd10bc3c0e..ac7e905088c15 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -127,7 +127,8 @@ def __new__( unit=None, freq=lib.no_default, closed=None, - dtype=TD64NS_DTYPE, + # error: Cannot determine type of 'TD64NS_DTYPE' + dtype=TD64NS_DTYPE, # type: ignore[has-type] copy=False, name=None, ): diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8cee0dd2abb88..0179b6cf3274a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1681,9 +1681,12 @@ def _asof_by_function(direction: str): _type_casters = { - "int64_t": ensure_int64, - "double": ensure_float64, - 
"object": ensure_object, + # error: Cannot determine type of 'ensure_int64' + "int64_t": ensure_int64, # type: ignore[has-type] + # error: Cannot determine type of 'ensure_float64' + "double": ensure_float64, # type: ignore[has-type] + # error: Cannot determine type of 'ensure_object' + "object": ensure_object, # type: ignore[has-type] } @@ -1947,12 +1950,11 @@ def flip(xs) -> np.ndarray: right_by_values = flip(right_by_values) # upcast 'by' parameter because HashTable is limited + # TODO: HashTable not so limited anymore? by_type = _get_cython_type_upcast(left_by_values.dtype) by_type_caster = _type_casters[by_type] - # error: Cannot call function of unknown type - left_by_values = by_type_caster(left_by_values) # type: ignore[operator] - # error: Cannot call function of unknown type - right_by_values = by_type_caster(right_by_values) # type: ignore[operator] + left_by_values = by_type_caster(left_by_values) + right_by_values = by_type_caster(right_by_values) # choose appropriate function by type func = _asof_by_function(self.direction) diff --git a/pandas/core/series.py b/pandas/core/series.py index 440bc4c89e647..4292477b12d46 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2752,7 +2752,8 @@ def __rmatmul__(self, other): """ return self.dot(np.transpose(other)) - @doc(base.IndexOpsMixin.searchsorted, klass="Series") + # error: Cannot determine type of 'searchsorted' + @doc(base.IndexOpsMixin.searchsorted, klass="Series") # type: ignore[has-type] def searchsorted(self, value, side="left", sorter=None) -> np.ndarray: return algorithms.searchsorted(self._values, value, side=side, sorter=sorter) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index a011a789bf17c..3147a395342ff 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -6,6 +6,7 @@ Any, DefaultDict, Dict, + Hashable, Iterable, List, Optional, @@ -114,6 +115,11 @@ class ParserBase: + index_col: int | Sequence[int] | None + index_names: list[Hashable] | None + _first_chunk: bool + _implicit_index: bool = False + def __init__(self, kwds): self.names = kwds.get("names") @@ -122,7 +128,7 @@ def __init__(self, kwds): self.index_col = kwds.get("index_col", None) self.unnamed_cols: Set = set() - self.index_names: Optional[List] = None + self.index_names: Optional[list[Hashable]] = None self.col_names = None self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False)) @@ -170,8 +176,14 @@ def __init__(self, kwds): if self.index_col is not None: is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray)) if not ( - is_sequence - and all(map(is_integer, self.index_col)) + # error: Argument 2 to "map" has incompatible type + # "Union[int, Sequence[int]]"; expected "Iterable[int]" + ( + is_sequence + and all( + map(is_integer, self.index_col) # type: ignore[arg-type] + ) + ) or is_integer(self.index_col) ): raise ValueError( @@ -284,8 +296,12 @@ def _should_parse_dates(self, i): name = self.index_names[i] else: name = None - j = i if self.index_col is None else self.index_col[i] - + # error: Value of type "Union[int, Sequence[int]]" is not indexable + j = ( + i + if self.index_col is None + else self.index_col[i] # type: ignore[index] + ) if is_scalar(self.parse_dates): return (j == self.parse_dates) or ( name is not None and name == self.parse_dates @@ -313,7 +329,9 @@ def _extract_multi_indexer_columns( ic = [] if not isinstance(ic, (list, tuple, np.ndarray)): - ic = [ic] + # error: List item 0 has incompatible type + # "Union[int, 
Sequence[int]]"; expected "int" + ic = [ic] # type: ignore[list-item] sic = set(ic) # clean the index_names @@ -329,7 +347,9 @@ def extract(r): return tuple(r[i] for i in range(field_count) if i not in sic) columns = list(zip(*(extract(r) for r in header))) - names = ic + columns + # error: No overload variant of "__add__" of "tuple" matches argument + # type "List[Any]" + names = ic + columns # type: ignore[operator] # If we find unnamed columns all in a single # level, then our header was too long. @@ -363,7 +383,12 @@ def _maybe_dedup_names(self, names): if self.mangle_dupe_cols: names = list(names) # so we can index counts: DefaultDict[Union[int, str, Tuple], int] = defaultdict(int) - is_potential_mi = _is_potential_multi_index(names, self.index_col) + # error: Argument 2 to "_is_potential_multi_index" has incompatible + # type "Union[int, Sequence[int], None]"; expected + # "Union[bool, Sequence[int], None]" + is_potential_mi = _is_potential_multi_index( + names, self.index_col # type: ignore[arg-type] + ) for i, col in enumerate(names): cur_count = counts[col] @@ -415,8 +440,6 @@ def _make_index(self, data, alldata, columns, indexnamerow=False): return index, columns - _implicit_index = False - def _get_simple_index(self, data, columns): def ix(col): if not isinstance(col, str): @@ -425,7 +448,11 @@ def ix(col): to_remove = [] index = [] - for idx in self.index_col: + # error: Item "int" of "Union[int, Sequence[int], None]" has no + # attribute "__iter__" (not iterable) + # error: Item "None" of "Union[int, Sequence[int], None]" has no + # attribute "__iter__" (not iterable + for idx in self.index_col: # type: ignore[union-attr] i = ix(idx) to_remove.append(i) index.append(data[i]) @@ -453,7 +480,11 @@ def _get_name(icol): to_remove = [] index = [] - for idx in self.index_col: + # error: Item "int" of "Union[int, Sequence[int], None]" has no + # attribute "__iter__" (not iterable) + # error: Item "None" of "Union[int, Sequence[int], None]" has no + # attribute "__iter__" (not iterable + for idx in self.index_col: # type: ignore[union-attr] name = _get_name(idx) to_remove.append(name) index.append(data[name]) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index abf6128699a21..936924e828b79 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -17,10 +17,7 @@ def __init__(self, src: FilePathOrBuffer, **kwds): ParserBase.__init__(self, kwds) # #2442 - # error: Cannot determine type of 'index_col' - kwds["allow_leading_cols"] = ( - self.index_col is not False # type: ignore[has-type] - ) + kwds["allow_leading_cols"] = self.index_col is not False # GH20529, validate usecol arg before TextReader kwds["usecols"] = self.usecols @@ -66,7 +63,6 @@ def __init__(self, src: FilePathOrBuffer, **kwds): if len(self._reader.header) > 1: # we have a multi index in the columns # error: Cannot determine type of 'names' - # error: Cannot determine type of 'index_names' # error: Cannot determine type of 'col_names' ( self.names, # type: ignore[has-type] @@ -75,7 +71,7 @@ def __init__(self, src: FilePathOrBuffer, **kwds): passed_names, ) = self._extract_multi_indexer_columns( self._reader.header, - self.index_names, # type: ignore[has-type] + self.index_names, self.col_names, # type: ignore[has-type] passed_names, ) @@ -144,10 +140,7 @@ def __init__(self, src: FilePathOrBuffer, **kwds): self.orig_names = self.names # type: ignore[has-type] if not self._has_complex_date_col: - # error: Cannot determine type of 
'index_col' - if self._reader.leading_cols == 0 and is_index_col( - self.index_col # type: ignore[has-type] - ): + if self._reader.leading_cols == 0 and is_index_col(self.index_col): self._name_processed = True ( @@ -158,8 +151,7 @@ def __init__(self, src: FilePathOrBuffer, **kwds): ) = self._clean_index_names( # error: Cannot determine type of 'names' self.names, # type: ignore[has-type] - # error: Cannot determine type of 'index_col' - self.index_col, # type: ignore[has-type] + self.index_col, self.unnamed_cols, ) @@ -208,8 +200,7 @@ def read(self, nrows=None): try: data = self._reader.read(nrows) except StopIteration: - # error: Cannot determine type of '_first_chunk' - if self._first_chunk: # type: ignore[has-type] + if self._first_chunk: self._first_chunk = False names = self._maybe_dedup_names(self.orig_names) index, columns, col_dict = self._get_empty_meta( @@ -248,7 +239,8 @@ def read(self, nrows=None): if self.index_col is None: values = data.pop(i) else: - values = data.pop(self.index_col[i]) + # error: Value of type "Union[int, Sequence[int]]" is not indexable + values = data.pop(self.index_col[i]) # type: ignore[index] values = self._maybe_parse_dates(values, i, try_parse_dates=True) arrays.append(values) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 9f62d63c680f6..93a34d0c91a6d 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -124,7 +124,6 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): # The original set is stored in self.original_columns. if len(self.columns) > 1: # we are processing a multi index column - # error: Cannot determine type of 'index_names' # error: Cannot determine type of 'col_names' ( self.columns, @@ -133,7 +132,7 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): _, ) = self._extract_multi_indexer_columns( self.columns, - self.index_names, # type: ignore[has-type] + self.index_names, self.col_names, # type: ignore[has-type] ) # Update list of original names to include all indices. 
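Most of the ``type: ignore[index]``, ``[arg-type]`` and ``[union-attr]`` comments in these parser hunks trace back to ``index_col`` being annotated as ``int | Sequence[int] | None``. A hypothetical narrowing helper (not something this patch adds) sketches the alternative those ignores trade off against: normalize the union in one place, then index and ``len()`` the result freely:

    from __future__ import annotations

    from typing import Sequence

    def _index_col_as_list(index_col: int | Sequence[int] | None) -> list[int]:
        # narrow the union once so callers see a plain list of column positions
        if index_col is None:
            return []
        if isinstance(index_col, int):
            return [index_col]
        return list(index_col)
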
@@ -250,8 +249,7 @@ def read(self, rows=None): try: content = self._get_lines(rows) except StopIteration: - # error: Cannot determine type of '_first_chunk' - if self._first_chunk: # type: ignore[has-type] + if self._first_chunk: content = [] else: self.close() @@ -264,10 +262,9 @@ def read(self, rows=None): if not len(content): # pragma: no cover # DataFrame with the right metadata, even though it's length 0 names = self._maybe_dedup_names(self.orig_names) - # error: Cannot determine type of 'index_col' index, columns, col_dict = self._get_empty_meta( names, - self.index_col, # type: ignore[has-type] + self.index_col, self.index_names, self.dtype, ) @@ -296,8 +293,9 @@ def _exclude_implicit_index(self, alldata): offset = 0 if self._implicit_index: - # error: Cannot determine type of 'index_col' - offset = len(self.index_col) # type: ignore[has-type] + # error: Argument 1 to "len" has incompatible type + # "Union[int, Sequence[int], None]"; expected "Sized" + offset = len(self.index_col) # type: ignore[arg-type] if self._col_indices is not None and len(names) != len(self._col_indices): names = [names[i] for i in sorted(self._col_indices)] @@ -438,9 +436,12 @@ def _infer_columns(self): # line for the rest of the parsing code if hr == header[-1]: lc = len(this_columns) - # error: Cannot determine type of 'index_col' - sic = self.index_col # type: ignore[has-type] - ic = len(sic) if sic is not None else 0 + sic = self.index_col + # error: Argument 1 to "len" has incompatible type + # "Union[int, Sequence[int]]"; expected "Sized" + ic = ( + len(sic) if sic is not None else 0 # type: ignore[arg-type] + ) unnamed_count = len(this_unnamed_cols) if lc != unnamed_count and lc - ic > unnamed_count: @@ -850,8 +851,7 @@ def _get_index_name(self, columns): if line is not None: # leave it 0, #2442 # Case 1 - # error: Cannot determine type of 'index_col' - index_col = self.index_col # type: ignore[has-type] + index_col = self.index_col if index_col is not False: implicit_first_cols = len(line) - self.num_original_columns @@ -890,20 +890,16 @@ def _rows_to_cols(self, content): col_len = self.num_original_columns if self._implicit_index: - col_len += len(self.index_col) + # error: Argument 1 to "len" has incompatible type + # "Union[int, Sequence[int]]"; expected "Sized" + col_len += len(self.index_col) # type: ignore[arg-type] max_len = max(len(row) for row in content) # Check that there are no rows with too many # elements in their row (rows with too few # elements are padded with NaN). 
- # error: Non-overlapping identity check (left operand type: "List[int]", - # right operand type: "Literal[False]") - if ( - max_len > col_len - and self.index_col is not False # type: ignore[comparison-overlap] - and self.usecols is None - ): + if max_len > col_len and self.index_col is not False and self.usecols is None: footers = self.skipfooter if self.skipfooter else 0 bad_lines = [] @@ -952,13 +948,13 @@ def _rows_to_cols(self, content): col_indices = self._col_indices if self._implicit_index: + # error: Argument 1 to "len" has incompatible type + # "Union[int, Sequence[int]]"; expected "Sized" + lic = len(self.index_col) # type: ignore[arg-type] zipped_content = [ a for i, a in enumerate(zipped_content) - if ( - i < len(self.index_col) - or i - len(self.index_col) in col_indices - ) + if (i < lic or i - lic in col_indices) ] else: zipped_content = [ From 5f00effb0fdcc3d6a7f07783acff6d76541de4f5 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 20 Apr 2021 11:18:04 -0700 Subject: [PATCH 02/10] future import --- pandas/io/parsers/base_parser.py | 36 +++++++++++---------------- pandas/io/parsers/c_parser_wrapper.py | 2 ++ pandas/io/parsers/python_parser.py | 26 +++++++++---------- 3 files changed, 29 insertions(+), 35 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 3147a395342ff..300d4bdace52d 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from collections import defaultdict import csv import datetime @@ -5,15 +7,9 @@ from typing import ( Any, DefaultDict, - Dict, Hashable, Iterable, - List, - Optional, Sequence, - Set, - Tuple, - Union, cast, ) import warnings @@ -123,12 +119,12 @@ class ParserBase: def __init__(self, kwds): self.names = kwds.get("names") - self.orig_names: Optional[List] = None + self.orig_names: list | None = None self.prefix = kwds.pop("prefix", None) self.index_col = kwds.get("index_col", None) - self.unnamed_cols: Set = set() - self.index_names: Optional[list[Hashable]] = None + self.unnamed_cols: set = set() + self.index_names: list[Hashable] | None = None self.col_names = None self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False)) @@ -212,9 +208,9 @@ def __init__(self, kwds): self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"]) - self.handles: Optional[IOHandles] = None + self.handles: IOHandles | None = None - def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None: + def _open_handles(self, src: FilePathOrBuffer, kwds: dict[str, Any]) -> None: """ Let the readers open IOHanldes after they are done with their potential raises. """ @@ -228,7 +224,7 @@ def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None: errors=kwds.get("encoding_errors", "strict"), ) - def _validate_parse_dates_presence(self, columns: List[str]) -> None: + def _validate_parse_dates_presence(self, columns: list[str]) -> None: """ Check if parse_dates are in columns. @@ -382,7 +378,7 @@ def _maybe_dedup_names(self, names): # would be nice! 
if self.mangle_dupe_cols: names = list(names) # so we can index - counts: DefaultDict[Union[int, str, Tuple], int] = defaultdict(int) + counts: DefaultDict[int | str | tuple, int] = defaultdict(int) # error: Argument 2 to "_is_potential_multi_index" has incompatible # type "Union[int, Sequence[int], None]"; expected # "Union[bool, Sequence[int], None]" @@ -607,8 +603,8 @@ def _convert_to_ndarrays( return result def _set_noconvert_dtype_columns( - self, col_indices: List[int], names: List[Union[int, str, Tuple]] - ) -> Set[int]: + self, col_indices: list[int], names: list[int | str | tuple] + ) -> set[int]: """ Set the columns that should not undergo dtype conversions. @@ -626,7 +622,7 @@ def _set_noconvert_dtype_columns( ------- A set of integers containing the positions of the columns not to convert. """ - usecols: Optional[Union[List[int], List[str]]] + usecols: list[int] | list[str] | None noconvert_columns = set() if self.usecols_dtype == "integer": # A set of integers will be converted to a list in @@ -913,7 +909,7 @@ def _clean_index_names(self, columns, index_col, unnamed_cols): return [None] * len(index_col), columns, index_col cp_cols = list(columns) - index_names: List[Optional[Union[int, str]]] = [] + index_names: list[int | str | None] = [] # don't mutate index_col = list(index_col) @@ -939,7 +935,7 @@ def _clean_index_names(self, columns, index_col, unnamed_cols): return index_names, columns, index_col def _get_empty_meta( - self, columns, index_col, index_names, dtype: Optional[DtypeArg] = None + self, columns, index_col, index_names, dtype: DtypeArg | None = None ): columns = list(columns) @@ -1174,9 +1170,7 @@ def _get_col_names(colspec, columns): return colnames -def _is_potential_multi_index( - columns, index_col: Optional[Union[bool, Sequence[int]]] = None -): +def _is_potential_multi_index(columns, index_col: bool | Sequence[int] | None = None): """ Check whether or not the `columns` parameter could be converted into a MultiIndex. diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 936924e828b79..5784e7b58e523 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pandas._libs.parsers as parsers from pandas._typing import FilePathOrBuffer diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 93a34d0c91a6d..e33362f3e142e 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from collections import ( abc, defaultdict, @@ -9,10 +11,6 @@ from typing import ( DefaultDict, Iterator, - List, - Optional, - Set, - Tuple, cast, ) @@ -43,14 +41,14 @@ class PythonParser(ParserBase): - def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): + def __init__(self, f: Union[FilePathOrBuffer, list], **kwds): """ Workhorse function for processing nested list into DataFrame """ ParserBase.__init__(self, kwds) - self.data: Optional[Iterator[str]] = None - self.buf: List = [] + self.data: Iterator[str] | None = None + self.buf: list = [] self.pos = 0 self.line_pos = 0 @@ -109,7 +107,7 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): # Get columns in two steps: infer from data, then # infer column indices from self.usecols if it is specified. 
- self._col_indices: Optional[List[int]] = None + self._col_indices: list[int] | None = None try: ( self.columns, @@ -141,7 +139,7 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): self.columns = self.columns[0] # get popped off for index - self.orig_names: List[Union[int, str, Tuple]] = list(self.columns) + self.orig_names: list[Union[int, str, tuple]] = list(self.columns) # needs to be cleaned/refactored # multiple date column thing turning into a real spaghetti factory @@ -158,7 +156,7 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): self._col_indices = list(range(len(self.columns))) self._validate_parse_dates_presence(self.columns) - no_thousands_columns: Optional[Set[int]] = None + no_thousands_columns: set[int] | None = None if self.parse_dates: no_thousands_columns = self._set_noconvert_dtype_columns( self._col_indices, self.columns @@ -358,7 +356,7 @@ def _infer_columns(self): names = self.names num_original_columns = 0 clear_buffer = True - unnamed_cols: Set[Optional[Union[int, str]]] = set() + unnamed_cols: set[Union[int, str] | None] = set() if self.header is not None: header = self.header @@ -372,7 +370,7 @@ def _infer_columns(self): have_mi_columns = False header = [header] - columns: List[List[Optional[Union[int, str]]]] = [] + columns: list[list[Union[int, str] | None]] = [] for level, hr in enumerate(header): try: line = self._buffered_line() @@ -401,7 +399,7 @@ def _infer_columns(self): line = self.names[:] - this_columns: List[Optional[Union[int, str]]] = [] + this_columns: list[Union[int, str] | None] = [] this_unnamed_cols = [] for i, c in enumerate(line): @@ -1172,7 +1170,7 @@ def _make_reader(self, f): self.infer_nrows, ) - def _remove_empty_lines(self, lines) -> List: + def _remove_empty_lines(self, lines) -> list: """ Returns the list of lines without the empty ones. With fixed-width fields, empty lines become arrays of empty strings. From f31dab5f91581dd1dc7176d6b61428afa0352ed3 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 24 Apr 2021 08:49:51 -0700 Subject: [PATCH 03/10] remove repr from pyi --- pandas/_libs/interval.pyi | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/_libs/interval.pyi b/pandas/_libs/interval.pyi index b54016ad8bec6..0321bd026d83c 100644 --- a/pandas/_libs/interval.pyi +++ b/pandas/_libs/interval.pyi @@ -42,7 +42,6 @@ class Interval(IntervalMixin): def __init__(self, left, right, closed: str = ...): ... def __contains__(self, key) -> bool: ... - def __repr__(self) -> str: ... def __str__(self) -> str: ... def __add__(self, y): ... def __sub__(self, y): ... @@ -60,7 +59,7 @@ def intervals_to_interval_bounds( class IntervalTree(IntervalMixin): - def __init__(self, left, right, closed='right', leaf_size=100): ... + def __init__(self, left, right, closed=..., leaf_size=...): ... @property def left_sorter(self) -> np.ndarray: ... # np.ndarray[np.intp] @@ -87,6 +86,4 @@ class IntervalTree(IntervalMixin): np.ndarray, # np.ndarray[np.intp] ]: ... - def __repr__(self) -> str: ... - def clear_mapping(self) -> None: ... 
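The next patch makes ``IntervalArray`` generic over a value-constrained ``TypeVar`` so that ``_left`` and ``_right`` resolve to one concrete type per instance. A minimal standalone sketch of that typing pattern, using made-up names rather than the pandas ones:

    from __future__ import annotations

    from typing import Generic, TypeVar

    import numpy as np

    # constrained TypeVar: S must resolve to exactly one of the listed types
    S = TypeVar("S", np.ndarray, list)

    class PairHolder(Generic[S]):
        _left: S
        _right: S

        def __init__(self, left: S, right: S) -> None:
            self._left = left
            self._right = right

    # both attributes are inferred as np.ndarray for this instance
    pair = PairHolder(np.array([0, 1]), np.array([2, 3]))
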
From 07f94e106f29efb07bd407e0f08c4951a64cec09 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 24 Apr 2021 16:17:17 -0700 Subject: [PATCH 04/10] typevar --- pandas/core/arrays/interval.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 1f7c5ebb0452b..9d0accb7d2370 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -8,6 +8,7 @@ import textwrap from typing import ( TYPE_CHECKING, + Generic, Sequence, TypeVar, cast, @@ -93,6 +94,7 @@ ) IntervalArrayT = TypeVar("IntervalArrayT", bound="IntervalArray") +S = TypeVar("S", np.ndarray, DatetimeArray, TimedeltaArray) _interval_shared_docs: dict[str, str] = {} @@ -194,14 +196,14 @@ ), } ) -class IntervalArray(IntervalMixin, ExtensionArray): +class IntervalArray(IntervalMixin, ExtensionArray, Generic[S]): ndim = 1 can_hold_na = True _na_value = _fill_value = np.nan _dtype: IntervalDtype - _left: np.ndarray | DatetimeArray | TimedeltaArray - _right: np.ndarray | DatetimeArray | TimedeltaArray + _left: S + _right: S # --------------------------------------------------------------------- # Constructors @@ -599,9 +601,9 @@ def _validate(self): ) raise ValueError(msg) # error: Item "bool" of "Union[Any, bool]" has no attribute "all" - if not ( + if not ( # type: ignore[union-attr] self._left[left_mask] <= self._right[left_mask] - ).all(): # type: ignore[union-attr] + ).all(): msg = "left side of interval must be <= right side" raise ValueError(msg) @@ -1501,9 +1503,7 @@ def putmask(self, mask: np.ndarray, value) -> None: np.putmask(self._right, mask, value_right) else: self._left.putmask(mask, value_left) - # error: Item "ndarray" of "Union[ndarray, DatetimeArray, TimedeltaArray]" - # has no attribute "putmask" - self._right.putmask(mask, value_right) # type: ignore[union-attr] + self._right.putmask(mask, value_right) def insert(self: IntervalArrayT, loc: int, item: Interval) -> IntervalArrayT: """ @@ -1533,9 +1533,7 @@ def delete(self: IntervalArrayT, loc) -> IntervalArrayT: new_right = np.delete(self._right, loc) else: new_left = self._left.delete(loc) - # error: Item "ndarray" of "Union[ndarray, DatetimeArray, TimedeltaArray]" - # has no attribute "delete" - new_right = self._right.delete(loc) # type: ignore[union-attr] + new_right = self._right.delete(loc) return self._shallow_copy(left=new_left, right=new_right) @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs) From 210fde4313582156d187a154168360899a58e247 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 24 Apr 2021 21:37:14 -0700 Subject: [PATCH 05/10] delayed evaluation --- pandas/core/arrays/interval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 9d0accb7d2370..f595f6f095083 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -94,7 +94,7 @@ ) IntervalArrayT = TypeVar("IntervalArrayT", bound="IntervalArray") -S = TypeVar("S", np.ndarray, DatetimeArray, TimedeltaArray) +S = TypeVar("S", np.ndarray, "DatetimeArray", "TimedeltaArray") _interval_shared_docs: dict[str, str] = {} From cecdecbad4bd5d526dfb128b48e845f659df01af Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 26 Apr 2021 11:20:29 -0700 Subject: [PATCH 06/10] mypy following timestamps.pyi --- pandas/core/arrays/interval.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/interval.py 
b/pandas/core/arrays/interval.py index f595f6f095083..377010cedce18 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -602,7 +602,9 @@ def _validate(self): raise ValueError(msg) # error: Item "bool" of "Union[Any, bool]" has no attribute "all" if not ( # type: ignore[union-attr] - self._left[left_mask] <= self._right[left_mask] + # error: Unsupported operand types for <= ("Timestamp" and "Timedelta") + self._left[left_mask] # type: ignore[operator] + <= self._right[left_mask] ).all(): msg = "left side of interval must be <= right side" raise ValueError(msg) @@ -1379,18 +1381,30 @@ def is_non_overlapping_monotonic(self) -> bool: if self.closed == "both": return bool( # error: Item "bool" of "Union[Any, bool]" has no attribute "all" - (self._right[:-1] < self._left[1:]).all() # type: ignore[union-attr] + # error: Unsupported operand types for > ("Timedelta" and "Timestamp") + ( # type: ignore[union-attr] + self._right[:-1] < self._left[1:] # type: ignore[operator] + ).all() # error: Item "bool" of "Union[Any, bool]" has no attribute "all" - or (self._left[:-1] > self._right[1:]).all() # type: ignore[union-attr] + # error: Unsupported operand types for > ("Timedelta" and "Timestamp") + or ( # type: ignore[union-attr] + self._left[:-1] > self._right[1:] # type: ignore[operator] + ).all() ) # non-strict inequality when closed != 'both'; at least one side is # not included in the intervals, so equality does not imply overlapping return bool( # error: Item "bool" of "Union[Any, bool]" has no attribute "all" - (self._right[:-1] <= self._left[1:]).all() # type: ignore[union-attr] + # error: Unsupported operand types for <= ("Timestamp" and "Timedelta") + ( # type: ignore[union-attr] + self._right[:-1] <= self._left[1:] # type: ignore[operator] + ).all() # error: Item "bool" of "Union[Any, bool]" has no attribute "all" - or (self._left[:-1] >= self._right[1:]).all() # type: ignore[union-attr] + # error: Unsupported operand types for >= ("Timedelta" and "Timestamp") + or ( # type: ignore[union-attr] + self._left[:-1] >= self._right[1:] # type: ignore[operator] + ).all() ) # --------------------------------------------------------------------- From d806625d091031e324ea5d61061b13d6cda1afb7 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 5 May 2021 09:04:27 -0700 Subject: [PATCH 07/10] merge master --- asv_bench/benchmarks/algos/isin.py | 14 + asv_bench/benchmarks/frame_methods.py | 8 + asv_bench/benchmarks/io/style.py | 22 +- ci/code_checks.sh | 13 +- doc/source/ecosystem.rst | 15 + doc/source/user_guide/categorical.rst | 1 + doc/source/user_guide/gotchas.rst | 2 +- doc/source/user_guide/groupby.rst | 31 +- doc/source/user_guide/options.rst | 4 + doc/source/whatsnew/v1.3.0.rst | 41 +- pandas/_libs/arrays.pyi | 45 ++ pandas/_libs/groupby.pyx | 49 +- pandas/_libs/hashtable.pyi | 242 ++++++++++ pandas/_libs/hashtable.pyx | 2 +- pandas/_libs/hashtable_class_helper.pxi.in | 7 +- pandas/_libs/indexing.pyx | 5 +- pandas/_libs/internals.pyx | 4 +- pandas/_libs/lib.pyi | 14 +- pandas/_libs/lib.pyx | 83 +++- pandas/_libs/ops.pyi | 16 +- pandas/_libs/ops.pyx | 42 +- pandas/_libs/parsers.pyi | 4 +- pandas/_libs/parsers.pyx | 43 +- pandas/_libs/reduction.pyx | 100 +++-- pandas/_libs/tslibs/dtypes.pyi | 2 + pandas/_libs/tslibs/nattype.pyi | 6 +- pandas/_libs/tslibs/period.pyi | 158 +++++++ pandas/_libs/tslibs/period.pyx | 6 +- pandas/_testing/_warnings.py | 4 +- pandas/_testing/asserters.py | 24 +- pandas/_typing.py | 6 +- pandas/core/algorithms.py | 117 +++-- pandas/core/apply.py | 
35 +- pandas/core/arrays/_arrow_utils.py | 191 ++++---- pandas/core/arrays/_mixins.py | 90 +--- pandas/core/arrays/base.py | 10 +- pandas/core/arrays/categorical.py | 112 +++-- pandas/core/arrays/datetimelike.py | 17 +- pandas/core/arrays/datetimes.py | 3 +- pandas/core/arrays/interval.py | 6 +- pandas/core/arrays/numpy_.py | 5 +- pandas/core/arrays/period.py | 23 +- pandas/core/arrays/sparse/array.py | 8 +- pandas/core/arrays/string_.py | 12 +- pandas/core/arrays/string_arrow.py | 161 ++++--- pandas/core/arrays/timedeltas.py | 3 +- pandas/core/base.py | 9 +- pandas/core/config_init.py | 16 + pandas/core/dtypes/cast.py | 2 +- pandas/core/frame.py | 119 ++++- pandas/core/generic.py | 98 +--- pandas/core/groupby/categorical.py | 7 +- pandas/core/groupby/generic.py | 324 +++++--------- pandas/core/groupby/groupby.py | 256 +++++++---- pandas/core/groupby/grouper.py | 41 +- pandas/core/groupby/numba_.py | 17 +- pandas/core/groupby/ops.py | 331 ++++++++------ pandas/core/indexers.py | 5 +- pandas/core/indexes/base.py | 28 +- pandas/core/indexes/datetimelike.py | 34 +- pandas/core/indexes/multi.py | 4 +- pandas/core/indexes/numeric.py | 2 +- pandas/core/indexes/range.py | 44 +- pandas/core/internals/api.py | 7 +- pandas/core/internals/array_manager.py | 252 ++++++----- pandas/core/internals/blocks.py | 7 +- pandas/core/internals/managers.py | 370 ++++++++-------- pandas/core/nanops.py | 2 +- pandas/core/resample.py | 167 +++---- pandas/core/reshape/merge.py | 1 + pandas/core/reshape/reshape.py | 14 +- pandas/core/series.py | 10 +- pandas/core/shared_docs.py | 4 +- pandas/core/sorting.py | 49 +- pandas/core/strings/accessor.py | 44 +- pandas/core/strings/base.py | 6 +- pandas/core/strings/object_array.py | 66 +-- pandas/core/tools/numeric.py | 2 +- pandas/core/window/ewm.py | 85 ++-- pandas/core/window/numba_.py | 16 +- pandas/core/window/rolling.py | 24 +- pandas/io/excel/_base.py | 38 +- pandas/io/formats/style_render.py | 7 +- pandas/io/json/_json.py | 30 +- pandas/io/json/_normalize.py | 136 +++--- pandas/io/json/_table_schema.py | 43 +- pandas/io/parsers/base_parser.py | 65 +-- pandas/io/parsers/c_parser_wrapper.py | 4 +- pandas/io/parsers/python_parser.py | 2 +- pandas/io/sql.py | 258 ++++++++--- .../arrays/categorical/test_analytics.py | 8 +- pandas/tests/arrays/categorical/test_api.py | 27 +- pandas/tests/arrays/string_/test_string.py | 20 + pandas/tests/arrays/test_datetimelike.py | 2 +- pandas/tests/dtypes/test_inference.py | 171 ++++++- pandas/tests/extension/base/casting.py | 19 +- pandas/tests/extension/json/array.py | 3 +- pandas/tests/frame/indexing/test_setitem.py | 44 +- pandas/tests/generic/test_series.py | 51 ++- .../tests/groupby/aggregate/test_aggregate.py | 71 ++- pandas/tests/groupby/aggregate/test_cython.py | 3 + pandas/tests/groupby/test_bin_groupby.py | 6 +- pandas/tests/groupby/test_categorical.py | 31 +- pandas/tests/groupby/test_function.py | 6 +- pandas/tests/groupby/test_groupby.py | 11 +- .../tests/groupby/transform/test_transform.py | 8 +- .../indexes/categorical/test_category.py | 20 +- .../indexes/categorical/test_constructors.py | 2 +- pandas/tests/indexes/common.py | 297 ++++++++----- pandas/tests/indexes/datetimelike.py | 58 +-- .../indexes/datetimes/test_datetimelike.py | 13 +- pandas/tests/indexes/interval/test_base.py | 16 +- pandas/tests/indexes/multi/test_setops.py | 12 + .../indexes/{ => numeric}/test_numeric.py | 417 ++++++++---------- pandas/tests/indexes/period/test_period.py | 11 +- .../tests/indexes/ranges/test_constructors.py | 4 +- 
pandas/tests/indexes/ranges/test_range.py | 59 +-- pandas/tests/indexes/test_base.py | 58 +-- .../indexes/timedeltas/test_timedelta.py | 11 +- pandas/tests/indexing/test_iloc.py | 6 +- pandas/tests/internals/test_api.py | 10 + pandas/tests/internals/test_internals.py | 24 +- pandas/tests/io/json/test_readlines.py | 2 +- pandas/tests/io/json/test_ujson.py | 28 +- pandas/tests/io/test_sql.py | 54 +++ pandas/tests/resample/test_datetime_index.py | 5 + .../tests/resample/test_resampler_grouper.py | 8 +- pandas/tests/resample/test_timedelta.py | 2 +- pandas/tests/reshape/concat/test_concat.py | 11 + pandas/tests/reshape/test_crosstab.py | 2 + pandas/tests/reshape/test_pivot.py | 1 - .../scalar/timestamp/test_constructors.py | 7 +- .../series/accessors/test_cat_accessor.py | 6 +- pandas/tests/series/methods/test_clip.py | 20 + pandas/tests/strings/conftest.py | 23 + pandas/tests/strings/test_find_replace.py | 271 +++++++----- pandas/tests/strings/test_strings.py | 39 +- pandas/tests/test_algos.py | 13 +- pandas/tests/util/test_assert_index_equal.py | 24 + pandas/tests/window/test_numba.py | 40 +- 140 files changed, 4235 insertions(+), 2672 deletions(-) create mode 100644 pandas/_libs/arrays.pyi create mode 100644 pandas/_libs/hashtable.pyi create mode 100644 pandas/_libs/tslibs/period.pyi rename pandas/tests/indexes/{ => numeric}/test_numeric.py (56%) diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py index a8b8a193dbcfc..44245295beafc 100644 --- a/asv_bench/benchmarks/algos/isin.py +++ b/asv_bench/benchmarks/algos/isin.py @@ -9,6 +9,8 @@ date_range, ) +from ..pandas_vb_common import tm + class IsIn: @@ -22,6 +24,9 @@ class IsIn: "datetime64[ns]", "category[object]", "category[int]", + "str", + "string", + "arrow_string", ] param_names = ["dtype"] @@ -57,6 +62,15 @@ def setup(self, dtype): self.values = np.random.choice(arr, sample_size) self.series = Series(arr).astype("category") + elif dtype in ["str", "string", "arrow_string"]: + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + + try: + self.series = Series(tm.makeStringIndex(N), dtype=dtype) + except ImportError: + raise NotImplementedError + self.values = list(self.series[:2]) + else: self.series = Series(np.random.randint(1, 10, N)).astype(dtype) self.values = [1, 2] diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 65167e6467fd5..760da36a30075 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -563,6 +563,14 @@ def time_frame_nunique(self): self.df.nunique() +class SeriesNuniqueWithNan: + def setup(self): + self.ser = Series(100000 * (100 * [np.nan] + list(range(100)))).astype(float) + + def time_series_nunique_nan(self): + self.ser.nunique() + + class Duplicated: def setup(self): n = 1 << 20 diff --git a/asv_bench/benchmarks/io/style.py b/asv_bench/benchmarks/io/style.py index e4369d67ca67e..a01610a69278b 100644 --- a/asv_bench/benchmarks/io/style.py +++ b/asv_bench/benchmarks/io/style.py @@ -1,6 +1,9 @@ import numpy as np -from pandas import DataFrame +from pandas import ( + DataFrame, + IndexSlice, +) class Render: @@ -31,6 +34,14 @@ def peakmem_classes_render(self, cols, rows): self._style_classes() self.st._render_html() + def time_format_render(self, cols, rows): + self._style_format() + self.st.render() + + def peakmem_format_render(self, cols, rows): + self._style_format() + self.st.render() + def _style_apply(self): def _apply_func(s): return [ @@ -43,3 +54,12 @@ def 
_style_classes(self): classes = self.df.applymap(lambda v: ("cls-1" if v > 0 else "")) classes.index, classes.columns = self.df.index, self.df.columns self.st = self.df.style.set_td_classes(classes) + + def _style_format(self): + ic = int(len(self.df.columns) / 4 * 3) + ir = int(len(self.df.index) / 4 * 3) + # apply a formatting function + # subset is flexible but hinders vectorised solutions + self.st = self.df.style.format( + "{:,.3f}", subset=IndexSlice["row_1":f"row_{ir}", "float_1":f"float_{ic}"] + ) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index c178e9f7cecbe..7cc171330e01a 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -15,7 +15,7 @@ # $ ./ci/code_checks.sh code # checks on imported code # $ ./ci/code_checks.sh doctests # run doctests # $ ./ci/code_checks.sh docstrings # validate docstring errors -# $ ./ci/code_checks.sh typing # run static type analysis +# $ ./ci/code_checks.sh typing # run static type analysis [[ -z "$1" || "$1" == "lint" || "$1" == "patterns" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "typing" ]] || \ { echo "Unknown command $1. Usage: $0 [lint|patterns|code|doctests|docstrings|typing]"; exit 9999; } @@ -123,6 +123,11 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then MSG='Doctests for directories' ; echo $MSG pytest -q --doctest-modules \ + pandas/_libs/ \ + pandas/api/ \ + pandas/arrays/ \ + pandas/compat/ \ + pandas/core/array_algos/ \ pandas/core/arrays/ \ pandas/core/computation/ \ pandas/core/dtypes/ \ @@ -133,6 +138,12 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then pandas/core/strings/ \ pandas/core/tools/ \ pandas/core/window/ \ + pandas/errors/ \ + pandas/io/clipboard/ \ + pandas/io/json/ \ + pandas/io/excel/ \ + pandas/io/parsers/ \ + pandas/io/sas/ \ pandas/tseries/ RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 56aa734deddd6..d53d0556dca04 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -164,6 +164,21 @@ A good implementation for Python users is `has2k1/plotnine `__ `Python API `__ enables interactive figures and web shareability. Maps, 2D, 3D, and live-streaming graphs are rendered with WebGL and `D3.js `__. The library supports plotting directly from a pandas DataFrame and cloud-based collaboration. Users of `matplotlib, ggplot for Python, and Seaborn `__ can convert figures into interactive web-based plots. Plots can be drawn in `IPython Notebooks `__ , edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly is free for unlimited sharing, and has `cloud `__, `offline `__, or `on-premise `__ accounts for private use. +`Lux `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +`Lux `__ is a Python library that facilitates fast and easy experimentation with data by automating the visual data exploration process. To use Lux, simply add an extra import alongside pandas: + +.. code:: python + + import lux + import pandas as pd + + df = pd.read_csv("data.csv") + df # discover interesting insights! + +By printing out a dataframe, Lux automatically `recommends a set of visualizations `__ that highlights interesting trends and patterns in the dataframe. Users can leverage any existing pandas commands without modifying their code, while being able to visualize their pandas data structures (e.g., DataFrame, Series, Index) at the same time. 
Lux also offers a `powerful, intuitive language `__ that allow users to create `Altair `__, `matplotlib `__, or `Vega-Lite `__ visualizations without having to think at the level of code. + `Qtpandas `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index fba41f73ba819..f65638cd78a2b 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -954,6 +954,7 @@ categorical (categories and ordering). So if you read back the CSV file you have relevant columns back to ``category`` and assign the right categories and categories ordering. .. ipython:: python + :okwarning: import io diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index 180f833a2753d..1de978b195382 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -178,7 +178,7 @@ To test for membership in the values, use the method :meth:`~pandas.Series.isin` For ``DataFrames``, likewise, ``in`` applies to the column axis, testing for membership in the list of column names. -.. _udf-mutation: +.. _gotchas.udf-mutation: Mutating with User Defined Function (UDF) methods ------------------------------------------------- diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index afb2e72cbff07..3f596388ca226 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -739,6 +739,26 @@ optimized Cython implementations: Of course ``sum`` and ``mean`` are implemented on pandas objects, so the above code would work even without the special versions via dispatching (see below). +.. _groupby.aggregate.udfs: + +Aggregations with User-Defined Functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Users can also provide their own functions for custom aggregations. When aggregating +with a User-Defined Function (UDF), the UDF should not mutate the provided ``Series``, see +:ref:`gotchas.udf-mutation` for more information. + +.. ipython:: python + + animals.groupby("kind")[["height"]].agg(lambda x: set(x)) + +The resulting dtype will reflect that of the aggregating function. If the results from different groups have +different dtypes, then a common dtype will be determined in the same way as ``DataFrame`` construction. + +.. ipython:: python + + animals.groupby("kind")[["height"]].agg(lambda x: x.astype(int).sum()) + .. _groupby.transform: Transformation @@ -759,7 +779,11 @@ as the one being grouped. The transform function must: * (Optionally) operates on the entire group chunk. If this is supported, a fast path is used starting from the *second* chunk. -For example, suppose we wished to standardize the data within each group: +Similar to :ref:`groupby.aggregate.udfs`, the resulting dtype will reflect that of the +transformation function. If the results from different groups have different dtypes, then +a common dtype will be determined in the same way as ``DataFrame`` construction. + +Suppose we wished to standardize the data within each group: .. ipython:: python @@ -1065,13 +1089,16 @@ that is itself a series, and possibly upcast the result to a DataFrame: s s.apply(f) - .. note:: ``apply`` can act as a reducer, transformer, *or* filter function, depending on exactly what is passed to it. So depending on the path taken, and exactly what you are grouping. Thus the grouped columns(s) may be included in the output as well as set the indices. 
+Similar to :ref:`groupby.aggregate.udfs`, the resulting dtype will reflect that of the +apply function. If the results from different groups have different dtypes, then +a common dtype will be determined in the same way as ``DataFrame`` construction. + Numba Accelerated Routines -------------------------- diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index 1fcaac1a91d09..278eb907102ed 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -456,6 +456,10 @@ io.hdf.dropna_table True drop ALL nan rows when appe io.parquet.engine None The engine to use as a default for parquet reading and writing. If None then try 'pyarrow' and 'fastparquet' +io.sql.engine None The engine to use as a default for + sql reading and writing, with SQLAlchemy + as a higher level interface. If None + then try 'sqlalchemy' mode.chained_assignment warn Controls ``SettingWithCopyWarning``: 'raise', 'warn', or None. Raise an exception, warn, or no action if diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 6bbaa934eaa12..8c6ad2ca07aec 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -197,7 +197,7 @@ Other enhancements - Add support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`) - :func:`pandas.read_excel` can now auto detect .xlsb files (:issue:`35416`) - :class:`pandas.ExcelWriter` now accepts an ``if_sheet_exists`` parameter to control the behaviour of append mode when writing to existing sheets (:issue:`40230`) -- :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support ``Numba`` execution with the ``engine`` keyword (:issue:`38895`) +- :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.ExponentialMovingWindow.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support ``Numba`` execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`) - :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) - :meth:`DataFrame.apply` can now accept non-callable DataFrame properties as strings, e.g. ``df.apply("size")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) - :meth:`DataFrame.applymap` can now accept kwargs to pass on to func (:issue:`39987`) @@ -298,6 +298,36 @@ Preserve dtypes in :meth:`~pandas.DataFrame.combine_first` combined.dtypes +Group by methods agg and transform no longer changes return dtype for callables +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously the methods :meth:`.DataFrameGroupBy.aggregate`, +:meth:`.SeriesGroupBy.aggregate`, :meth:`.DataFrameGroupBy.transform`, and +:meth:`.SeriesGroupBy.transform` might cast the result dtype when the argument ``func`` +is callable, possibly leading to undesirable results (:issue:`21240`). The cast would +occur if the result is numeric and casting back to the input dtype does not change any +values as measured by ``np.allclose``. Now no such casting occurs. + +.. 
ipython:: python + + df = pd.DataFrame({'key': [1, 1], 'a': [True, False], 'b': [True, True]}) + df + +*pandas 1.2.x* + +.. code-block:: ipython + + In [5]: df.groupby('key').agg(lambda x: x.sum()) + Out[5]: + a b + key + 1 True 2 + +*pandas 1.3.0* + +.. ipython:: python + + df.groupby('key').agg(lambda x: x.sum()) Try operating inplace when setting values with ``loc`` and ``iloc`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -609,7 +639,7 @@ Deprecations - Deprecated using :func:`merge` or :func:`join` on a different number of levels (:issue:`34862`) - Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`) - Deprecated the ``level`` keyword for :class:`DataFrame` and :class:`Series` aggregations; use groupby instead (:issue:`39983`) -- The ``inplace`` parameter of :meth:`Categorical.remove_categories`, :meth:`Categorical.add_categories`, :meth:`Categorical.reorder_categories` is deprecated and will be removed in a future version (:issue:`37643`) +- The ``inplace`` parameter of :meth:`Categorical.remove_categories`, :meth:`Categorical.add_categories`, :meth:`Categorical.reorder_categories`, :meth:`Categorical.rename_categories`, :meth:`Categorical.set_categories` is deprecated and will be removed in a future version (:issue:`37643`) - Deprecated :func:`merge` producing duplicated columns through the ``suffixes`` keyword and already existing columns (:issue:`22818`) .. --------------------------------------------------------------------------- @@ -635,7 +665,7 @@ Performance improvements - Performance improvement in the conversion of pyarrow boolean array to a pandas nullable boolean array (:issue:`41051`) - Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`) - Performance improvement in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable data types (:issue:`37493`) -- +- Performance improvement in :meth:`Series.nunique` with nan values (:issue:`40865`) .. 
--------------------------------------------------------------------------- @@ -696,6 +726,7 @@ Numeric - Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`) - Bug in :meth:`DataFrameGroupBy.rank` giving incorrect results with ``pct=True`` and equal values between consecutive groups (:issue:`40518`) - Bug in :meth:`Series.count` would result in an ``int32`` result on 32-bit platforms when argument ``level=None`` (:issue:`40908`) +- Bug in :meth:`Series.clip` would fail if series contains NA values and has nullable int or float as a data type (:issue:`40851`) Conversion ^^^^^^^^^^ @@ -749,8 +780,10 @@ Indexing - Bug in :meth:`DatetimeIndex.insert` when inserting ``np.datetime64("NaT")`` into a timezone-aware index incorrectly treating the timezone-naive value as timezone-aware (:issue:`39769`) - Bug in incorrectly raising in :meth:`Index.insert`, when setting a new column that cannot be held in the existing ``frame.columns``, or in :meth:`Series.reset_index` or :meth:`DataFrame.reset_index` instead of casting to a compatible dtype (:issue:`39068`) - Bug in :meth:`RangeIndex.append` where a single object of length 1 was concatenated incorrectly (:issue:`39401`) +- Bug in :meth:`RangeIndex.astype` where when converting to :class:`CategoricalIndex`, the categories became a :class:`Int64Index` instead of a :class:`RangeIndex` (:issue:`41263`) - Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`) - Bug in setting numeric values into a into a boolean-dtypes :class:`Series` using ``at`` or ``iat`` failing to cast to object-dtype (:issue:`39582`) +- Bug in :meth:`DataFrame.__setitem__` and :meth:`DataFrame.iloc.__setitem__` raising ``ValueError`` when trying to index with a row-slice and setting a list as values (:issue:`40440`) - Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`40096`) - Bug in :meth:`DataFrame.loc` incorrectly matching non-boolean index elements (:issue:`20432`) - Bug in :meth:`Series.__delitem__` with ``ExtensionDtype`` incorrectly casting to ``ndarray`` (:issue:`40386`) @@ -851,6 +884,7 @@ Groupby/resample/rolling - Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` computing wrong result with nullable data types too large to roundtrip when casting to float (:issue:`37493`) - Bug in :meth:`DataFrame.rolling` returning mean zero for all ``NaN`` window with ``min_periods=0`` if calculation is not numerical stable (:issue:`41053`) - Bug in :meth:`DataFrame.rolling` returning sum not zero for all ``NaN`` window with ``min_periods=0`` if calculation is not numerical stable (:issue:`41053`) +- Bug in :meth:`SeriesGroupBy.agg` failing to retain ordered :class:`CategoricalDtype` on order-preserving aggregations (:issue:`41147`) - Bug in :meth:`DataFrameGroupBy.min` and :meth:`DataFrameGroupBy.max` with multiple object-dtype columns and ``numeric_only=False`` incorrectly raising ``ValueError`` (:issue:41111`) Reshaping @@ -912,6 +946,7 @@ Other - Bug in :meth:`Series.where` with numeric dtype and ``other = None`` not casting to ``nan`` (:issue:`39761`) - :meth:`Index.where` behavior now mirrors :meth:`Index.putmask` behavior, i.e. 
``index.where(mask, other)`` matches ``index.putmask(~mask, other)`` (:issue:`39412`) - Bug in :func:`pandas.testing.assert_series_equal`, :func:`pandas.testing.assert_frame_equal`, :func:`pandas.testing.assert_index_equal` and :func:`pandas.testing.assert_extension_array_equal` incorrectly raising when an attribute has an unrecognized NA type (:issue:`39461`) +- Bug in :func:`pandas.testing.assert_index_equal` with ``exact=True`` not raising when comparing :class:`CategoricalIndex` instances with ``Int64Index`` and ``RangeIndex`` categories (:issue:`41263`) - Bug in :meth:`DataFrame.equals`, :meth:`Series.equals`, :meth:`Index.equals` with object-dtype containing ``np.datetime64("NaT")`` or ``np.timedelta64("NaT")`` (:issue:`39650`) - Bug in :func:`pandas.util.show_versions` where console JSON output was not proper JSON (:issue:`39701`) - Bug in :meth:`DataFrame.convert_dtypes` incorrectly raised ValueError when called on an empty DataFrame (:issue:`40393`) diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi new file mode 100644 index 0000000000000..0ca501c5b712c --- /dev/null +++ b/pandas/_libs/arrays.pyi @@ -0,0 +1,45 @@ +from typing import Sequence + +import numpy as np + +from pandas._typing import ( + DtypeObj, + Shape, +) + +class NDArrayBacked: + _dtype: DtypeObj + _ndarray: np.ndarray + + def __init__(self, values: np.ndarray, dtype: DtypeObj): ... + + @classmethod + def _simple_new(cls, values: np.ndarray, dtype: DtypeObj): ... + + def _from_backing_data(self, values: np.ndarray): ... + + def __setstate__(self, state): ... + + def __len__(self) -> int: ... + + @property + def shape(self) -> Shape: ... + + @property + def ndim(self) -> int: ... + + @property + def size(self) -> int: ... + + @property + def nbytes(self) -> int: ... + + def copy(self): ... + def delete(self, loc, axis=0): ... + def swapaxes(self, axis1, axis2): ... + def repeat(self, repeats: int | Sequence[int], axis: int | None = ...): ... + def reshape(self, *args, **kwargs): ... + def ravel(self, order="C"): ... + + @property + def T(self): ... diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 3fa92ce2229c3..8637d50745195 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -469,18 +469,19 @@ def group_any_all(int8_t[::1] out, # group_add, group_prod, group_var, group_mean, group_ohlc # ---------------------------------------------------------------------- -ctypedef fused complexfloating_t: +ctypedef fused add_t: float64_t float32_t complex64_t complex128_t + object @cython.wraparound(False) @cython.boundscheck(False) -def group_add(complexfloating_t[:, ::1] out, +def group_add(add_t[:, ::1] out, int64_t[::1] counts, - ndarray[complexfloating_t, ndim=2] values, + ndarray[add_t, ndim=2] values, const intp_t[:] labels, Py_ssize_t min_count=0) -> None: """ @@ -488,8 +489,8 @@ def group_add(complexfloating_t[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - complexfloating_t val, count, t, y - complexfloating_t[:, ::1] sumx, compensation + add_t val, t, y + add_t[:, ::1] sumx, compensation int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) @@ -503,7 +504,8 @@ def group_add(complexfloating_t[:, ::1] out, N, K = (values).shape - with nogil: + if add_t is object: + # NB: this does not use 'compensation' like the non-object track does. 
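+        # Kahan (compensated) summation needs subtraction on the running totals,
+        # which object values (e.g. str) may not support, so the object path below
+        # does plain accumulation and runs with the GIL held.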
for i in range(N): lab = labels[i] if lab < 0: @@ -516,9 +518,13 @@ def group_add(complexfloating_t[:, ::1] out, # not nan if val == val: nobs[lab, j] += 1 - y = val - compensation[lab, j] - t = sumx[lab, j] + y - compensation[lab, j] = t - sumx[lab, j] - y + + if nobs[lab, j] == 1: + # i.e. we havent added anything yet; avoid TypeError + # if e.g. val is a str and sumx[lab, j] is 0 + t = val + else: + t = sumx[lab, j] + val sumx[lab, j] = t for i in range(ncounts): @@ -527,6 +533,31 @@ def group_add(complexfloating_t[:, ::1] out, out[i, j] = NAN else: out[i, j] = sumx[i, j] + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + y = val - compensation[lab, j] + t = sumx[lab, j] + y + compensation[lab, j] = t - sumx[lab, j] - y + sumx[lab, j] = t + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] < min_count: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] @cython.wraparound(False) diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi new file mode 100644 index 0000000000000..b6278b3956a1d --- /dev/null +++ b/pandas/_libs/hashtable.pyi @@ -0,0 +1,242 @@ +from typing import ( + Any, + Hashable, + Literal, +) + +import numpy as np + +def unique_label_indices( + labels: np.ndarray, # const int64_t[:] +) -> np.ndarray: ... + + +class Factorizer: + table: PyObjectHashTable + uniques: ObjectVector + count: int + + def __init__(self, size_hint: int): ... + def get_count(self) -> int: ... + + def factorize( + self, + values: np.ndarray, # np.ndarray[object] + sort: bool = ..., + na_sentinel=..., + na_value=..., + ) -> np.ndarray: ... # np.ndarray[intp] + + def unique( + self, + values: np.ndarray, # np.ndarray[object] + ) -> np.ndarray: ... # np.ndarray[object] + + +class Int64Factorizer: + table: Int64HashTable + uniques: Int64Vector + count: int + + def __init__(self, size_hint: int): ... + def get_count(self) -> int: ... + + def factorize( + self, + values: np.ndarray, # const int64_t[:] + sort: bool = ..., + na_sentinel=..., + na_value=..., + ) -> np.ndarray: ... # np.ndarray[intp] + + +class Int64Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.int64] + +class Int32Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.int32] + +class Int16Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.int16] + +class Int8Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.int8] + +class UInt64Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint64] + +class UInt32Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint32] + +class UInt16Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint16] + +class UInt8Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.uint8] + +class Float64Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.float64] + +class Float32Vector: + def __init__(self): ... 
+ def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.float32] + +class Complex128Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.complex128] + +class Complex64Vector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[np.complex64] + +class StringVector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[object] + +class ObjectVector: + def __init__(self): ... + def __len__(self) -> int: ... + def to_array(self) -> np.ndarray: ... # np.ndarray[object] + + +class HashTable: + # NB: The base HashTable class does _not_ actually have these methods; + # we are putting the here for the sake of mypy to avoid + # reproducing them in each subclass below. + def __init__(self, size_hint: int = ...): ... + def __len__(self) -> int: ... + def __contains__(self, key: Hashable) -> bool: ... + def sizeof(self, deep: bool = ...) -> int: ... + def get_state(self) -> dict[str, int]: ... + + # TODO: `item` type is subclass-specific + def get_item(self, item): ... # TODO: return type? + def set_item(self, item) -> None: ... + + # FIXME: we don't actually have this for StringHashTable or ObjectHashTable? + def map( + self, + keys: np.ndarray, # np.ndarray[subclass-specific] + values: np.ndarray, # const int64_t[:] values + ) -> None: ... + + def map_locations( + self, + values: np.ndarray, # np.ndarray[subclass-specific] + ) -> None: ... + + def lookup( + self, + values: np.ndarray, # np.ndarray[subclass-specific] + ) -> np.ndarray: ... # np.ndarray[np.intp] + + def get_labels( + self, + values: np.ndarray, # np.ndarray[subclass-specific] + uniques, # SubclassTypeVector + count_prior: int = ..., + na_sentinel: int = ..., + na_value: object = ..., + ) -> np.ndarray: ... # np.ndarray[intp_t] + + def unique( + self, + values: np.ndarray, # np.ndarray[subclass-specific] + return_inverse: bool = ..., + ) -> tuple[ + np.ndarray, # np.ndarray[subclass-specific] + np.ndarray, # np.ndarray[np.intp], + ] | np.ndarray: ... # np.ndarray[subclass-specific] + + def _unique( + self, + values: np.ndarray, # np.ndarray[subclass-specific] + uniques, # FooVector + count_prior: int = ..., + na_sentinel: int = ..., + na_value: object = ..., + ignore_na: bool = ..., + return_inverse: bool = ..., + ) -> tuple[ + np.ndarray, # np.ndarray[subclass-specific] + np.ndarray, # np.ndarray[np.intp], + ] | np.ndarray: ... # np.ndarray[subclass-specific] + + def factorize( + self, + values: np.ndarray, # np.ndarray[subclass-specific] + na_sentinel: int = ..., + na_value: object = ..., + mask=..., + ) -> tuple[ + np.ndarray, # np.ndarray[subclass-specific] + np.ndarray, # np.ndarray[np.intp], + ]: ... + +class Complex128HashTable(HashTable): ... +class Complex64HashTable(HashTable): ... +class Float64HashTable(HashTable): ... +class Float32HashTable(HashTable): ... + +class Int64HashTable(HashTable): + # Only Int64HashTable has get_labels_groupby + def get_labels_groupby( + self, + values: np.ndarray, # const int64_t[:] + ) -> tuple[ + np.ndarray, # np.ndarray[np.intp] + np.ndarray, # np.ndarray[np.int64] + ]: ... + +class Int32HashTable(HashTable): ... +class Int16HashTable(HashTable): ... +class Int8HashTable(HashTable): ... +class UInt64HashTable(HashTable): ... +class UInt32HashTable(HashTable): ... +class UInt16HashTable(HashTable): ... +class UInt8HashTable(HashTable): ... + +class StringHashTable(HashTable): ... 
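+# Illustrative use of the runtime classes these stubs describe (a sketch of
+# typical internal usage, not part of the stub interface itself):
+#
+#   import numpy as np
+#   from pandas._libs import hashtable as htable
+#
+#   table = htable.Int64HashTable()
+#   uniques = table.unique(np.array([1, 2, 2, 3], dtype=np.int64))
+#   # uniques -> array([1, 2, 3])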
+class PyObjectHashTable(HashTable): ... + + +def duplicated_int64( + values: np.ndarray, # const int64_t[:] values + keep: Literal["last", "first", False] = ..., +) -> np.ndarray: ... # np.ndarray[bool] +# TODO: Is it actually bool or is it uint8? + +def mode_int64( + values: np.ndarray, # const int64_t[:] values + dropna: bool, +) -> np.ndarray: ... # np.ndarray[np.int64] + +def value_count_int64( + values: np.ndarray, # const int64_t[:] + dropna: bool, +) -> tuple[ + np.ndarray, # np.ndarray[np.int64] + np.ndarray, # np.ndarray[np.int64] +]: ... diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 1e2a336f12444..4566f22be2c36 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -163,7 +163,7 @@ cdef class Int64Factorizer: @cython.wraparound(False) @cython.boundscheck(False) -def unique_label_indices(const int64_t[:] labels): +def unique_label_indices(const int64_t[:] labels) -> ndarray: """ Indices of the first occurrences of the unique labels *excluding* -1. equivalent to: diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index a25867c4a3b0c..4cacd3245f9d8 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -680,6 +680,7 @@ cdef class {{name}}HashTable(HashTable): def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): + # -> np.ndarray[np.intp] _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, ignore_na=True, return_inverse=True) @@ -1012,7 +1013,7 @@ cdef class StringHashTable(HashTable): ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse) + labels : ndarray[intp_t] (if return_inverse) The labels from values to uniques """ uniques = ObjectVector() @@ -1045,7 +1046,7 @@ cdef class StringHashTable(HashTable): ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[intp] The labels from values to uniques """ uniques_vector = ObjectVector() @@ -1056,6 +1057,7 @@ cdef class StringHashTable(HashTable): def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): + # -> np.ndarray[np.intp] _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, ignore_na=True, return_inverse=True) @@ -1310,6 +1312,7 @@ cdef class PyObjectHashTable(HashTable): def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): + # -> np.ndarray[np.intp] _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, ignore_na=True, return_inverse=True) diff --git a/pandas/_libs/indexing.pyx b/pandas/_libs/indexing.pyx index 7966fe8d4f045..bdbaa05138072 100644 --- a/pandas/_libs/indexing.pyx +++ b/pandas/_libs/indexing.pyx @@ -3,9 +3,10 @@ cdef class NDFrameIndexerBase: A base class for _NDFrameIndexer for fast instantiation and attribute access. 
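+    ``name`` is the indexer name, e.g. "loc" or "iloc".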
""" cdef public: - object obj, name, _ndim + str name + object obj, _ndim - def __init__(self, name, obj): + def __init__(self, name: str, obj): self.obj = obj self.name = name self._ndim = None diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index f3bc70ad8a26b..77bb462c6df4a 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -372,7 +372,9 @@ cdef slice indexer_as_slice(intp_t[:] vals): @cython.boundscheck(False) @cython.wraparound(False) -def get_blkno_indexers(int64_t[:] blknos, bint group=True): +def get_blkno_indexers( + int64_t[:] blknos, bint group=True +) -> list[tuple[int, slice | np.ndarray]]: """ Enumerate contiguous runs of integers in ndarray. diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 8af7c1a026fc6..9dbc47f1d40f7 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -114,12 +114,24 @@ def maybe_convert_objects( convert_to_nullable_integer: bool = ..., ) -> ArrayLike: ... +@overload def maybe_convert_numeric( values: np.ndarray, # np.ndarray[object] na_values: set, convert_empty: bool = True, coerce_numeric: bool = False, -) -> np.ndarray: ... + convert_to_masked_nullable: Literal[False] = ..., +) -> tuple[np.ndarray, None]: ... + +@overload +def maybe_convert_numeric( + values: np.ndarray, # np.ndarray[object] + na_values: set, + convert_empty: bool = True, + coerce_numeric: bool = False, + *, + convert_to_masked_nullable: Literal[True], +) -> tuple[np.ndarray, np.ndarray]: ... # TODO: restrict `arr`? def ensure_string_array( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c18cd56be6cc4..e1cb744c7033c 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2029,7 +2029,8 @@ def maybe_convert_numeric( set na_values, bint convert_empty=True, bint coerce_numeric=False, -) -> ndarray: + bint convert_to_masked_nullable=False, +) -> tuple[np.ndarray, np.ndarray | None]: """ Convert object array to a numeric array if possible. @@ -2053,14 +2054,20 @@ def maybe_convert_numeric( numeric array has no suitable numerical dtype to return (i.e. uint64, int32, uint8). If set to False, the original object array will be returned. Otherwise, a ValueError will be raised. - + convert_to_masked_nullable : bool, default False + Whether to return a mask for the converted values. This also disables + upcasting for ints with nulls to float64. Returns ------- np.ndarray Array of converted object values to numerical ones. + + Optional[np.ndarray] + If convert_to_masked_nullable is True, + returns a boolean mask for the converted values, otherwise returns None. 
""" if len(values) == 0: - return np.array([], dtype='i8') + return (np.array([], dtype='i8'), None) # fastpath for ints - try to convert all based on first value cdef: @@ -2070,7 +2077,7 @@ def maybe_convert_numeric( try: maybe_ints = values.astype('i8') if (maybe_ints == values).all(): - return maybe_ints + return (maybe_ints, None) except (ValueError, OverflowError, TypeError): pass @@ -2084,21 +2091,40 @@ def maybe_convert_numeric( ndarray[int64_t] ints = np.empty(n, dtype='i8') ndarray[uint64_t] uints = np.empty(n, dtype='u8') ndarray[uint8_t] bools = np.empty(n, dtype='u1') + ndarray[uint8_t] mask = np.zeros(n, dtype="u1") float64_t fval + bint allow_null_in_int = convert_to_masked_nullable for i in range(n): val = values[i] + # We only want to disable NaNs showing as float if + # a) convert_to_masked_nullable = True + # b) no floats have been seen ( assuming an int shows up later ) + # However, if no ints present (all null array), we need to return floats + allow_null_in_int = convert_to_masked_nullable and not seen.float_ if val.__hash__ is not None and val in na_values: - seen.saw_null() + if allow_null_in_int: + seen.null_ = True + mask[i] = 1 + else: + if convert_to_masked_nullable: + mask[i] = 1 + seen.saw_null() floats[i] = complexes[i] = NaN elif util.is_float_object(val): fval = val if fval != fval: seen.null_ = True - + if allow_null_in_int: + mask[i] = 1 + else: + if convert_to_masked_nullable: + mask[i] = 1 + seen.float_ = True + else: + seen.float_ = True floats[i] = complexes[i] = fval - seen.float_ = True elif util.is_integer_object(val): floats[i] = complexes[i] = val @@ -2121,7 +2147,13 @@ def maybe_convert_numeric( floats[i] = uints[i] = ints[i] = bools[i] = val seen.bool_ = True elif val is None or val is C_NA: - seen.saw_null() + if allow_null_in_int: + seen.null_ = True + mask[i] = 1 + else: + if convert_to_masked_nullable: + mask[i] = 1 + seen.saw_null() floats[i] = complexes[i] = NaN elif hasattr(val, '__len__') and len(val) == 0: if convert_empty or seen.coerce_numeric: @@ -2142,9 +2174,11 @@ def maybe_convert_numeric( if fval in na_values: seen.saw_null() floats[i] = complexes[i] = NaN + mask[i] = 1 else: if fval != fval: seen.null_ = True + mask[i] = 1 floats[i] = fval @@ -2152,7 +2186,10 @@ def maybe_convert_numeric( as_int = int(val) if as_int in na_values: - seen.saw_null() + mask[i] = 1 + seen.null_ = True + if not allow_null_in_int: + seen.float_ = True else: seen.saw_int(as_int) @@ -2180,22 +2217,34 @@ def maybe_convert_numeric( floats[i] = NaN if seen.check_uint64_conflict(): - return values + return (values, None) + + # This occurs since we disabled float nulls showing as null in anticipation + # of seeing ints that were never seen. 
So then, we return float + if allow_null_in_int and seen.null_ and not seen.int_: + seen.float_ = True if seen.complex_: - return complexes + return (complexes, None) elif seen.float_: - return floats + if seen.null_ and convert_to_masked_nullable: + return (floats, mask.view(np.bool_)) + return (floats, None) elif seen.int_: + if seen.null_ and convert_to_masked_nullable: + if seen.uint_: + return (uints, mask.view(np.bool_)) + else: + return (ints, mask.view(np.bool_)) if seen.uint_: - return uints + return (uints, None) else: - return ints + return (ints, None) elif seen.bool_: - return bools.view(np.bool_) + return (bools.view(np.bool_), None) elif seen.uint_: - return uints - return ints + return (uints, None) + return (ints, None) @cython.boundscheck(False) diff --git a/pandas/_libs/ops.pyi b/pandas/_libs/ops.pyi index b4f42f217a5db..11d67dfb93d5f 100644 --- a/pandas/_libs/ops.pyi +++ b/pandas/_libs/ops.pyi @@ -1,6 +1,8 @@ from typing import ( Any, Callable, + Literal, + overload, ) import numpy as np @@ -35,9 +37,19 @@ def vec_binop( op: _BinOp, # binary operator ) -> np.ndarray: ... +@overload +def maybe_convert_bool( + arr: np.ndarray, # np.ndarray[object] + true_values=..., + false_values=..., + convert_to_masked_nullable: Literal[False] = ..., +) -> tuple[np.ndarray, None]: ... +@overload def maybe_convert_bool( arr: np.ndarray, # np.ndarray[object] true_values=..., - false_values=... -) -> np.ndarray: ... + false_values=..., + *, + convert_to_masked_nullable: Literal[True], +) -> tuple[np.ndarray, np.ndarray]: ... diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index 7951bb5c093ef..ac8a7f2cc57f7 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -24,10 +24,7 @@ import_array() from pandas._libs.missing cimport checknull -from pandas._libs.util cimport ( - UINT8_MAX, - is_nan, -) +from pandas._libs.util cimport is_nan @cython.wraparound(False) @@ -212,7 +209,7 @@ def scalar_binop(object[:] values, object val, object op) -> ndarray: else: result[i] = op(x, val) - return maybe_convert_bool(result.base) + return maybe_convert_bool(result.base)[0] @cython.wraparound(False) @@ -254,21 +251,25 @@ def vec_binop(object[:] left, object[:] right, object op) -> ndarray: else: raise - return maybe_convert_bool(result.base) # `.base` to access np.ndarray + return maybe_convert_bool(result.base)[0] # `.base` to access np.ndarray def maybe_convert_bool(ndarray[object] arr, - true_values=None, false_values=None) -> ndarray: + true_values=None, + false_values=None, + convert_to_masked_nullable=False + ) -> tuple[np.ndarray, np.ndarray | None]: cdef: Py_ssize_t i, n ndarray[uint8_t] result + ndarray[uint8_t] mask object val set true_vals, false_vals - int na_count = 0 + bint has_na = False n = len(arr) result = np.empty(n, dtype=np.uint8) - + mask = np.zeros(n, dtype=np.uint8) # the defaults true_vals = {'True', 'TRUE', 'true'} false_vals = {'False', 'FALSE', 'false'} @@ -291,16 +292,19 @@ def maybe_convert_bool(ndarray[object] arr, result[i] = 1 elif val in false_vals: result[i] = 0 - elif isinstance(val, float): - result[i] = UINT8_MAX - na_count += 1 + elif is_nan(val): + mask[i] = 1 + result[i] = 0 # Value here doesn't matter, will be replaced w/ nan + has_na = True else: - return arr + return (arr, None) - if na_count > 0: - mask = result == UINT8_MAX - arr = result.view(np.bool_).astype(object) - np.putmask(arr, mask, np.nan) - return arr + if has_na: + if convert_to_masked_nullable: + return (result.view(np.bool_), mask.view(np.bool_)) + else: + arr = 
result.view(np.bool_).astype(object) + np.putmask(arr, mask, np.nan) + return (arr, None) else: - return result.view(np.bool_) + return (result.view(np.bool_), None) diff --git a/pandas/_libs/parsers.pyi b/pandas/_libs/parsers.pyi index 18ae23e7fb90d..92b970d47467e 100644 --- a/pandas/_libs/parsers.pyi +++ b/pandas/_libs/parsers.pyi @@ -31,8 +31,8 @@ class TextReader: source, delimiter: bytes | str = ..., # single-character only header=..., - header_start=..., - header_end=..., + header_start: int = ..., # int64_t + header_end: int = ..., # uint64_t index_col=..., names=..., tokenize_chunksize: int = ..., # int64_t diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 2abb7e0ea3ac2..8d9f1773590b0 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -101,13 +101,13 @@ from pandas.errors import ( from pandas.core.dtypes.common import ( is_bool_dtype, - is_categorical_dtype, is_datetime64_dtype, is_extension_array_dtype, is_float_dtype, is_integer_dtype, is_object_dtype, ) +from pandas.core.dtypes.dtypes import CategoricalDtype cdef: float64_t INF = np.inf @@ -305,35 +305,36 @@ cdef class TextReader: object na_fvalues object true_values, false_values object handle + object orig_header bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns - uint64_t parser_start + bint mangle_dupe_cols, allow_leading_cols + uint64_t parser_start # this is modified after __init__ list clocks const char *encoding_errors kh_str_starts_t *false_set kh_str_starts_t *true_set + int64_t buffer_lines, skipfooter + list dtype_cast_order # list[np.dtype] + list names # can be None + set noconvert # set[int] cdef public: - int64_t leading_cols, table_width, skipfooter, buffer_lines - bint allow_leading_cols, mangle_dupe_cols - bint delim_whitespace + int64_t leading_cols, table_width object delimiter # bytes or str object converters object na_values - object orig_header, names, header_start, header_end list header # list[list[non-negative integers]] object index_col object skiprows object dtype object usecols - list dtype_cast_order # list[np.dtype] set unnamed_cols # set[str] - set noconvert # set[int] def __cinit__(self, source, delimiter=b',', # bytes | str header=0, - header_start=0, - header_end=0, + int64_t header_start=0, + uint64_t header_end=0, index_col=None, names=None, tokenize_chunksize=DEFAULT_CHUNKSIZE, @@ -457,7 +458,6 @@ cdef class TextReader: self.parser.warn_bad_lines = 0 self.delimiter = delimiter - self.delim_whitespace = delim_whitespace self.na_values = na_values if na_fvalues is None: @@ -502,7 +502,7 @@ cdef class TextReader: # header stuff self.allow_leading_cols = allow_leading_cols - self.leading_cols = 0 + self.leading_cols = 0 # updated in _get_header # TODO: no header vs. 
header is not the first row self.has_mi_columns = 0 @@ -535,10 +535,11 @@ cdef class TextReader: self.parser.header_end = header self.parser_start = header + 1 self.parser.header = header - prelim_header = [ header ] + prelim_header = [header] self.names = names header, table_width, unnamed_cols = self._get_header(prelim_header) + # header, table_width, and unnamed_cols are set here, never changed self.header = header self.table_width = table_width self.unnamed_cols = unnamed_cols @@ -618,6 +619,11 @@ cdef class TextReader: cdef _get_header(self, list prelim_header): # header is now a list of lists, so field_count should use header[0] + # + # modifies: + # self.parser attributes + # self.parser_start + # self.leading_cols cdef: Py_ssize_t i, start, field_count, passed_count, unnamed_count, level @@ -710,7 +716,7 @@ cdef class TextReader: header.append(this_header) if self.names is not None: - header = [ self.names ] + header = [self.names] elif self.names is not None: # Enforce this unless usecols @@ -721,7 +727,7 @@ cdef class TextReader: if self.parser.lines < 1: self._tokenize_rows(1) - header = [ self.names ] + header = [self.names] if self.parser.lines < 1: field_count = len(header[0]) @@ -778,7 +784,7 @@ cdef class TextReader: """ # Conserve intermediate space # Caller is responsible for concatenating chunks, - # see c_parser_wrapper._concatenatve_chunks + # see c_parser_wrapper._concatenate_chunks cdef: size_t rows_read = 0 list chunks = [] @@ -885,7 +891,7 @@ cdef class TextReader: cdef _start_clock(self): self.clocks.append(time.time()) - cdef _end_clock(self, what): + cdef _end_clock(self, str what): if self.verbose: elapsed = time.time() - self.clocks.pop(-1) print(f'{what} took: {elapsed * 1000:.2f} ms') @@ -1090,7 +1096,7 @@ cdef class TextReader: bint user_dtype, kh_str_starts_t *na_hashset, object na_flist): - if is_categorical_dtype(dtype): + if isinstance(dtype, CategoricalDtype): # TODO: I suspect that _categorical_convert could be # optimized when dtype is an instance of CategoricalDtype codes, cats, na_count = _categorical_convert( @@ -1205,6 +1211,7 @@ cdef class TextReader: return self.converters.get(i) cdef _get_na_list(self, Py_ssize_t i, name): + # Note: updates self.na_values, self.na_fvalues if self.na_values is None: return None, set() diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 5b958163159aa..6b5cedf8a5243 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -21,18 +21,21 @@ from pandas._libs.util cimport ( set_array_not_contiguous, ) -from pandas._libs.lib import ( - is_scalar, - maybe_convert_objects, -) +from pandas._libs.lib import is_scalar + +cdef cnp.dtype _dtype_obj = np.dtype("object") -cpdef check_result_array(object obj, Py_ssize_t cnt): - if (is_array(obj) or - (isinstance(obj, list) and len(obj) == cnt) or - getattr(obj, 'shape', None) == (cnt,)): - raise ValueError('Must produce aggregated value') +cpdef check_result_array(object obj, object dtype): + # Our operation is supposed to be an aggregation/reduction. If + # it returns an ndarray, this likely means an invalid operation has + # been passed. See test_apply_without_aggregation, test_agg_must_agg + if is_array(obj): + if dtype != _dtype_obj: + # If it is object dtype, the function can be a reduction/aggregation + # and still return an ndarray e.g. 
test_agg_over_numpy_arrays + raise ValueError("Must produce aggregated value") cdef class _BaseGrouper: @@ -53,27 +56,27 @@ cdef class _BaseGrouper: return values, index - cdef inline _update_cached_objs(self, object cached_typ, object cached_ityp, + cdef _init_dummy_series_and_index(self, Slider islider, Slider vslider): + """ + Create Series and Index objects that we will alter in-place while iterating. + """ + cached_index = self.ityp(islider.buf, dtype=self.idtype) + cached_series = self.typ( + vslider.buf, dtype=vslider.buf.dtype, index=cached_index, name=self.name + ) + return cached_index, cached_series + + cdef inline _update_cached_objs(self, object cached_series, object cached_index, Slider islider, Slider vslider): - if cached_typ is None: - cached_ityp = self.ityp(islider.buf, dtype=self.idtype) - cached_typ = self.typ( - vslider.buf, dtype=vslider.buf.dtype, index=cached_ityp, name=self.name - ) - else: - # See the comment in indexes/base.py about _index_data. - # We need this for EA-backed indexes that have a reference - # to a 1-d ndarray like datetime / timedelta / period. - object.__setattr__(cached_ityp, '_index_data', islider.buf) - cached_ityp._engine.clear_mapping() - cached_ityp._cache.clear() # e.g. inferred_freq must go - cached_typ._mgr.set_values(vslider.buf) - object.__setattr__(cached_typ, '_index', cached_ityp) - object.__setattr__(cached_typ, 'name', self.name) - return cached_typ, cached_ityp + # See the comment in indexes/base.py about _index_data. + # We need this for EA-backed indexes that have a reference + # to a 1-d ndarray like datetime / timedelta / period. + cached_index._engine.clear_mapping() + cached_index._cache.clear() # e.g. inferred_freq must go + cached_series._mgr.set_values(vslider.buf) cdef inline object _apply_to_group(self, - object cached_typ, object cached_ityp, + object cached_series, object cached_index, bint initialized): """ Call self.f on our new group, then update to the next group. @@ -81,17 +84,15 @@ cdef class _BaseGrouper: cdef: object res - cached_ityp._engine.clear_mapping() - cached_ityp._cache.clear() # e.g. inferred_freq must go - res = self.f(cached_typ) + # NB: we assume that _update_cached_objs has already cleared cleared + # the cache and engine mapping + res = self.f(cached_series) res = extract_result(res) if not initialized: # On the first pass, we check the output shape to see # if this looks like a reduction. 
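+            # (an ndarray result usually means the UDF did not aggregate;
+            # check_result_array raises for that unless the values are object dtype)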
initialized = True - # In all tests other than test_series_grouper and - # test_series_bin_grouper, we have len(self.dummy_arr) == 0 - check_result_array(res, len(self.dummy_arr)) + check_result_array(res, cached_series.dtype) return res, initialized @@ -142,7 +143,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): object res bint initialized = 0 Slider vslider, islider - object cached_typ = None, cached_ityp = None + object cached_series = None, cached_index = None counts = np.zeros(self.ngroups, dtype=np.int64) @@ -162,6 +163,10 @@ cdef class SeriesBinGrouper(_BaseGrouper): result = np.empty(self.ngroups, dtype='O') + cached_index, cached_series = self._init_dummy_series_and_index( + islider, vslider + ) + start = 0 try: for i in range(self.ngroups): @@ -171,10 +176,10 @@ cdef class SeriesBinGrouper(_BaseGrouper): islider.move(start, end) vslider.move(start, end) - cached_typ, cached_ityp = self._update_cached_objs( - cached_typ, cached_ityp, islider, vslider) + self._update_cached_objs( + cached_series, cached_index, islider, vslider) - res, initialized = self._apply_to_group(cached_typ, cached_ityp, + res, initialized = self._apply_to_group(cached_series, cached_index, initialized) start += group_size @@ -185,7 +190,6 @@ cdef class SeriesBinGrouper(_BaseGrouper): islider.reset() vslider.reset() - result = maybe_convert_objects(result) return result, counts @@ -236,7 +240,7 @@ cdef class SeriesGrouper(_BaseGrouper): object res bint initialized = 0 Slider vslider, islider - object cached_typ = None, cached_ityp = None + object cached_series = None, cached_index = None labels = self.labels counts = np.zeros(self.ngroups, dtype=np.int64) @@ -248,6 +252,10 @@ cdef class SeriesGrouper(_BaseGrouper): result = np.empty(self.ngroups, dtype='O') + cached_index, cached_series = self._init_dummy_series_and_index( + islider, vslider + ) + start = 0 try: for i in range(n): @@ -265,10 +273,10 @@ cdef class SeriesGrouper(_BaseGrouper): islider.move(start, end) vslider.move(start, end) - cached_typ, cached_ityp = self._update_cached_objs( - cached_typ, cached_ityp, islider, vslider) + self._update_cached_objs( + cached_series, cached_index, islider, vslider) - res, initialized = self._apply_to_group(cached_typ, cached_ityp, + res, initialized = self._apply_to_group(cached_series, cached_index, initialized) start += group_size @@ -286,25 +294,23 @@ cdef class SeriesGrouper(_BaseGrouper): # have result initialized by this point. assert initialized, "`result` has not been initialized." - result = maybe_convert_objects(result) - return result, counts -cpdef inline extract_result(object res, bint squeeze=True): +cpdef inline extract_result(object res): """ extract the result object, it might be a 0-dim ndarray or a len-1 0-dim, or a scalar """ if hasattr(res, "_values"): # Preserve EA res = res._values - if squeeze and res.ndim == 1 and len(res) == 1: + if res.ndim == 1 and len(res) == 1: res = res[0] if hasattr(res, 'values') and is_array(res.values): res = res.values if is_array(res): if res.ndim == 0: res = res.item() - elif squeeze and res.ndim == 1 and len(res) == 1: + elif res.ndim == 1 and len(res) == 1: res = res[0] return res diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi index 53752098bafe9..d3aea5b0be796 100644 --- a/pandas/_libs/tslibs/dtypes.pyi +++ b/pandas/_libs/tslibs/dtypes.pyi @@ -7,6 +7,8 @@ _period_code_map: dict[str, int] class PeriodDtypeBase: + _dtype_code: int # PeriodDtypeCode + # actually __cinit__ def __new__(self, code: int): ... 
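A user-facing sketch of what the reworked ``check_result_array`` guards against
(illustrative frame and lambdas, not taken from the tests referenced above):

    df = pd.DataFrame({"g": ["a", "a", "b"], "x": [1, 2, 3]})
    df.groupby("g")["x"].agg(lambda s: s.sum())       # OK: one scalar per group
    df.groupby("g")["x"].agg(lambda s: s.to_numpy())  # ValueError: Must produce aggregated value
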
diff --git a/pandas/_libs/tslibs/nattype.pyi b/pandas/_libs/tslibs/nattype.pyi index fbeeb1f6070fe..233cd5688cb16 100644 --- a/pandas/_libs/tslibs/nattype.pyi +++ b/pandas/_libs/tslibs/nattype.pyi @@ -1,8 +1,8 @@ - from datetime import ( datetime, timedelta, ) +from typing import Any import numpy as np @@ -139,8 +139,8 @@ class NaTType(datetime): @property def qyear(self) -> float: ... - def __eq__(self, other: NaTType | datetime | timedelta | Period) -> bool: ... - def __ne__(self, other: NaTType | datetime | timedelta | Period) -> bool: ... + def __eq__(self, other: Any) -> bool: ... + def __ne__(self, other: Any) -> bool: ... def __lt__(self, other: NaTType | datetime | timedelta | Period) -> bool: ... def __le__(self, other: NaTType | datetime | timedelta | Period) -> bool: ... def __gt__(self, other: NaTType | datetime | timedelta | Period) -> bool: ... diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi new file mode 100644 index 0000000000000..49e630d605310 --- /dev/null +++ b/pandas/_libs/tslibs/period.pyi @@ -0,0 +1,158 @@ +from typing import Literal + +import numpy as np + +from pandas._libs.tslibs.nattype import NaTType +from pandas._libs.tslibs.offsets import BaseOffset +from pandas._libs.tslibs.timestamps import Timestamp +from pandas._typing import ( + Frequency, + Timezone, +) + +INVALID_FREQ_ERR_MSG: str +DIFFERENT_FREQ: str + +class IncompatibleFrequency(ValueError): ... + +def periodarr_to_dt64arr( + periodarr: np.ndarray, # const int64_t[:] + freq: int, +) -> np.ndarray: ... # np.ndarray[np.int64] + +def period_asfreq_arr( + arr: np.ndarray, # ndarray[int64_t] arr, + freq1: int, + freq2: int, + end: bool, +) -> np.ndarray: ... # np.ndarray[np.int64] + +def get_period_field_arr( + field: str, + arr: np.ndarray, # const int64_t[:] + freq: int, +) -> np.ndarray: ... # np.ndarray[np.int64] + +def from_ordinals( + values: np.ndarray, # const int64_t[:] + freq: Frequency, +) -> np.ndarray: ... # np.ndarray[np.int64] + +def extract_ordinals( + values: np.ndarray, # np.ndarray[object] + freq: Frequency | int, +) -> np.ndarray: ... # np.ndarray[np.int64] + +def extract_freq( + values: np.ndarray, # np.ndarray[object] +) -> BaseOffset: ... + +# exposed for tests +def period_asfreq(ordinal: int, freq1: int, freq2: int, end: bool) -> int: ... + +def period_ordinal( + y: int, m: int, d: int, h: int, min: int, s: int, us: int, ps: int, freq: int +) -> int: ... + +def freq_to_dtype_code(freq: BaseOffset) -> int: ... +def validate_end_alias(how: str) -> Literal["E", "S"]: ... + +class Period: + ordinal: int # int64_t + freq: BaseOffset + + # error: "__new__" must return a class instance (got "Union[Period, NaTType]") + def __new__( # type: ignore[misc] + cls, + value=None, + freq=None, + ordinal=None, + year=None, + month=None, + quarter=None, + day=None, + hour=None, + minute=None, + second=None, + ) -> Period | NaTType: ... + + @classmethod + def _maybe_convert_freq(cls, freq) -> BaseOffset: ... + + @classmethod + def _from_ordinal(cls, ordinal: int, freq) -> Period: ... + + @classmethod + def now(cls, freq=...) -> Period: ... + + def strftime(self, fmt: str) -> str: ... + + def to_timestamp( + self, + freq: str | BaseOffset | None =..., + how: str = ..., + tz: Timezone | None = ..., + ) -> Timestamp: ... + + def asfreq(self, freq, how=...) -> Period: ... + + @property + def freqstr(self) -> str: ... + + @property + def is_leap_year(self) -> bool: ... + + @property + def daysinmonth(self) -> int: ... + + @property + def days_in_month(self) -> int: ... 
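+    # e.g. (illustrative): Period("2021-04", freq="M").days_in_month == 30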
+ + @property + def qyear(self) -> int: ... + + @property + def quarter(self) -> int: ... + + @property + def day_of_year(self) -> int: ... + + @property + def weekday(self) -> int: ... + + @property + def day_of_week(self) -> int: ... + + @property + def week(self) -> int: ... + + @property + def weekofyear(self) -> int: ... + + @property + def second(self) -> int: ... + + @property + def minute(self) -> int: ... + + @property + def hour(self) -> int: ... + + @property + def day(self) -> int: ... + + @property + def month(self) -> int: ... + + @property + def year(self) -> int: ... + + @property + def end_time(self) -> Timestamp: ... + + @property + def start_time(self) -> Timestamp: ... + + def __sub__(self, other) -> Period | BaseOffset: ... + + def __add__(self, other) -> Period: ... diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 165f51d06af6d..0bb431bc8e1cd 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1445,7 +1445,7 @@ def from_ordinals(const int64_t[:] values, freq): @cython.wraparound(False) @cython.boundscheck(False) -def extract_ordinals(ndarray[object] values, freq): +def extract_ordinals(ndarray[object] values, freq) -> np.ndarray: # TODO: Change type to const object[:] when Cython supports that. cdef: @@ -1483,7 +1483,7 @@ def extract_ordinals(ndarray[object] values, freq): return ordinals.base # .base to access underlying np.ndarray -def extract_freq(ndarray[object] values): +def extract_freq(ndarray[object] values) -> BaseOffset: # TODO: Change type to const object[:] when Cython supports that. cdef: @@ -2539,7 +2539,7 @@ cdef int64_t _ordinal_from_fields(int year, int month, quarter, int day, minute, second, 0, 0, base) -def validate_end_alias(how): +def validate_end_alias(how: str) -> str: # Literal["E", "S"] how_dict = {'S': 'S', 'E': 'E', 'START': 'S', 'FINISH': 'E', 'BEGIN': 'S', 'END': 'E'} diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py index 391226b622a01..5153118e9b142 100644 --- a/pandas/_testing/_warnings.py +++ b/pandas/_testing/_warnings.py @@ -143,8 +143,8 @@ def _assert_caught_no_extra_warnings( for actual_warning in caught_warnings: if _is_unexpected_warning(actual_warning, expected_warning): unclosed = "unclosed transport tuple[np.ndarray, DtypeObj]: values = extract_array(values, extract_numpy=True) # we check some simple dtypes first - if is_object_dtype(values): + if is_object_dtype(values.dtype): return ensure_object(np.asarray(values)), np.dtype("object") - try: - if is_bool_dtype(values): - # we are actually coercing to uint64 - # until our algos support uint8 directly (see TODO) - return np.asarray(values).astype("uint64"), np.dtype("bool") - elif is_signed_integer_dtype(values): - return ensure_int64(values), values.dtype - elif is_unsigned_integer_dtype(values): - return ensure_uint64(values), values.dtype - elif is_float_dtype(values): + elif is_bool_dtype(values.dtype): + if isinstance(values, np.ndarray): + # i.e. actually dtype == np.dtype("bool") + return np.asarray(values).view("uint8"), values.dtype + else: + # i.e. 
all-bool Categorical, BooleanArray + return np.asarray(values).astype("uint8", copy=False), values.dtype + + elif is_integer_dtype(values.dtype): + return np.asarray(values), values.dtype + + elif is_float_dtype(values.dtype): + # Note: checking `values.dtype == "float128"` raises on Windows and 32bit + # error: Item "ExtensionDtype" of "Union[Any, ExtensionDtype, dtype[Any]]" + # has no attribute "itemsize" + if values.dtype.itemsize in [2, 12, 16]: # type: ignore[union-attr] + # we dont (yet) have float128 hashtable support return ensure_float64(values), values.dtype - elif is_complex_dtype(values): - - # ignore the fact that we are casting to float - # which discards complex parts - with catch_warnings(): - simplefilter("ignore", np.ComplexWarning) - values = ensure_float64(values) - return values, np.dtype("float64") + return np.asarray(values), values.dtype - except (TypeError, ValueError, OverflowError): - # if we are trying to coerce to a dtype - # and it is incompatible this will fall through to here - return ensure_object(values), np.dtype("object") + elif is_complex_dtype(values.dtype): + # ignore the fact that we are casting to float + # which discards complex parts + with catch_warnings(): + simplefilter("ignore", np.ComplexWarning) + values = ensure_float64(values) + return values, np.dtype("float64") # datetimelike - if needs_i8_conversion(values.dtype): - if is_period_dtype(values.dtype): - from pandas import PeriodIndex - - values = PeriodIndex(values)._data - elif is_timedelta64_dtype(values.dtype): - from pandas import TimedeltaIndex - - values = TimedeltaIndex(values)._data - else: - # Datetime - if values.ndim > 1 and is_datetime64_ns_dtype(values.dtype): - # Avoid calling the DatetimeIndex constructor as it is 1D only - # Note: this is reached by DataFrame.rank calls GH#27027 - # TODO(EA2D): special case not needed with 2D EAs - asi8 = values.view("i8") - dtype = values.dtype - # error: Incompatible return value type (got "Tuple[Any, - # Union[dtype, ExtensionDtype, None]]", expected - # "Tuple[ndarray, Union[dtype, ExtensionDtype]]") - return asi8, dtype # type: ignore[return-value] - - from pandas import DatetimeIndex - - values = DatetimeIndex(values)._data - dtype = values.dtype - return values.asi8, dtype + elif needs_i8_conversion(values.dtype): + if isinstance(values, np.ndarray): + values = sanitize_to_nanoseconds(values) + npvalues = values.view("i8") + npvalues = cast(np.ndarray, npvalues) + return npvalues, values.dtype elif is_categorical_dtype(values.dtype): values = cast("Categorical", values) values = values.codes dtype = pandas_dtype("category") - - # we are actually coercing to int64 - # until our algos support int* directly (not all do) - values = ensure_int64(values) return values, dtype # we have failed, return object @@ -268,8 +241,15 @@ def _ensure_arraylike(values) -> ArrayLike: _hashtables = { "float64": htable.Float64HashTable, + "float32": htable.Float32HashTable, "uint64": htable.UInt64HashTable, + "uint32": htable.UInt32HashTable, + "uint16": htable.UInt16HashTable, + "uint8": htable.UInt8HashTable, "int64": htable.Int64HashTable, + "int32": htable.Int32HashTable, + "int16": htable.Int16HashTable, + "int8": htable.Int8HashTable, "string": htable.StringHashTable, "object": htable.PyObjectHashTable, } @@ -298,6 +278,10 @@ def _get_values_for_rank(values: ArrayLike) -> np.ndarray: values = cast("Categorical", values)._values_for_rank() values, _ = _ensure_data(values) + if values.dtype.kind in ["i", "u", "f"]: + # rank_t includes only object, 
int64, uint64, float64 + dtype = values.dtype.kind + "8" + values = values.astype(dtype, copy=False) return values @@ -484,10 +468,9 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: comps = _ensure_arraylike(comps) comps = extract_array(comps, extract_numpy=True) - if is_extension_array_dtype(comps.dtype): - # error: Incompatible return value type (got "Series", expected "ndarray") - # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute "isin" - return comps.isin(values) # type: ignore[return-value,union-attr] + if not isinstance(comps, np.ndarray): + # i.e. Extension Array + return comps.isin(values) elif needs_i8_conversion(comps.dtype): # Dispatch to DatetimeLikeArrayMixin.isin @@ -571,7 +554,7 @@ def factorize_array( Returns ------- - codes : ndarray + codes : ndarray[np.intp] uniques : ndarray """ hash_klass, values = get_data_algo(values) @@ -923,9 +906,9 @@ def value_counts_arraylike(values, dropna: bool): f = getattr(htable, f"value_count_{ndtype}") keys, counts = f(values, dropna) - keys = _reconstruct_data(keys, original.dtype, original) + res_keys = _reconstruct_data(keys, original.dtype, original) - return keys, counts + return res_keys, counts def duplicated(values: ArrayLike, keep: str | bool = "first") -> np.ndarray: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 9a75857c2586d..693b1832ed3c9 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -156,7 +156,7 @@ def agg(self) -> FrameOrSeriesUnion | None: kwargs = self.kwargs if isinstance(arg, str): - return self.maybe_apply_str() + return self.apply_str() if is_dict_like(arg): return self.agg_dict_like() @@ -360,7 +360,10 @@ def agg_list_like(self) -> FrameOrSeriesUnion: # raised directly in _aggregate_named pass elif "no results" in str(err): - # raised directly in _aggregate_multiple_funcs + # reached in test_frame_apply.test_nuiscance_columns + # where the colg.aggregate(arg) ends up going through + # the selected_obj.ndim == 1 branch above with arg == ["sum"] + # on a datetime64[ns] column pass else: raise @@ -456,7 +459,7 @@ def agg_dict_like(self) -> FrameOrSeriesUnion: return result - def maybe_apply_str(self) -> FrameOrSeriesUnion: + def apply_str(self) -> FrameOrSeriesUnion: """ Compute apply in case of a string. @@ -465,8 +468,7 @@ def maybe_apply_str(self) -> FrameOrSeriesUnion: result: Series or DataFrame """ # Caller is responsible for checking isinstance(self.f, str) - f = self.f - f = cast(str, f) + f = cast(str, self.f) obj = self.obj @@ -482,7 +484,7 @@ def maybe_apply_str(self) -> FrameOrSeriesUnion: raise ValueError(f"Operation {f} does not support axis=1") return self._try_aggregate_string_function(obj, f, *self.args, **self.kwargs) - def maybe_apply_multiple(self) -> FrameOrSeriesUnion | None: + def apply_multiple(self) -> FrameOrSeriesUnion: """ Compute apply in case of a list-like or dict-like. @@ -491,9 +493,6 @@ def maybe_apply_multiple(self) -> FrameOrSeriesUnion | None: result: Series, DataFrame, or None Result when self.f is a list-like or dict-like, None otherwise. 
""" - # Note: dict-likes are list-like - if not is_list_like(self.f): - return None return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs) def normalize_dictlike_arg( @@ -634,9 +633,8 @@ def dtypes(self) -> Series: def apply(self) -> FrameOrSeriesUnion: """ compute the results """ # dispatch to agg - result = self.maybe_apply_multiple() - if result is not None: - return result + if is_list_like(self.f): + return self.apply_multiple() # all empty if len(self.columns) == 0 and len(self.index) == 0: @@ -644,7 +642,7 @@ def apply(self) -> FrameOrSeriesUnion: # string dispatch if isinstance(self.f, str): - return self.maybe_apply_str() + return self.apply_str() # ufunc elif isinstance(self.f, np.ufunc): @@ -829,7 +827,7 @@ def wrap_results(self, results: ResType, res_index: Index) -> FrameOrSeriesUnion return result - def maybe_apply_str(self) -> FrameOrSeriesUnion: + def apply_str(self) -> FrameOrSeriesUnion: # Caller is responsible for checking isinstance(self.f, str) # TODO: GH#39993 - Avoid special-casing by replacing with lambda if self.f == "size": @@ -837,7 +835,7 @@ def maybe_apply_str(self) -> FrameOrSeriesUnion: obj = self.obj value = obj.shape[self.axis] return obj._constructor_sliced(value, index=self.agg_axis, name="size") - return super().maybe_apply_str() + return super().apply_str() class FrameRowApply(FrameApply): @@ -1005,13 +1003,12 @@ def apply(self) -> FrameOrSeriesUnion: return self.apply_empty_result() # dispatch to agg - result = self.maybe_apply_multiple() - if result is not None: - return result + if is_list_like(self.f): + return self.apply_multiple() if isinstance(self.f, str): # if we are a string, try to dispatch - return self.maybe_apply_str() + return self.apply_str() return self.apply_standard() diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py index 51e5f36b88c79..6214693f22975 100644 --- a/pandas/core/arrays/_arrow_utils.py +++ b/pandas/core/arrays/_arrow_utils.py @@ -1,4 +1,3 @@ -from distutils.version import LooseVersion import json import numpy as np @@ -6,8 +5,6 @@ from pandas.core.arrays.interval import VALID_CLOSED -_pyarrow_version_ge_015 = LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") - def pyarrow_array_to_numpy_and_mask(arr, dtype): """ @@ -48,97 +45,97 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype): return data, mask -if _pyarrow_version_ge_015: - # the pyarrow extension types are only available for pyarrow 0.15+ - - class ArrowPeriodType(pyarrow.ExtensionType): - def __init__(self, freq): - # attributes need to be set first before calling - # super init (as that calls serialize) - self._freq = freq - pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period") - - @property - def freq(self): - return self._freq - - def __arrow_ext_serialize__(self): - metadata = {"freq": self.freq} - return json.dumps(metadata).encode() - - @classmethod - def __arrow_ext_deserialize__(cls, storage_type, serialized): - metadata = json.loads(serialized.decode()) - return ArrowPeriodType(metadata["freq"]) - - def __eq__(self, other): - if isinstance(other, pyarrow.BaseExtensionType): - return type(self) == type(other) and self.freq == other.freq - else: - return NotImplemented - - def __hash__(self): - return hash((str(self), self.freq)) - - def to_pandas_dtype(self): - import pandas as pd - - return pd.PeriodDtype(freq=self.freq) - - # register the type with a dummy instance - _period_type = ArrowPeriodType("D") - pyarrow.register_extension_type(_period_type) - - class 
ArrowIntervalType(pyarrow.ExtensionType): - def __init__(self, subtype, closed): - # attributes need to be set first before calling - # super init (as that calls serialize) - assert closed in VALID_CLOSED - self._closed = closed - if not isinstance(subtype, pyarrow.DataType): - subtype = pyarrow.type_for_alias(str(subtype)) - self._subtype = subtype - - storage_type = pyarrow.struct([("left", subtype), ("right", subtype)]) - pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval") - - @property - def subtype(self): - return self._subtype - - @property - def closed(self): - return self._closed - - def __arrow_ext_serialize__(self): - metadata = {"subtype": str(self.subtype), "closed": self.closed} - return json.dumps(metadata).encode() - - @classmethod - def __arrow_ext_deserialize__(cls, storage_type, serialized): - metadata = json.loads(serialized.decode()) - subtype = pyarrow.type_for_alias(metadata["subtype"]) - closed = metadata["closed"] - return ArrowIntervalType(subtype, closed) - - def __eq__(self, other): - if isinstance(other, pyarrow.BaseExtensionType): - return ( - type(self) == type(other) - and self.subtype == other.subtype - and self.closed == other.closed - ) - else: - return NotImplemented - - def __hash__(self): - return hash((str(self), str(self.subtype), self.closed)) - - def to_pandas_dtype(self): - import pandas as pd - - return pd.IntervalDtype(self.subtype.to_pandas_dtype(), self.closed) - - # register the type with a dummy instance - _interval_type = ArrowIntervalType(pyarrow.int64(), "left") - pyarrow.register_extension_type(_interval_type) +class ArrowPeriodType(pyarrow.ExtensionType): + def __init__(self, freq): + # attributes need to be set first before calling + # super init (as that calls serialize) + self._freq = freq + pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period") + + @property + def freq(self): + return self._freq + + def __arrow_ext_serialize__(self): + metadata = {"freq": self.freq} + return json.dumps(metadata).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + metadata = json.loads(serialized.decode()) + return ArrowPeriodType(metadata["freq"]) + + def __eq__(self, other): + if isinstance(other, pyarrow.BaseExtensionType): + return type(self) == type(other) and self.freq == other.freq + else: + return NotImplemented + + def __hash__(self): + return hash((str(self), self.freq)) + + def to_pandas_dtype(self): + import pandas as pd + + return pd.PeriodDtype(freq=self.freq) + + +# register the type with a dummy instance +_period_type = ArrowPeriodType("D") +pyarrow.register_extension_type(_period_type) + + +class ArrowIntervalType(pyarrow.ExtensionType): + def __init__(self, subtype, closed): + # attributes need to be set first before calling + # super init (as that calls serialize) + assert closed in VALID_CLOSED + self._closed = closed + if not isinstance(subtype, pyarrow.DataType): + subtype = pyarrow.type_for_alias(str(subtype)) + self._subtype = subtype + + storage_type = pyarrow.struct([("left", subtype), ("right", subtype)]) + pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval") + + @property + def subtype(self): + return self._subtype + + @property + def closed(self): + return self._closed + + def __arrow_ext_serialize__(self): + metadata = {"subtype": str(self.subtype), "closed": self.closed} + return json.dumps(metadata).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + metadata = 
json.loads(serialized.decode()) + subtype = pyarrow.type_for_alias(metadata["subtype"]) + closed = metadata["closed"] + return ArrowIntervalType(subtype, closed) + + def __eq__(self, other): + if isinstance(other, pyarrow.BaseExtensionType): + return ( + type(self) == type(other) + and self.subtype == other.subtype + and self.closed == other.closed + ) + else: + return NotImplemented + + def __hash__(self): + return hash((str(self), str(self.subtype), self.closed)) + + def to_pandas_dtype(self): + import pandas as pd + + return pd.IntervalDtype(self.subtype.to_pandas_dtype(), self.closed) + + +# register the type with a dummy instance +_interval_type = ArrowIntervalType(pyarrow.int64(), "left") +pyarrow.register_extension_type(_interval_type) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index e97687de34273..7c76a04a605e3 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -11,18 +11,15 @@ import numpy as np from pandas._libs import lib +from pandas._libs.arrays import NDArrayBacked from pandas._typing import ( F, PositionalIndexer2D, Shape, type_t, ) -from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError -from pandas.util._decorators import ( - cache_readonly, - doc, -) +from pandas.util._decorators import doc from pandas.util._validators import ( validate_bool_kwarg, validate_fillna_kwargs, @@ -69,24 +66,13 @@ def method(self, *args, **kwargs): return cast(F, method) -class NDArrayBackedExtensionArray(ExtensionArray): +class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray): """ ExtensionArray that is backed by a single NumPy ndarray. """ _ndarray: np.ndarray - def _from_backing_data( - self: NDArrayBackedExtensionArrayT, arr: np.ndarray - ) -> NDArrayBackedExtensionArrayT: - """ - Construct a new ExtensionArray `new_array` with `arr` as its _ndarray. - - This should round-trip: - self == self._from_backing_data(self._ndarray) - """ - raise AbstractMethodError(self) - def _box_func(self, x): """ Wrap numpy type in our dtype.type if necessary. 
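The _from_backing_data/shape/copy plumbing removed here is now supplied by the NDArrayBacked base class; a minimal pure-Python sketch of that pattern is shown below. The class and method names are illustrative stand-ins, not the real cython implementation.

    import numpy as np

    class NDArrayBackedSketch:
        """Illustrative stand-in for an ndarray-backed extension array base."""

        def __init__(self, values: np.ndarray, dtype) -> None:
            # the single place where the backing ndarray and dtype are attached
            self._ndarray = values
            self._dtype = dtype

        def _from_backing_data(self, arr: np.ndarray) -> "NDArrayBackedSketch":
            # round-trips: obj._from_backing_data(obj._ndarray) is equivalent to obj
            return type(self)(arr, self._dtype)

        @property
        def shape(self):
            return self._ndarray.shape

        def copy(self) -> "NDArrayBackedSketch":
            return self._from_backing_data(self._ndarray.copy())

    sketch = NDArrayBackedSketch(np.arange(3), np.dtype("int64"))
    sketch.copy().shape  # (3,)

Once subclasses hand the backing array and dtype to one base __init__, generic delegating methods like these no longer need to be re-implemented per array type, which is why the blocks below can simply be deleted.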
@@ -142,46 +128,6 @@ def _validate_fill_value(self, fill_value): # ------------------------------------------------------------------------ - # TODO: make this a cache_readonly; for that to work we need to remove - # the _index_data kludge in libreduction - @property - def shape(self) -> Shape: - return self._ndarray.shape - - def __len__(self) -> int: - return self.shape[0] - - @cache_readonly - def ndim(self) -> int: - return len(self.shape) - - @cache_readonly - def size(self) -> int: - return self._ndarray.size - - @cache_readonly - def nbytes(self) -> int: - return self._ndarray.nbytes - - def reshape( - self: NDArrayBackedExtensionArrayT, *args, **kwargs - ) -> NDArrayBackedExtensionArrayT: - new_data = self._ndarray.reshape(*args, **kwargs) - return self._from_backing_data(new_data) - - def ravel( - self: NDArrayBackedExtensionArrayT, *args, **kwargs - ) -> NDArrayBackedExtensionArrayT: - new_data = self._ndarray.ravel(*args, **kwargs) - return self._from_backing_data(new_data) - - @property - def T(self: NDArrayBackedExtensionArrayT) -> NDArrayBackedExtensionArrayT: - new_data = self._ndarray.T - return self._from_backing_data(new_data) - - # ------------------------------------------------------------------------ - def equals(self, other) -> bool: if type(self) is not type(other): return False @@ -208,24 +154,6 @@ def argmax(self, axis: int = 0, skipna: bool = True): # type:ignore[override] raise NotImplementedError return nargminmax(self, "argmax", axis=axis) - def copy(self: NDArrayBackedExtensionArrayT) -> NDArrayBackedExtensionArrayT: - new_data = self._ndarray.copy() - return self._from_backing_data(new_data) - - def repeat( - self: NDArrayBackedExtensionArrayT, repeats, axis=None - ) -> NDArrayBackedExtensionArrayT: - """ - Repeat elements of an array. - - See Also - -------- - numpy.ndarray.repeat - """ - nv.validate_repeat((), {"axis": axis}) - new_data = self._ndarray.repeat(repeats, axis=axis) - return self._from_backing_data(new_data) - def unique(self: NDArrayBackedExtensionArrayT) -> NDArrayBackedExtensionArrayT: new_data = unique(self._ndarray) return self._from_backing_data(new_data) @@ -418,18 +346,6 @@ def where( res_values = np.where(mask, self._ndarray, value) return self._from_backing_data(res_values) - def delete( - self: NDArrayBackedExtensionArrayT, loc, axis: int = 0 - ) -> NDArrayBackedExtensionArrayT: - res_values = np.delete(self._ndarray, loc, axis=axis) - return self._from_backing_data(res_values) - - def swapaxes( - self: NDArrayBackedExtensionArrayT, axis1, axis2 - ) -> NDArrayBackedExtensionArrayT: - res_values = self._ndarray.swapaxes(axis1, axis2) - return self._from_backing_data(res_values) - # ------------------------------------------------------------------------ # Additional array methods # These are not part of the EA API, but we implement them because diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 5a2643dd531ed..2cb30c53b6832 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -530,7 +530,6 @@ def astype(self, dtype, copy=True): NumPy ndarray with 'dtype' for its dtype. """ from pandas.core.arrays.string_ import StringDtype - from pandas.core.arrays.string_arrow import ArrowStringDtype dtype = pandas_dtype(dtype) if is_dtype_equal(dtype, self.dtype): @@ -540,9 +539,8 @@ def astype(self, dtype, copy=True): return self.copy() # FIXME: Really hard-code here? 
- if isinstance( - dtype, (ArrowStringDtype, StringDtype) - ): # allow conversion to StringArrays + if isinstance(dtype, StringDtype): + # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) return np.array(self, dtype=dtype, copy=copy) @@ -794,7 +792,7 @@ def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: b = empty return self._concat_same_type([a, b]) - def unique(self): + def unique(self: ExtensionArrayT) -> ExtensionArrayT: """ Compute the ExtensionArray of unique values. @@ -1023,7 +1021,7 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: @Substitution(klass="ExtensionArray") @Appender(_extension_array_shared_docs["repeat"]) - def repeat(self, repeats, axis=None): + def repeat(self, repeats: int | Sequence[int], axis: int | None = None): nv.validate_repeat((), {"axis": axis}) ind = np.arange(len(self)).repeat(repeats) return self.take(ind) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6f3643e80a0fa..a82c75f4b2557 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -27,6 +27,7 @@ algos as libalgos, hashtable as htable, ) +from pandas._libs.arrays import NDArrayBacked from pandas._libs.lib import no_default from pandas._typing import ( ArrayLike, @@ -349,12 +350,13 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi # For comparisons, so that numpy uses our implementation if the compare # ops, which raise __array_priority__ = 1000 - _dtype = CategoricalDtype(ordered=False) # tolist is not actually deprecated, just suppressed in the __dir__ _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"]) _typ = "categorical" _can_hold_na = True + _dtype: CategoricalDtype + def __init__( self, values, @@ -373,8 +375,9 @@ def __init__( # infer categories in a factorization step further below if fastpath: - self._ndarray = coerce_indexer_dtype(values, dtype.categories) - self._dtype = self._dtype.update_dtype(dtype) + codes = coerce_indexer_dtype(values, dtype.categories) + dtype = CategoricalDtype(ordered=False).update_dtype(dtype) + super().__init__(codes, dtype) return if not is_list_like(values): @@ -463,8 +466,11 @@ def __init__( full_codes[~null_mask] = codes codes = full_codes - self._dtype = self._dtype.update_dtype(dtype) - self._ndarray = coerce_indexer_dtype(codes, dtype.categories) + dtype = CategoricalDtype(ordered=False).update_dtype(dtype) + arr = coerce_indexer_dtype(codes, dtype.categories) + # error: Argument 1 to "__init__" of "NDArrayBacked" has incompatible + # type "Union[ExtensionArray, ndarray]"; expected "ndarray" + super().__init__(arr, dtype) # type: ignore[arg-type] @property def dtype(self) -> CategoricalDtype: @@ -513,9 +519,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: raise ValueError("Cannot convert float NaN to integer") elif len(self.codes) == 0 or len(self.categories) == 0: - # error: Incompatible types in assignment (expression has type "ndarray", - # variable has type "Categorical") - result = np.array( # type: ignore[assignment] + result = np.array( self, dtype=dtype, copy=copy, @@ -533,11 +537,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}" raise ValueError(msg) - # error: Incompatible types in assignment (expression has type "ndarray", - # variable has type "Categorical") - result = take_nd( # type: ignore[assignment] - 
new_cats, ensure_platform_int(self._codes) - ) + result = take_nd(new_cats, ensure_platform_int(self._codes)) return result @@ -745,7 +745,7 @@ def categories(self, categories): "new categories need to have the same number of " "items as the old categories!" ) - self._dtype = new_dtype + super().__init__(self._ndarray, new_dtype) @property def ordered(self) -> Ordered: @@ -809,7 +809,7 @@ def _set_categories(self, categories, fastpath=False): "items than the old categories!" ) - self._dtype = new_dtype + super().__init__(self._ndarray, new_dtype) def _set_dtype(self, dtype: CategoricalDtype) -> Categorical: """ @@ -842,7 +842,7 @@ def set_ordered(self, value, inplace=False): inplace = validate_bool_kwarg(inplace, "inplace") new_dtype = CategoricalDtype(self.categories, ordered=value) cat = self if inplace else self.copy() - cat._dtype = new_dtype + NDArrayBacked.__init__(cat, cat._ndarray, new_dtype) if not inplace: return cat @@ -882,7 +882,9 @@ def as_unordered(self, inplace=False): inplace = validate_bool_kwarg(inplace, "inplace") return self.set_ordered(False, inplace=inplace) - def set_categories(self, new_categories, ordered=None, rename=False, inplace=False): + def set_categories( + self, new_categories, ordered=None, rename=False, inplace=no_default + ): """ Set the categories to the specified new_categories. @@ -916,6 +918,8 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal Whether or not to reorder the categories in-place or return a copy of this categorical with reordered categories. + .. deprecated:: 1.3.0 + Returns ------- Categorical with reordered categories or None if inplace. @@ -933,6 +937,18 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal remove_categories : Remove the specified categories. remove_unused_categories : Remove categories which are not used. """ + if inplace is not no_default: + warn( + "The `inplace` parameter in pandas.Categorical." + "set_categories is deprecated and will be removed in " + "a future version. Removing unused categories will always " + "return a new Categorical object.", + FutureWarning, + stacklevel=2, + ) + else: + inplace = False + inplace = validate_bool_kwarg(inplace, "inplace") if ordered is None: ordered = self.dtype.ordered @@ -945,17 +961,17 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal ): # remove all _codes which are larger and set to -1/NaN cat._codes[cat._codes >= len(new_dtype.categories)] = -1 + codes = cat._codes else: codes = recode_for_categories( cat.codes, cat.categories, new_dtype.categories ) - cat._ndarray = codes - cat._dtype = new_dtype + NDArrayBacked.__init__(cat, codes, new_dtype) if not inplace: return cat - def rename_categories(self, new_categories, inplace=False): + def rename_categories(self, new_categories, inplace=no_default): """ Rename categories. @@ -980,6 +996,8 @@ def rename_categories(self, new_categories, inplace=False): Whether or not to rename the categories inplace or return a copy of this categorical with renamed categories. + .. deprecated:: 1.3.0 + Returns ------- cat : Categorical or None @@ -1019,6 +1037,18 @@ def rename_categories(self, new_categories, inplace=False): ['A', 'A', 'B'] Categories (2, object): ['A', 'B'] """ + if inplace is not no_default: + warn( + "The `inplace` parameter in pandas.Categorical." + "rename_categories is deprecated and will be removed in " + "a future version. 
Removing unused categories will always " + "return a new Categorical object.", + FutureWarning, + stacklevel=2, + ) + else: + inplace = False + inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() @@ -1087,7 +1117,10 @@ def reorder_categories(self, new_categories, ordered=None, inplace=no_default): raise ValueError( "items in new_categories are not the same as in old categories" ) - return self.set_categories(new_categories, ordered=ordered, inplace=inplace) + + with catch_warnings(): + simplefilter("ignore") + return self.set_categories(new_categories, ordered=ordered, inplace=inplace) def add_categories(self, new_categories, inplace=no_default): """ @@ -1149,8 +1182,8 @@ def add_categories(self, new_categories, inplace=no_default): new_dtype = CategoricalDtype(new_categories, self.ordered) cat = self if inplace else self.copy() - cat._dtype = new_dtype - cat._ndarray = coerce_indexer_dtype(cat._ndarray, new_dtype.categories) + codes = coerce_indexer_dtype(cat._ndarray, new_dtype.categories) + NDArrayBacked.__init__(cat, codes, new_dtype) if not inplace: return cat @@ -1217,9 +1250,11 @@ def remove_categories(self, removals, inplace=no_default): if len(not_included) != 0: raise ValueError(f"removals must all be in old categories: {not_included}") - return self.set_categories( - new_categories, ordered=self.ordered, rename=False, inplace=inplace - ) + with catch_warnings(): + simplefilter("ignore") + return self.set_categories( + new_categories, ordered=self.ordered, rename=False, inplace=inplace + ) def remove_unused_categories(self, inplace=no_default): """ @@ -1268,9 +1303,8 @@ def remove_unused_categories(self, inplace=no_default): new_dtype = CategoricalDtype._from_fastpath( new_categories, ordered=self.ordered ) - cat._dtype = new_dtype - cat._ndarray = coerce_indexer_dtype(inv, new_dtype.categories) - + new_codes = coerce_indexer_dtype(inv, new_dtype.categories) + NDArrayBacked.__init__(cat, new_codes, new_dtype) if not inplace: return cat @@ -1449,7 +1483,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): def __setstate__(self, state): """Necessary for making this object picklable""" if not isinstance(state, dict): - raise Exception("invalid pickle state") + return super().__setstate__(state) if "_dtype" not in state: state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"]) @@ -1458,8 +1492,7 @@ def __setstate__(self, state): # backward compat, changed what is property vs attribute state["_ndarray"] = state.pop("_codes") - for k, v in state.items(): - setattr(self, k, v) + super().__setstate__(state) @property def nbytes(self) -> int: @@ -1828,16 +1861,7 @@ def _codes(self) -> np.ndarray: @_codes.setter def _codes(self, value: np.ndarray): - self._ndarray = value - - def _from_backing_data(self, arr: np.ndarray) -> Categorical: - assert isinstance(arr, np.ndarray) - assert arr.dtype == self._ndarray.dtype - - res = object.__new__(type(self)) - res._ndarray = arr - res._dtype = self.dtype - return res + NDArrayBacked.__init__(self, value, self.dtype) def _box_func(self, i: int): if i == -1: @@ -2417,7 +2441,9 @@ def replace(self, to_replace, value, inplace: bool = False): cat.remove_categories(replace_value, inplace=True) else: categories[index] = new_value - cat.rename_categories(categories, inplace=True) + with catch_warnings(): + simplefilter("ignore") + cat.rename_categories(categories, inplace=True) if not inplace: return cat diff --git a/pandas/core/arrays/datetimelike.py 
b/pandas/core/arrays/datetimelike.py index 93df88aba2cba..286fd8bf8ba4a 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -23,7 +23,6 @@ algos, lib, ) -from pandas._libs.arrays import NDArrayBacked from pandas._libs.tslibs import ( BaseOffset, IncompatibleFrequency, @@ -142,7 +141,7 @@ class InvalidComparison(Exception): pass -class DatetimeLikeArrayMixin(OpsMixin, NDArrayBacked, NDArrayBackedExtensionArray): +class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): """ Shared Base/Mixin class for DatetimeArray, TimedeltaArray, PeriodArray @@ -584,6 +583,8 @@ def _validate_shift_value(self, fill_value): elif isinstance(fill_value, self._recognized_scalars): fill_value = self._scalar_type(fill_value) else: + new_fill: DatetimeLikeScalar + # only warn if we're not going to raise if self._scalar_type is Period and lib.is_integer(fill_value): # kludge for #31971 since Period(integer) tries to cast to str @@ -1108,7 +1109,10 @@ def _add_timedeltalike_scalar(self, other): # adding a scalar preserves freq new_freq = self.freq - return type(self)._simple_new(new_values, dtype=self.dtype, freq=new_freq) + # error: Unexpected keyword argument "freq" for "_simple_new" of "NDArrayBacked" + return type(self)._simple_new( # type: ignore[call-arg] + new_values, dtype=self.dtype, freq=new_freq + ) def _add_timedelta_arraylike(self, other): """ @@ -1682,13 +1686,16 @@ def strftime(self, date_format: str) -> np.ndarray: """ +TimelikeOpsT = TypeVar("TimelikeOpsT", bound="TimelikeOps") + + class TimelikeOps(DatetimeLikeArrayMixin): """ Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex. """ - def copy(self: TimelikeOps) -> TimelikeOps: - result = NDArrayBacked.copy(self) + def copy(self: TimelikeOpsT) -> TimelikeOpsT: + result = super().copy() result._freq = self._freq return result diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 117b267fd49e5..f07a04b8087e0 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -320,8 +320,9 @@ def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy: bool = False): if inferred_freq is None and freq is not None: type(self)._validate_frequency(self, freq) + # error: Signature of "_simple_new" incompatible with supertype "NDArrayBacked" @classmethod - def _simple_new( + def _simple_new( # type: ignore[override] cls, values: np.ndarray, freq: BaseOffset | None = None, dtype=DT64NS_DTYPE ) -> DatetimeArray: assert isinstance(values, np.ndarray) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 377010cedce18..ad08005fc0f95 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1551,7 +1551,11 @@ def delete(self: IntervalArrayT, loc) -> IntervalArrayT: return self._shallow_copy(left=new_left, right=new_right) @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs) - def repeat(self: IntervalArrayT, repeats: int, axis=None) -> IntervalArrayT: + def repeat( + self: IntervalArrayT, + repeats: int | Sequence[int], + axis: int | None = None, + ) -> IntervalArrayT: nv.validate_repeat((), {"axis": axis}) left_repeat = self.left.repeat(repeats) right_repeat = self.right.repeat(repeats) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 52900d9b62dc2..e9d554200805e 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -64,6 +64,7 @@ class PandasArray( _typ = "npy_extension" __array_priority__ = 1000 _ndarray: np.ndarray 
+ _dtype: PandasDtype # ------------------------------------------------------------------------ # Constructors @@ -83,8 +84,8 @@ def __init__(self, values: np.ndarray | PandasArray, copy: bool = False): if copy: values = values.copy() - self._ndarray = values - self._dtype = PandasDtype(values.dtype) + dtype = PandasDtype(values.dtype) + super().__init__(values, dtype) @classmethod def _from_sequence( diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index a9c94b615f49c..101209be30b40 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -181,6 +181,8 @@ class PeriodArray(dtl.DatelikeOps): _datetimelike_ops: list[str] = _field_ops + _object_ops + _bool_ops _datetimelike_methods: list[str] = ["strftime", "to_timestamp", "asfreq"] + _dtype: PeriodDtype + # -------------------------------------------------------------------- # Constructors @@ -210,8 +212,9 @@ def __init__( raise ValueError("freq is not specified and cannot be inferred") NDArrayBacked.__init__(self, values, PeriodDtype(freq)) + # error: Signature of "_simple_new" incompatible with supertype "NDArrayBacked" @classmethod - def _simple_new( + def _simple_new( # type: ignore[override] cls, values: np.ndarray, freq: BaseOffset | None = None, @@ -295,9 +298,17 @@ def _generate_range(cls, start, end, periods, freq, fields): # ----------------------------------------------------------------- # DatetimeLike Interface - def _unbox_scalar(self, value: Period | NaTType, setitem: bool = False) -> np.int64: + # error: Argument 1 of "_unbox_scalar" is incompatible with supertype + # "DatetimeLikeArrayMixin"; supertype defines the argument type as + # "Union[Union[Period, Any, Timedelta], NaTType]" + def _unbox_scalar( # type: ignore[override] + self, + value: Period | NaTType, + setitem: bool = False, + ) -> np.int64: if value is NaT: - return np.int64(value.value) + # error: Item "Period" of "Union[Period, NaTType]" has no attribute "value" + return np.int64(value.value) # type: ignore[union-attr] elif isinstance(value, self._scalar_type): self._check_compatible_with(value, setitem=setitem) return np.int64(value.ordinal) @@ -482,9 +493,9 @@ def to_timestamp(self, freq=None, how: str = "start") -> DatetimeArray: freq = Period._maybe_convert_freq(freq) base = freq._period_dtype_code - new_data = self.asfreq(freq, how=how) + new_parr = self.asfreq(freq, how=how) - new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base) + new_data = libperiod.periodarr_to_dt64arr(new_parr.asi8, base) return DatetimeArray(new_data)._with_freq("infer") # -------------------------------------------------------------------- @@ -910,7 +921,7 @@ def raise_on_incompatible(left, right): def period_array( - data: Sequence[Period | None] | AnyArrayLike, + data: Sequence[Period | str | None] | AnyArrayLike, freq: str | Tick | None = None, copy: bool = False, ) -> PeriodArray: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 091efa68c67da..4847372f18239 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -550,7 +550,7 @@ def _from_factorized(cls, values, original): # Data # ------------------------------------------------------------------------ @property - def sp_index(self): + def sp_index(self) -> SparseIndex: """ The SparseIndex containing the location of non- ``fill_value`` points. 
""" @@ -570,7 +570,7 @@ def sp_values(self) -> np.ndarray: return self._sparse_values @property - def dtype(self): + def dtype(self) -> SparseDtype: return self._dtype @property @@ -597,7 +597,7 @@ def kind(self) -> str: return "block" @property - def _valid_sp_values(self): + def _valid_sp_values(self) -> np.ndarray: sp_vals = self.sp_values mask = notna(sp_vals) return sp_vals[mask] @@ -620,7 +620,7 @@ def nbytes(self) -> int: return self.sp_values.nbytes + self.sp_index.nbytes @property - def density(self): + def density(self) -> float: """ The percent of non- ``fill_value`` points, as decimal. diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 8608d5894c155..d5675a1027faf 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -8,6 +8,7 @@ lib, missing as libmissing, ) +from pandas._libs.arrays import NDArrayBacked from pandas._typing import ( Dtype, Scalar, @@ -207,7 +208,7 @@ def __init__(self, values, copy=False): values = extract_array(values) super().__init__(values, copy=copy) - self._dtype = StringDtype() + NDArrayBacked.__init__(self, self._ndarray, StringDtype()) if not isinstance(values, type(self)): self._validate() @@ -243,9 +244,8 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? - new_string_array = object.__new__(cls) - new_string_array._dtype = StringDtype() - new_string_array._ndarray = result + new_string_array = cls.__new__(cls) + NDArrayBacked.__init__(new_string_array, result, StringDtype()) return new_string_array @@ -323,9 +323,7 @@ def astype(self, dtype, copy=True): values = arr.astype(dtype.numpy_dtype) return IntegerArray(values, mask, copy=False) elif isinstance(dtype, FloatingDtype): - # error: Incompatible types in assignment (expression has type - # "StringArray", variable has type "ndarray") - arr = self.copy() # type: ignore[assignment] + arr = self.copy() mask = self.isna() arr[mask] = "0" values = arr.astype(dtype.numpy_dtype) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 3923eae0c8bf4..de987b8d34f08 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -19,12 +19,12 @@ Dtype, NpDtype, PositionalIndexer, + Scalar, type_t, ) from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs -from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, @@ -41,6 +41,8 @@ from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.integer import Int64Dtype +from pandas.core.arrays.string_ import StringDtype from pandas.core.indexers import ( check_array_indexer, validate_indices, @@ -73,7 +75,7 @@ @register_extension_dtype -class ArrowStringDtype(ExtensionDtype): +class ArrowStringDtype(StringDtype): """ Extension dtype for string data in a ``pyarrow.ChunkedArray``. @@ -109,7 +111,7 @@ def type(self) -> type[str]: return str @classmethod - def construct_array_type(cls) -> type_t[ArrowStringArray]: + def construct_array_type(cls) -> type_t[ArrowStringArray]: # type: ignore[override] """ Return the array type associated with this dtype. 
@@ -125,7 +127,9 @@ def __hash__(self) -> int: def __repr__(self) -> str: return "ArrowStringDtype" - def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> ArrowStringArray: + def __from_arrow__( # type: ignore[override] + self, array: pa.Array | pa.ChunkedArray + ) -> ArrowStringArray: """ Construct StringArray from pyarrow Array/ChunkedArray. """ @@ -229,10 +233,21 @@ def _chk_pyarrow_available(cls) -> None: @classmethod def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False): + from pandas.core.arrays.masked import BaseMaskedArray + cls._chk_pyarrow_available() - # convert non-na-likes to str, and nan-likes to ArrowStringDtype.na_value - scalars = lib.ensure_string_array(scalars, copy=False) - return cls(pa.array(scalars, type=pa.string(), from_pandas=True)) + + if isinstance(scalars, BaseMaskedArray): + # avoid costly conversion to object dtype in ensure_string_array and + # numerical issues with Float32Dtype + na_values = scalars._mask + result = scalars._data + result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + return cls(pa.array(result, mask=na_values, type=pa.string())) + + # convert non-na-likes to str + result = lib.ensure_string_array(scalars, copy=copy) + return cls(pa.array(result, type=pa.string(), from_pandas=True)) @classmethod def _from_sequence_of_strings( @@ -651,6 +666,34 @@ def take( indices_array[indices_array < 0] += len(self._data) return type(self)(self._data.take(indices_array)) + def isin(self, values): + + # pyarrow.compute.is_in added in pyarrow 2.0.0 + if not hasattr(pc, "is_in"): + return super().isin(values) + + value_set = [ + pa_scalar.as_py() + for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values] + if pa_scalar.type in (pa.string(), pa.null()) + ] + + # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True + # for null values, so we short-circuit to return all False array. + if not len(value_set): + return np.zeros(len(self), dtype=bool) + + kwargs = {} + if LooseVersion(pa.__version__) < "3.0.0": + # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises + # with unexpected keyword argument in pyarrow 3.0.0+ + kwargs["skip_null"] = True + + result = pc.is_in(self._data, value_set=pa.array(value_set), **kwargs) + # pyarrow 2.0.0 returned nulls, so we explicily specify dtype to convert nulls + # to False + return np.array(result, dtype=np.bool_) + def value_counts(self, dropna: bool = True) -> Series: """ Return a Series containing counts of each unique value. @@ -754,17 +797,28 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None): # -> We don't know the result type. E.g. `.get` can return anything. 
return lib.map_infer_mask(arr, f, mask.view("uint8")) - def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True): - if not regex and case: - result = pc.match_substring(self._data, pat) - result = BooleanDtype().__from_arrow__(result) - if not isna(na): - result[isna(result)] = bool(na) - return result - else: + def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True): + if flags: return super()._str_contains(pat, case, flags, na, regex) + if regex: + # match_substring_regex added in pyarrow 4.0.0 + if hasattr(pc, "match_substring_regex") and case: + result = pc.match_substring_regex(self._data, pat) + else: + return super()._str_contains(pat, case, flags, na, regex) + else: + if case: + result = pc.match_substring(self._data, pat) + else: + result = pc.match_substring(pc.utf8_upper(self._data), pat.upper()) + result = BooleanDtype().__from_arrow__(result) + if not isna(na): + result[isna(result)] = bool(na) + return result + def _str_startswith(self, pat, na=None): + # match_substring_regex added in pyarrow 4.0.0 if hasattr(pc, "match_substring_regex"): result = pc.match_substring_regex(self._data, "^" + re.escape(pat)) result = BooleanDtype().__from_arrow__(result) @@ -775,6 +829,7 @@ def _str_startswith(self, pat, na=None): return super()._str_startswith(pat, na) def _str_endswith(self, pat, na=None): + # match_substring_regex added in pyarrow 4.0.0 if hasattr(pc, "match_substring_regex"): result = pc.match_substring_regex(self._data, re.escape(pat) + "$") result = BooleanDtype().__from_arrow__(result) @@ -784,49 +839,39 @@ def _str_endswith(self, pat, na=None): else: return super()._str_endswith(pat, na) + def _str_match( + self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None + ): + if not pat.startswith("^"): + pat = "^" + pat + return self._str_contains(pat, case, flags, na, regex=True) + def _str_isalnum(self): - if hasattr(pc, "utf8_is_alnum"): - result = pc.utf8_is_alnum(self._data) - return BooleanDtype().__from_arrow__(result) - else: - return super()._str_isalnum() + result = pc.utf8_is_alnum(self._data) + return BooleanDtype().__from_arrow__(result) def _str_isalpha(self): - if hasattr(pc, "utf8_is_alpha"): - result = pc.utf8_is_alpha(self._data) - return BooleanDtype().__from_arrow__(result) - else: - return super()._str_isalpha() + result = pc.utf8_is_alpha(self._data) + return BooleanDtype().__from_arrow__(result) def _str_isdecimal(self): - if hasattr(pc, "utf8_is_decimal"): - result = pc.utf8_is_decimal(self._data) - return BooleanDtype().__from_arrow__(result) - else: - return super()._str_isdecimal() + result = pc.utf8_is_decimal(self._data) + return BooleanDtype().__from_arrow__(result) def _str_isdigit(self): - if hasattr(pc, "utf8_is_digit"): - result = pc.utf8_is_digit(self._data) - return BooleanDtype().__from_arrow__(result) - else: - return super()._str_isdigit() + result = pc.utf8_is_digit(self._data) + return BooleanDtype().__from_arrow__(result) def _str_islower(self): - if hasattr(pc, "utf8_is_lower"): - result = pc.utf8_is_lower(self._data) - return BooleanDtype().__from_arrow__(result) - else: - return super()._str_islower() + result = pc.utf8_is_lower(self._data) + return BooleanDtype().__from_arrow__(result) def _str_isnumeric(self): - if hasattr(pc, "utf8_is_numeric"): - result = pc.utf8_is_numeric(self._data) - return BooleanDtype().__from_arrow__(result) - else: - return super()._str_isnumeric() + result = pc.utf8_is_numeric(self._data) + return BooleanDtype().__from_arrow__(result) def 
_str_isspace(self): + # utf8_is_space added in pyarrow 2.0.0 if hasattr(pc, "utf8_is_space"): result = pc.utf8_is_space(self._data) return BooleanDtype().__from_arrow__(result) @@ -834,18 +879,20 @@ def _str_isspace(self): return super()._str_isspace() def _str_istitle(self): - if hasattr(pc, "utf8_is_title"): - result = pc.utf8_is_title(self._data) - return BooleanDtype().__from_arrow__(result) - else: - return super()._str_istitle() + result = pc.utf8_is_title(self._data) + return BooleanDtype().__from_arrow__(result) def _str_isupper(self): - if hasattr(pc, "utf8_is_upper"): - result = pc.utf8_is_upper(self._data) - return BooleanDtype().__from_arrow__(result) + result = pc.utf8_is_upper(self._data) + return BooleanDtype().__from_arrow__(result) + + def _str_len(self): + # utf8_length added in pyarrow 4.0.0 + if hasattr(pc, "utf8_length"): + result = pc.utf8_length(self._data) + return Int64Dtype().__from_arrow__(result) else: - return super()._str_isupper() + return super()._str_len() def _str_lower(self): return type(self)(pc.utf8_lower(self._data)) @@ -855,27 +902,33 @@ def _str_upper(self): def _str_strip(self, to_strip=None): if to_strip is None: + # utf8_trim_whitespace added in pyarrow 4.0.0 if hasattr(pc, "utf8_trim_whitespace"): return type(self)(pc.utf8_trim_whitespace(self._data)) else: + # utf8_trim added in pyarrow 4.0.0 if hasattr(pc, "utf8_trim"): return type(self)(pc.utf8_trim(self._data, characters=to_strip)) return super()._str_strip(to_strip) def _str_lstrip(self, to_strip=None): if to_strip is None: + # utf8_ltrim_whitespace added in pyarrow 4.0.0 if hasattr(pc, "utf8_ltrim_whitespace"): return type(self)(pc.utf8_ltrim_whitespace(self._data)) else: + # utf8_ltrim added in pyarrow 4.0.0 if hasattr(pc, "utf8_ltrim"): return type(self)(pc.utf8_ltrim(self._data, characters=to_strip)) return super()._str_lstrip(to_strip) def _str_rstrip(self, to_strip=None): if to_strip is None: + # utf8_rtrim_whitespace added in pyarrow 4.0.0 if hasattr(pc, "utf8_rtrim_whitespace"): return type(self)(pc.utf8_rtrim_whitespace(self._data)) else: + # utf8_rtrim added in pyarrow 4.0.0 if hasattr(pc, "utf8_rtrim"): return type(self)(pc.utf8_rtrim(self._data, characters=to_strip)) return super()._str_rstrip(to_strip) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 087ce415cc4ba..ea87ac64cfe22 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -234,8 +234,9 @@ def __init__( if inferred_freq is None and freq is not None: type(self)._validate_frequency(self, freq) + # error: Signature of "_simple_new" incompatible with supertype "NDArrayBacked" @classmethod - def _simple_new( + def _simple_new( # type: ignore[override] cls, values: np.ndarray, freq: BaseOffset | None = None, dtype=TD64NS_DTYPE ) -> TimedeltaArray: assert dtype == TD64NS_DTYPE, dtype diff --git a/pandas/core/base.py b/pandas/core/base.py index 5747cb68fde33..105b74d91e79e 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -16,6 +16,7 @@ import pandas._libs.lib as lib from pandas._typing import ( + ArrayLike, Dtype, DtypeObj, IndexLabel, @@ -996,7 +997,7 @@ def unique(self): values = self._values if not isinstance(values, np.ndarray): - result = values.unique() + result: ArrayLike = values.unique() if self.dtype.kind in ["m", "M"] and isinstance(self, ABCSeries): # GH#31182 Series._values returns EA, unpack for backward-compat if getattr(self.dtype, "tz", None) is None: @@ -1040,8 +1041,10 @@ def nunique(self, dropna: bool = True) -> int: >>> 
s.nunique() 4 """ - obj = remove_na_arraylike(self) if dropna else self - return len(obj.unique()) + uniqs = self.unique() + if dropna: + uniqs = remove_na_arraylike(uniqs) + return len(uniqs) @property def is_unique(self) -> bool: diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index fd49ac0176ce4..baac872a6a466 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -652,6 +652,22 @@ def use_inf_as_na_cb(key): validator=is_one_of_factory(["auto", "pyarrow", "fastparquet"]), ) + +# Set up the io.sql specific configuration. +sql_engine_doc = """ +: string + The default sql reader/writer engine. Available options: + 'auto', 'sqlalchemy', the default is 'auto' +""" + +with cf.config_prefix("io.sql"): + cf.register_option( + "engine", + "auto", + sql_engine_doc, + validator=is_one_of_factory(["auto", "sqlalchemy"]), + ) + # -------- # Plotting # --------- diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d4ecec667cc86..46dc97214e2f6 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1356,7 +1356,7 @@ def soft_convert_objects( return converted if numeric and is_object_dtype(values.dtype): - converted = lib.maybe_convert_numeric(values, set(), coerce_numeric=True) + converted, _ = lib.maybe_convert_numeric(values, set(), coerce_numeric=True) # If all NaNs, then do not-alter values = converted if not isna(converted).all() else values diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8ac94111cca56..6d3042507d930 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5954,7 +5954,7 @@ def dropna( def drop_duplicates( self, subset: Hashable | Sequence[Hashable] | None = None, - keep: str | bool = "first", + keep: Literal["first"] | Literal["last"] | Literal[False] = "first", inplace: bool = False, ignore_index: bool = False, ) -> DataFrame | None: @@ -6051,7 +6051,7 @@ def drop_duplicates( def duplicated( self, subset: Hashable | Sequence[Hashable] | None = None, - keep: str | bool = "first", + keep: Literal["first"] | Literal["last"] | Literal[False] = "first", ) -> Series: """ Return boolean Series denoting duplicate rows. @@ -6146,7 +6146,7 @@ def duplicated( if self.empty: return self._constructor_sliced(dtype=bool) - def f(vals): + def f(vals) -> tuple[np.ndarray, int]: labels, shape = algorithms.factorize(vals, size_hint=len(self)) return labels.astype("i8", copy=False), len(shape) @@ -6173,7 +6173,14 @@ def f(vals): vals = (col.values for name, col in self.items() if name in subset) labels, shape = map(list, zip(*map(f, vals))) - ids = get_group_index(labels, shape, sort=False, xnull=False) + ids = get_group_index( + labels, + # error: Argument 1 to "tuple" has incompatible type "List[_T]"; + # expected "Iterable[int]" + tuple(shape), # type: ignore[arg-type] + sort=False, + xnull=False, + ) result = self._constructor_sliced(duplicated_int64(ids, keep), index=self.index) return result.__finalize__(self, method="duplicated") @@ -6219,7 +6226,6 @@ def sort_values( # type: ignore[override] indexer = lexsort_indexer( keys, orders=ascending, na_position=na_position, key=key ) - indexer = ensure_platform_int(indexer) elif len(by): by = by[0] @@ -8553,7 +8559,7 @@ def apply( Notes ----- Functions that mutate the passed object can produce unexpected - behavior or errors and are not supported. See :ref:`udf-mutation` + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` for more details. 
Examples @@ -10454,6 +10460,107 @@ def _AXIS_NAMES(self) -> dict[int, str]: boxplot = pandas.plotting.boxplot_frame sparse = CachedAccessor("sparse", SparseFrameAccessor) + # ---------------------------------------------------------------------- + # Internal Interface Methods + + def _to_dict_of_blocks(self, copy: bool = True): + """ + Return a dict of dtype -> Constructor Types that + each is a homogeneous dtype. + + Internal ONLY - only works for BlockManager + """ + mgr = self._mgr + # convert to BlockManager if needed -> this way support ArrayManager as well + mgr = mgr_to_mgr(mgr, "block") + mgr = cast(BlockManager, mgr) + return { + k: self._constructor(v).__finalize__(self) + for k, v, in mgr.to_dict(copy=copy).items() + } + + @property + def values(self) -> np.ndarray: + """ + Return a Numpy representation of the DataFrame. + + .. warning:: + + We recommend using :meth:`DataFrame.to_numpy` instead. + + Only the values in the DataFrame will be returned, the axes labels + will be removed. + + Returns + ------- + numpy.ndarray + The values of the DataFrame. + + See Also + -------- + DataFrame.to_numpy : Recommended alternative to this method. + DataFrame.index : Retrieve the index labels. + DataFrame.columns : Retrieving the column names. + + Notes + ----- + The dtype will be a lower-common-denominator dtype (implicit + upcasting); that is to say if the dtypes (even of numeric types) + are mixed, the one that accommodates all will be chosen. Use this + with care if you are not dealing with the blocks. + + e.g. If the dtypes are float16 and float32, dtype will be upcast to + float32. If dtypes are int32 and uint8, dtype will be upcast to + int32. By :func:`numpy.find_common_type` convention, mixing int64 + and uint64 will result in a float64 dtype. + + Examples + -------- + A DataFrame where all columns are the same type (e.g., int64) results + in an array of the same type. + + >>> df = pd.DataFrame({'age': [ 3, 29], + ... 'height': [94, 170], + ... 'weight': [31, 115]}) + >>> df + age height weight + 0 3 94 31 + 1 29 170 115 + >>> df.dtypes + age int64 + height int64 + weight int64 + dtype: object + >>> df.values + array([[ 3, 94, 31], + [ 29, 170, 115]]) + + A DataFrame with mixed type columns(e.g., str/object, int64, float32) + results in an ndarray of the broadest type that accommodates these + mixed types (e.g., object). + + >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'), + ... ('lion', 80.5, 1), + ... ('monkey', np.nan, None)], + ... columns=('name', 'max_speed', 'rank')) + >>> df2.dtypes + name object + max_speed float64 + rank object + dtype: object + >>> df2.values + array([['parrot', 24.0, 'second'], + ['lion', 80.5, 1], + ['monkey', nan, None]], dtype=object) + """ + self._consolidate_inplace() + return self._mgr.as_array(transpose=True) + + @property + def _values(self) -> np.ndarray: + """internal implementation""" + return self.values + DataFrame._add_numeric_operations() diff --git a/pandas/core/generic.py b/pandas/core/generic.py index eba4a36315ba4..d225ac6e6881b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5614,85 +5614,12 @@ def _get_bool_data(self): @property def values(self) -> np.ndarray: - """ - Return a Numpy representation of the DataFrame. - - .. warning:: - - We recommend using :meth:`DataFrame.to_numpy` instead. - - Only the values in the DataFrame will be returned, the axes labels - will be removed. - - Returns - ------- - numpy.ndarray - The values of the DataFrame. 
- - See Also - -------- - DataFrame.to_numpy : Recommended alternative to this method. - DataFrame.index : Retrieve the index labels. - DataFrame.columns : Retrieving the column names. - - Notes - ----- - The dtype will be a lower-common-denominator dtype (implicit - upcasting); that is to say if the dtypes (even of numeric types) - are mixed, the one that accommodates all will be chosen. Use this - with care if you are not dealing with the blocks. - - e.g. If the dtypes are float16 and float32, dtype will be upcast to - float32. If dtypes are int32 and uint8, dtype will be upcast to - int32. By :func:`numpy.find_common_type` convention, mixing int64 - and uint64 will result in a float64 dtype. - - Examples - -------- - A DataFrame where all columns are the same type (e.g., int64) results - in an array of the same type. - - >>> df = pd.DataFrame({'age': [ 3, 29], - ... 'height': [94, 170], - ... 'weight': [31, 115]}) - >>> df - age height weight - 0 3 94 31 - 1 29 170 115 - >>> df.dtypes - age int64 - height int64 - weight int64 - dtype: object - >>> df.values - array([[ 3, 94, 31], - [ 29, 170, 115]]) - - A DataFrame with mixed type columns(e.g., str/object, int64, float32) - results in an ndarray of the broadest type that accommodates these - mixed types (e.g., object). - - >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'), - ... ('lion', 80.5, 1), - ... ('monkey', np.nan, None)], - ... columns=('name', 'max_speed', 'rank')) - >>> df2.dtypes - name object - max_speed float64 - rank object - dtype: object - >>> df2.values - array([['parrot', 24.0, 'second'], - ['lion', 80.5, 1], - ['monkey', nan, None]], dtype=object) - """ - self._consolidate_inplace() - return self._mgr.as_array(transpose=self._AXIS_REVERSED) + raise AbstractMethodError(self) @property def _values(self) -> np.ndarray: """internal implementation""" - return self.values + raise AbstractMethodError(self) @property def dtypes(self): @@ -5725,23 +5652,6 @@ def dtypes(self): data = self._mgr.get_dtypes() return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_) - @final - def _to_dict_of_blocks(self, copy: bool_t = True): - """ - Return a dict of dtype -> Constructor Types that - each is a homogeneous dtype. 
- - Internal ONLY - only works for BlockManager - """ - mgr = self._mgr - # convert to BlockManager if needed -> this way support ArrayManager as well - mgr = mgr_to_mgr(mgr, "block") - mgr = cast(BlockManager, mgr) - return { - k: self._constructor(v).__finalize__(self) - for k, v, in mgr.to_dict(copy=copy).items() - } - def astype( self: FrameOrSeries, dtype, copy: bool_t = True, errors: str = "raise" ) -> FrameOrSeries: @@ -7404,10 +7314,10 @@ def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): with np.errstate(all="ignore"): if upper is not None: - subset = self.to_numpy() <= upper + subset = (self <= upper).to_numpy() result = result.where(subset, upper, axis=None, inplace=False) if lower is not None: - subset = self.to_numpy() >= lower + subset = (self >= lower).to_numpy() result = result.where(subset, lower, axis=None, inplace=False) if np.any(mask): diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 297681f1e10f5..2a2671374efc4 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -1,7 +1,4 @@ -from typing import ( - Optional, - Tuple, -) +from __future__ import annotations import numpy as np @@ -16,7 +13,7 @@ def recode_for_groupby( c: Categorical, sort: bool, observed: bool -) -> Tuple[Categorical, Optional[Categorical]]: +) -> tuple[Categorical, Categorical | None]: """ Code the categories to ensure we can groupby for categoricals. diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 771ba2845db6e..18506b871bda6 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -44,10 +44,6 @@ doc, ) -from pandas.core.dtypes.cast import ( - find_common_type, - maybe_downcast_numeric, -) from pandas.core.dtypes.common import ( ensure_int64, is_bool, @@ -73,7 +69,6 @@ validate_func_kwargs, ) from pandas.core.apply import GroupByApply -from pandas.core.arrays import Categorical from pandas.core.base import ( DataError, SpecificationError, @@ -88,7 +83,6 @@ _agg_template, _apply_docs, _transform_template, - get_groupby, group_selection_context, ) from pandas.core.indexes.api import ( @@ -97,7 +91,6 @@ all_indexes_same, ) import pandas.core.indexes.base as ibase -from pandas.core.internals import ArrayManager from pandas.core.series import Series from pandas.core.util.numba_ import maybe_use_numba @@ -227,7 +220,16 @@ def _selection_name(self): ... ) minimum maximum 1 1 2 - 2 3 4""" + 2 3 4 + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the aggregating function. + + >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min()) + 1 1.0 + 2 3.0 + dtype: float64""" ) @Appender( @@ -346,30 +348,36 @@ def _aggregate_multiple_funcs(self, arg): def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ): - output: dict[base.OutputKey, ArrayLike] = {} - # Ideally we would be able to enumerate self._iterate_slices and use - # the index from enumeration as the key of output, but ohlc in particular - # returns a (n x 4) array. 
Output requires 1D ndarrays as values, so we - # need to slice that up into 1D arrays - idx = 0 - for obj in self._iterate_slices(): - name = obj.name - is_numeric = is_numeric_dtype(obj.dtype) - if numeric_only and not is_numeric: - continue - result = self.grouper._cython_operation( - "aggregate", obj._values, how, axis=0, min_count=min_count - ) - assert result.ndim == 1 - key = base.OutputKey(label=name, position=idx) - output[key] = result - idx += 1 + obj = self._selected_obj + objvals = obj._values + data = obj._mgr - if not output: + if numeric_only and not is_numeric_dtype(obj.dtype): raise DataError("No numeric types to aggregate") - return self._wrap_aggregated_output(output) + # This is overkill because it is only called once, but is here to + # mirror the array_func used in DataFrameGroupBy._cython_agg_general + def array_func(values: ArrayLike) -> ArrayLike: + try: + result = self.grouper._cython_operation( + "aggregate", values, how, axis=data.ndim - 1, min_count=min_count + ) + except NotImplementedError: + # generally if we have numeric_only=False + # and non-applicable functions + # try to python agg + # TODO: shouldn't min_count matter? + result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt) + + return result + + result = array_func(objvals) + + ser = self.obj._constructor( + result, index=self.grouper.result_index, name=obj.name + ) + return self._reindex_output(ser) def _wrap_aggregated_output( self, @@ -494,19 +502,22 @@ def _get_index() -> Index: return self._reindex_output(result) def _aggregate_named(self, func, *args, **kwargs): + # Note: this is very similar to _aggregate_series_pure_python, + # but that does not pin group.name result = {} initialized = False for name, group in self: # Each step of this loop corresponds to # libreduction._BaseGrouper._apply_to_group - group.name = name # NB: libreduction does not pin name + # NB: libreduction does not pin name + object.__setattr__(group, "name", name) output = func(group, *args, **kwargs) output = libreduction.extract_result(output) if not initialized: # We only do this validation on the first iteration - libreduction.check_result_array(output, 0) + libreduction.check_result_array(output, group.dtype) initialized = True result[name] = output @@ -515,40 +526,15 @@ def _aggregate_named(self, func, *args, **kwargs): @Substitution(klass="Series") @Appender(_transform_template) def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + return self._transform( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) - if maybe_use_numba(engine): - with group_selection_context(self): - data = self._selected_obj - result = self._transform_with_numba( - data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs - ) - return self.obj._constructor( - result.ravel(), index=data.index, name=data.name - ) - - func = com.get_cython_func(func) or func - - if not isinstance(func, str): - return self._transform_general(func, *args, **kwargs) - - elif func not in base.transform_kernel_allowlist: - msg = f"'{func}' is not a valid function name for transform(name)" - raise ValueError(msg) - elif func in base.cythonized_kernels or func in base.transformation_kernels: - # cythonized transform or canned "agg+broadcast" - return getattr(self, func)(*args, **kwargs) - # If func is a reduction, we need to broadcast the - # result to the whole group. Compute func result - # and deal with possible broadcasting below. - # Temporarily set observed for dealing with categoricals. 
- with com.temp_setattr(self, "observed", True): - result = getattr(self, func)(*args, **kwargs) - return self._transform_fast(result) - - def _transform_general(self, func, *args, **kwargs): + def _transform_general(self, func: Callable, *args, **kwargs) -> Series: """ - Transform with a non-str `func`. + Transform with a callable func`. """ + assert callable(func) klass = type(self._selected_obj) results = [] @@ -570,28 +556,24 @@ def _transform_general(self, func, *args, **kwargs): result = self._set_result_index_ordered(concatenated) else: result = self.obj._constructor(dtype=np.float64) - # we will only try to coerce the result type if - # we have a numeric dtype, as these are *always* user-defined funcs - # the cython take a different path (and casting) - if is_numeric_dtype(result.dtype): - common_dtype = find_common_type([self._selected_obj.dtype, result.dtype]) - if common_dtype is result.dtype: - result = maybe_downcast_numeric(result, self._selected_obj.dtype) result.name = self._selected_obj.name return result - def _transform_fast(self, result) -> Series: + def _can_use_transform_fast(self, result) -> bool: + return True + + def _wrap_transform_fast_result(self, result: Series) -> Series: """ fast version of transform, only applicable to builtin/cythonizable functions """ - ids, _, ngroup = self.grouper.group_info + ids, _, _ = self.grouper.group_info result = result.reindex(self.grouper.result_index, copy=False) out = algorithms.take_nd(result._values, ids) return self.obj._constructor(out, index=self.obj.index, name=self.obj.name) - def filter(self, func, dropna=True, *args, **kwargs): + def filter(self, func, dropna: bool = True, *args, **kwargs): """ Return a copy of a Series excluding elements from groups that do not satisfy the boolean criterion specified by func. @@ -606,7 +588,7 @@ def filter(self, func, dropna=True, *args, **kwargs): Notes ----- Functions that mutate the passed object can produce unexpected - behavior or errors and are not supported. See :ref:`udf-mutation` + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` for more details. Examples @@ -987,7 +969,17 @@ class DataFrameGroupBy(GroupBy[DataFrame]): ``['column', 'aggfunc']`` to make it clearer what the arguments are. As usual, the aggregation can be a callable or a string alias. - See :ref:`groupby.aggregate.named` for more.""" + See :ref:`groupby.aggregate.named` for more. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the aggregating function. 
+ + >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min()) + B + A + 1 1.0 + 2 3.0""" ) @doc(_agg_template, examples=_agg_examples_doc, klass="DataFrame") @@ -1082,83 +1074,22 @@ def _cython_agg_general( # Note: we never get here with how="ohlc"; that goes through SeriesGroupBy data: Manager2D = self._get_data_to_aggregate() + orig = data if numeric_only: data = data.get_numeric_data(copy=False) - using_array_manager = isinstance(data, ArrayManager) - - def cast_agg_result(result: ArrayLike, values: ArrayLike) -> ArrayLike: - # see if we can cast the values to the desired dtype - # this may not be the original dtype - - if isinstance(values, Categorical) and isinstance(result, np.ndarray): - # If the Categorical op didn't raise, it is dtype-preserving - # We get here with how="first", "last", "min", "max" - result = type(values)._from_sequence(result.ravel(), dtype=values.dtype) - # Note this will have result.dtype == dtype from above - - elif ( - not using_array_manager - and isinstance(result.dtype, np.dtype) - and result.ndim == 1 - ): - # We went through a SeriesGroupByPath and need to reshape - # GH#32223 includes case with IntegerArray values - # We only get here with values.dtype == object - result = result.reshape(1, -1) - # test_groupby_duplicate_columns gets here with - # result.dtype == int64, values.dtype=object, how="min" - - return result - - def py_fallback(values: ArrayLike) -> ArrayLike: - # if self.grouper.aggregate fails, we fall back to a pure-python - # solution - - # We get here with a) EADtypes and b) object dtype - obj: FrameOrSeriesUnion - - # call our grouper again with only this block - if values.ndim == 1: - # We only get here with ExtensionArray - - obj = Series(values) - else: - # We only get here with values.dtype == object - # TODO special case not needed with ArrayManager - df = DataFrame(values.T) - # bc we split object blocks in grouped_reduce, we have only 1 col - # otherwise we'd have to worry about block-splitting GH#39329 - assert df.shape[1] == 1 - # Avoid call to self.values that can occur in DataFrame - # reductions; see GH#28949 - obj = df.iloc[:, 0] - - # Create SeriesGroupBy with observed=True so that it does - # not try to add missing categories if grouping over multiple - # Categoricals. This will done by later self._reindex_output() - # Doing it here creates an error. See GH#34951 - sgb = get_groupby(obj, self.grouper, observed=True) - # Note: bc obj is always a Series here, we can ignore axis and pass - # `alt` directly instead of `lambda x: alt(x, axis=self.axis)` - res_ser = sgb.aggregate(alt) # this will go through sgb._python_agg_general - - # unwrap Series to get array - res_values = res_ser._mgr.arrays[0] - return cast_agg_result(res_values, values) - def array_func(values: ArrayLike) -> ArrayLike: - try: result = self.grouper._cython_operation( - "aggregate", values, how, axis=1, min_count=min_count + "aggregate", values, how, axis=data.ndim - 1, min_count=min_count ) except NotImplementedError: # generally if we have numeric_only=False # and non-applicable functions # try to python agg - result = py_fallback(values) + # TODO: shouldn't min_count matter? 
+ result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt) return result @@ -1166,7 +1097,8 @@ def array_func(values: ArrayLike) -> ArrayLike: # continue and exclude the block new_mgr = data.grouped_reduce(array_func, ignore_failures=True) - if not len(new_mgr): + if not len(new_mgr) and len(orig): + # If the original Manager was already empty, no need to raise raise DataError("No numeric types to aggregate") return self._wrap_agged_manager(new_mgr) @@ -1379,61 +1311,27 @@ def _transform_general(self, func, *args, **kwargs): @Substitution(klass="DataFrame") @Appender(_transform_template) def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + return self._transform( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) - if maybe_use_numba(engine): - with group_selection_context(self): - data = self._selected_obj - result = self._transform_with_numba( - data, func, *args, engine_kwargs=engine_kwargs, **kwargs - ) - return self.obj._constructor(result, index=data.index, columns=data.columns) - - # optimized transforms - func = com.get_cython_func(func) or func - - if not isinstance(func, str): - return self._transform_general(func, *args, **kwargs) + def _can_use_transform_fast(self, result) -> bool: + return isinstance(result, DataFrame) and result.columns.equals( + self._obj_with_exclusions.columns + ) - elif func not in base.transform_kernel_allowlist: - msg = f"'{func}' is not a valid function name for transform(name)" - raise ValueError(msg) - elif func in base.cythonized_kernels or func in base.transformation_kernels: - # cythonized transformation or canned "reduction+broadcast" - return getattr(self, func)(*args, **kwargs) - # GH 30918 - # Use _transform_fast only when we know func is an aggregation - if func in base.reduction_kernels: - # If func is a reduction, we need to broadcast the - # result to the whole group. Compute func result - # and deal with possible broadcasting below. - # Temporarily set observed for dealing with categoricals. - with com.temp_setattr(self, "observed", True): - result = getattr(self, func)(*args, **kwargs) - - if isinstance(result, DataFrame) and result.columns.equals( - self._obj_with_exclusions.columns - ): - return self._transform_fast(result) - - return self._transform_general(func, *args, **kwargs) - - def _transform_fast(self, result: DataFrame) -> DataFrame: + def _wrap_transform_fast_result(self, result: DataFrame) -> DataFrame: """ Fast transform path for aggregations """ obj = self._obj_with_exclusions # for each col, reshape to size of original frame by take operation - ids, _, ngroup = self.grouper.group_info + ids, _, _ = self.grouper.group_info result = result.reindex(self.grouper.result_index, copy=False) - output = [ - algorithms.take_nd(result.iloc[:, i].values, ids) - for i, _ in enumerate(result.columns) - ] - - return self.obj._constructor._from_arrays( - output, columns=result.columns, index=obj.index - ) + output = result.take(ids, axis=0) + output.index = obj.index + return output def _define_paths(self, func, *args, **kwargs): if isinstance(func, str): @@ -1521,7 +1419,7 @@ def filter(self, func, dropna=True, *args, **kwargs): which group you are working on. Functions that mutate the passed object can produce unexpected - behavior or errors and are not supported. See :ref:`udf-mutation` + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` for more details. 
Examples @@ -1630,7 +1528,7 @@ def _gotitem(self, key, ndim: int, subset=None): raise AssertionError("invalid ndim for _gotitem") - def _wrap_frame_output(self, result, obj: DataFrame) -> DataFrame: + def _wrap_frame_output(self, result: dict, obj: DataFrame) -> DataFrame: result_index = self.grouper.levels[0] if self.axis == 0: @@ -1754,11 +1652,16 @@ def _iterate_column_groupbys(self): def _apply_to_column_groupbys(self, func) -> DataFrame: from pandas.core.reshape.concat import concat - return concat( - (func(col_groupby) for _, col_groupby in self._iterate_column_groupbys()), - keys=self._selected_obj.columns, - axis=1, - ) + columns = self._selected_obj.columns + results = [ + func(col_groupby) for _, col_groupby in self._iterate_column_groupbys() + ] + + if not len(results): + # concat would raise + return DataFrame([], columns=columns, index=self.grouper.result_index) + else: + return concat(results, keys=columns, axis=1) def count(self) -> DataFrame: """ @@ -1773,8 +1676,6 @@ def count(self) -> DataFrame: ids, _, ngroups = self.grouper.group_info mask = ids != -1 - using_array_manager = isinstance(data, ArrayManager) - def hfunc(bvalues: ArrayLike) -> ArrayLike: # TODO(2DEA): reshape would not be necessary with 2D EAs if bvalues.ndim == 1: @@ -1784,10 +1685,6 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: masked = mask & ~isna(bvalues) counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups, axis=1) - if using_array_manager: - # count_level_2d return (1, N) array for single column - # -> extract 1D array - counted = counted[0, :] return counted new_mgr = data.grouped_reduce(hfunc) @@ -1850,27 +1747,30 @@ def nunique(self, dropna: bool = True) -> DataFrame: # Try to consolidate with normal wrapping functions obj = self._obj_with_exclusions - axis_number = obj._get_axis_number(self.axis) - other_axis = int(not axis_number) - if axis_number == 0: + if self.axis == 0: iter_func = obj.items else: iter_func = obj.iterrows - results = concat( - [ - SeriesGroupBy(content, selection=label, grouper=self.grouper).nunique( - dropna - ) - for label, content in iter_func() - ], - axis=1, - ) - results = cast(DataFrame, results) + res_list = [ + SeriesGroupBy(content, selection=label, grouper=self.grouper).nunique( + dropna + ) + for label, content in iter_func() + ] + if res_list: + results = concat(res_list, axis=1) + results = cast(DataFrame, results) + else: + # concat would raise + results = DataFrame( + [], index=self.grouper.result_index, columns=obj.columns[:0] + ) - if axis_number == 1: + if self.axis == 1: results = results.T + other_axis = 1 - self.axis results._get_axis(other_axis).names = obj._get_axis(other_axis).names if not self.as_index: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 2436391580dcb..0dd456175c295 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -18,6 +18,7 @@ class providing the base-class of operations. from textwrap import dedent import types from typing import ( + TYPE_CHECKING, Callable, Generic, Hashable, @@ -28,6 +29,7 @@ class providing the base-class of operations. Sequence, TypeVar, Union, + cast, ) import numpy as np @@ -58,9 +60,7 @@ class providing the base-class of operations. doc, ) -from pandas.core.dtypes.cast import maybe_downcast_numeric from pandas.core.dtypes.common import ( - ensure_float, is_bool_dtype, is_datetime64_dtype, is_integer_dtype, @@ -100,9 +100,16 @@ class providing the base-class of operations. 
Index, MultiIndex, ) +from pandas.core.internals.blocks import ensure_block_shape from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter -from pandas.core.util.numba_ import NUMBA_FUNC_CACHE +from pandas.core.util.numba_ import ( + NUMBA_FUNC_CACHE, + maybe_use_numba, +) + +if TYPE_CHECKING: + from typing import Literal _common_see_also = """ See Also @@ -158,6 +165,11 @@ class providing the base-class of operations. side-effects, as they will take effect twice for the first group. + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. + Examples -------- {examples} @@ -165,7 +177,7 @@ class providing the base-class of operations. "dataframe_examples": """ >>> df = pd.DataFrame({'A': 'a a b'.split(), ... 'B': [1,2,3], - ... 'C': [4,6, 5]}) + ... 'C': [4,6,5]}) >>> g = df.groupby('A') Notice that ``g`` has two groups, ``a`` and ``b``. @@ -183,13 +195,17 @@ class providing the base-class of operations. Example 2: The function passed to `apply` takes a DataFrame as its argument and returns a Series. `apply` combines the result for - each group together into a new DataFrame: + each group together into a new DataFrame. + + .. versionchanged:: 1.3.0 - >>> g[['B', 'C']].apply(lambda x: x.max() - x.min()) - B C + The resulting dtype will reflect the return value of the passed ``func``. + + >>> g[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min()) + B C A - a 1 2 - b 0 0 + a 1.0 2.0 + b 0.0 0.0 Example 3: The function passed to `apply` takes a DataFrame as its argument and returns a scalar. `apply` combines the result for @@ -210,12 +226,16 @@ class providing the base-class of operations. Example 1: The function passed to `apply` takes a Series as its argument and returns a Series. `apply` combines the result for - each group together into a new Series: + each group together into a new Series. + + .. versionchanged:: 1.3.0 - >>> g.apply(lambda x: x*2 if x.name == 'b' else x/2) + The resulting dtype will reflect the return value of the passed ``func``. + + >>> g.apply(lambda x: x*2 if x.name == 'a' else x/2) a 0.0 - a 0.5 - b 4.0 + a 2.0 + b 1.0 dtype: float64 Example 2: The function passed to `apply` takes a Series as @@ -367,12 +387,17 @@ class providing the base-class of operations. in the subframe. If f also supports application to the entire subframe, then a fast path is used starting from the second chunk. * f must not mutate groups. Mutation is not supported and may - produce unexpected results. See :ref:`udf-mutation` for more details. + produce unexpected results. See :ref:`gotchas.udf-mutation` for more details. When using ``engine='numba'``, there will be no "fall back" behavior internally. The group data and group index will be passed as numpy arrays to the JITed user defined function, and no alternative execution attempts will be tried. +.. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. + Examples -------- @@ -402,6 +427,20 @@ class providing the base-class of operations. 3 3 8.0 4 4 6.0 5 3 8.0 + +.. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + for example: + +>>> grouped[['C', 'D']].transform(lambda x: x.astype(int).max()) + C D +0 5 8 +1 5 9 +2 5 8 +3 5 9 +4 5 8 +5 5 9 """ _agg_template = """ @@ -469,12 +508,16 @@ class providing the base-class of operations. 
When using ``engine='numba'``, there will be no "fall back" behavior internally. The group data and group index will be passed as numpy arrays to the JITed user defined function, and no alternative execution attempts will be tried. -{examples} Functions that mutate the passed object can produce unexpected -behavior or errors and are not supported. See :ref:`udf-mutation` +behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` for more details. -""" + +.. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. +{examples}""" @final @@ -546,6 +589,7 @@ class BaseGroupBy(PandasObject, SelectionMixin, Generic[FrameOrSeries]): axis: int grouper: ops.BaseGrouper obj: FrameOrSeries + group_keys: bool @final def __len__(self) -> int: @@ -556,26 +600,17 @@ def __repr__(self) -> str: # TODO: Better repr for GroupBy object return object.__repr__(self) - def _assure_grouper(self) -> None: - """ - We create the grouper on instantiation sub-classes may have a - different policy. - """ - pass - @final @property def groups(self) -> dict[Hashable, np.ndarray]: """ Dict {group name -> group labels}. """ - self._assure_grouper() return self.grouper.groups @final @property def ngroups(self) -> int: - self._assure_grouper() return self.grouper.ngroups @final @@ -584,7 +619,6 @@ def indices(self): """ Dict {group name -> group indices}. """ - self._assure_grouper() return self.grouper.indices @final @@ -721,6 +755,7 @@ def get_group(self, name, obj=None): return obj._take_with_is_copy(inds, axis=self.axis) + @final def __iter__(self) -> Iterator[tuple[Hashable, FrameOrSeries]]: """ Groupby iterator. @@ -809,6 +844,7 @@ class GroupBy(BaseGroupBy[FrameOrSeries]): grouper: ops.BaseGrouper as_index: bool + @final def __init__( self, obj: FrameOrSeries, @@ -1074,13 +1110,13 @@ def _numba_prep(self, func, data): raise NotImplementedError( "Numba engine can only be used with a single function." ) - labels, _, n_groups = self.grouper.group_info - sorted_index = get_group_index_sorter(labels, n_groups) - sorted_labels = algorithms.take_nd(labels, sorted_index, allow_fill=False) + ids, _, ngroups = self.grouper.group_info + sorted_index = get_group_index_sorter(ids, ngroups) + sorted_ids = algorithms.take_nd(ids, sorted_index, allow_fill=False) sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() - starts, ends = lib.generate_slices(sorted_labels, n_groups) + starts, ends = lib.generate_slices(sorted_ids, ngroups) return starts, ends, sorted_index, sorted_data @final @@ -1224,11 +1260,12 @@ def _python_agg_general(self, func, *args, **kwargs): # iterate through "columns" ex exclusions to populate output dict output: dict[base.OutputKey, ArrayLike] = {} + if self.ngroups == 0: + # agg_series below assumes ngroups > 0 + return self._python_apply_general(f, self._selected_obj) + for idx, obj in enumerate(self._iterate_slices()): name = obj.name - if self.grouper.ngroups == 0: - # agg_series below assumes ngroups > 0 - continue try: # if this function is invalid for this dtype, we will ignore it. 
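For reference, a minimal sketch of the dtype change these groupby hunks implement (pandas 1.3.0 behavior assumed; it mirrors the docstring examples added above): the dtype returned by the user-defined function is kept, rather than being downcast back to the input dtype.

>>> df = pd.DataFrame({"A": [1, 1, 2], "B": [1, 2, 3]})
>>> df.groupby("A")["B"].agg(lambda x: x.astype(float).min())
A
1    1.0
2    3.0
Name: B, dtype: float64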
@@ -1236,22 +1273,7 @@ def _python_agg_general(self, func, *args, **kwargs): except TypeError: continue - assert result is not None key = base.OutputKey(label=name, position=idx) - - if is_numeric_dtype(obj.dtype): - result = maybe_downcast_numeric(result, obj.dtype) - - if self.grouper._filter_empty_groups: - mask = counts.ravel() > 0 - - # since we are masking, make sure that we have a float object - values = result - if is_numeric_dtype(values.dtype): - values = ensure_float(values) - - result = maybe_downcast_numeric(values[mask], result.dtype) - output[key] = result if not output: @@ -1280,21 +1302,60 @@ def _agg_general( ) except DataError: pass - except NotImplementedError as err: - if "function is not implemented for this dtype" in str( - err - ) or "category dtype not supported" in str(err): - # raised in _get_cython_function, in some cases can - # be trimmed by implementing cython funcs for more dtypes - pass - else: - raise # apply a non-cython aggregation if result is None: result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) return result.__finalize__(self.obj, method="groupby") + def _agg_py_fallback( + self, values: ArrayLike, ndim: int, alt: Callable + ) -> ArrayLike: + """ + Fallback to pure-python aggregation if _cython_operation raises + NotImplementedError. + """ + # We get here with a) EADtypes and b) object dtype + + if values.ndim == 1: + # For DataFrameGroupBy we only get here with ExtensionArray + ser = Series(values) + else: + # We only get here with values.dtype == object + # TODO: special case not needed with ArrayManager + df = DataFrame(values.T) + # bc we split object blocks in grouped_reduce, we have only 1 col + # otherwise we'd have to worry about block-splitting GH#39329 + assert df.shape[1] == 1 + # Avoid call to self.values that can occur in DataFrame + # reductions; see GH#28949 + ser = df.iloc[:, 0] + + # Create SeriesGroupBy with observed=True so that it does + # not try to add missing categories if grouping over multiple + # Categoricals. This will done by later self._reindex_output() + # Doing it here creates an error. See GH#34951 + sgb = get_groupby(ser, self.grouper, observed=True) + # For SeriesGroupBy we could just use self instead of sgb + + if self.ngroups > 0: + res_values, _ = self.grouper.agg_series(ser, alt) + else: + # equiv: res_values = self._python_agg_general(alt) + res_values = sgb._python_apply_general(alt, ser)._values + + if isinstance(values, Categorical): + # Because we only get here with known dtype-preserving + # reductions, we cast back to Categorical. + # TODO: if we ever get "rank" working, exclude it here. 
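+            # We only get here with how in ["first", "last", "min", "max"];
+            # if the Categorical op didn't raise, it is dtype-preserving.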
+ res_values = type(values)._from_sequence(res_values, dtype=values.dtype) + + # If we are DataFrameGroupBy and went through a SeriesGroupByPath + # then we need to reshape + # GH#32223 includes case with IntegerArray values, ndarray res_values + # test_groupby_duplicate_columns with object dtype values + return ensure_block_shape(res_values, ndim=ndim) + def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ): @@ -1327,8 +1388,55 @@ def _cython_transform( return self._wrap_transformed_output(output) - def transform(self, func, *args, **kwargs): - raise AbstractMethodError(self) + @final + def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + + if maybe_use_numba(engine): + # TODO: tests with self._selected_obj.ndim == 1 on DataFrameGroupBy + with group_selection_context(self): + data = self._selected_obj + df = data if data.ndim == 2 else data.to_frame() + result = self._transform_with_numba( + df, func, *args, engine_kwargs=engine_kwargs, **kwargs + ) + if self.obj.ndim == 2: + return cast(DataFrame, self.obj)._constructor( + result, index=data.index, columns=data.columns + ) + else: + return cast(Series, self.obj)._constructor( + result.ravel(), index=data.index, name=data.name + ) + + # optimized transforms + func = com.get_cython_func(func) or func + + if not isinstance(func, str): + return self._transform_general(func, *args, **kwargs) + + elif func not in base.transform_kernel_allowlist: + msg = f"'{func}' is not a valid function name for transform(name)" + raise ValueError(msg) + elif func in base.cythonized_kernels or func in base.transformation_kernels: + # cythonized transform or canned "agg+broadcast" + return getattr(self, func)(*args, **kwargs) + + else: + # i.e. func in base.reduction_kernels + + # GH#30918 Use _transform_fast only when we know func is an aggregation + # If func is a reduction, we need to broadcast the + # result to the whole group. Compute func result + # and deal with possible broadcasting below. + # Temporarily set observed for dealing with categoricals. 
+ with com.temp_setattr(self, "observed", True): + result = getattr(self, func)(*args, **kwargs) + + if self._can_use_transform_fast(result): + return self._wrap_transform_fast_result(result) + + # only reached for DataFrameGroupBy + return self._transform_general(func, *args, **kwargs) # ----------------------------------------------------------------- # Utilities @@ -1351,7 +1459,7 @@ def _apply_filter(self, indices, dropna): return filtered @final - def _cumcount_array(self, ascending: bool = True): + def _cumcount_array(self, ascending: bool = True) -> np.ndarray: """ Parameters ---------- @@ -1541,7 +1649,7 @@ def mean(self, numeric_only: bool = True): """ result = self._cython_agg_general( "mean", - alt=lambda x, axis: Series(x).mean(numeric_only=numeric_only), + alt=lambda x: Series(x).mean(numeric_only=numeric_only), numeric_only=numeric_only, ) return result.__finalize__(self.obj, method="groupby") @@ -1568,7 +1676,7 @@ def median(self, numeric_only=True): """ result = self._cython_agg_general( "median", - alt=lambda x, axis: Series(x).median(axis=axis, numeric_only=numeric_only), + alt=lambda x: Series(x).median(numeric_only=numeric_only), numeric_only=numeric_only, ) return result.__finalize__(self.obj, method="groupby") @@ -1624,7 +1732,7 @@ def var(self, ddof: int = 1): """ if ddof == 1: return self._cython_agg_general( - "var", alt=lambda x, axis: Series(x).var(ddof=ddof) + "var", alt=lambda x: Series(x).var(ddof=ddof) ) else: func = lambda x: x.var(ddof=ddof) @@ -1816,9 +1924,7 @@ def describe(self, **kwargs): result = self.apply(lambda x: x.describe(**kwargs)) if self.axis == 1: return result.T - # FIXME: not being consolidated breaks - # test_describe_with_duplicate_output_column_names - return result._consolidate().unstack() + return result.unstack() @final def resample(self, rule, *args, **kwargs): @@ -1973,7 +2079,7 @@ def ewm(self, *args, **kwargs): ) @final - def _fill(self, direction, limit=None): + def _fill(self, direction: Literal["ffill", "bfill"], limit=None): """ Shared function for `pad` and `backfill` to call Cython method. @@ -2706,7 +2812,7 @@ def _get_cythonized_result( grouper = self.grouper - labels, _, ngroups = grouper.group_info + ids, _, ngroups = grouper.group_info output: dict[base.OutputKey, np.ndarray] = {} base_func = getattr(libgroupby, how) @@ -2715,7 +2821,7 @@ def _get_cythonized_result( name = obj.name values = obj._values - if numeric_only and not is_numeric_dtype(values): + if numeric_only and not is_numeric_dtype(values.dtype): continue if aggregate: @@ -2739,15 +2845,15 @@ def _get_cythonized_result( if pre_processing: try: vals, inferences = pre_processing(vals) - except TypeError as e: - error_msg = str(e) + except TypeError as err: + error_msg = str(err) continue vals = vals.astype(cython_dtype, copy=False) if needs_2d: vals = vals.reshape((-1, 1)) func = partial(func, vals) - func = partial(func, labels) + func = partial(func, ids) if min_count is not None: func = partial(func, min_count) @@ -2966,9 +3072,7 @@ def _reindex_output( Object (potentially) re-indexed to include all possible groups. 
""" groupings = self.grouper.groupings - if groupings is None: - return output - elif len(groupings) == 1: + if len(groupings) == 1: return output # if we only care about the observed values diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 151756b829a1d..f1762a2535ff7 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -249,6 +249,10 @@ class Grouper: Freq: 17T, dtype: int64 """ + axis: int + sort: bool + dropna: bool + _attributes: tuple[str, ...] = ("key", "level", "freq", "axis", "sort") def __new__(cls, *args, **kwargs): @@ -260,7 +264,13 @@ def __new__(cls, *args, **kwargs): return super().__new__(cls) def __init__( - self, key=None, level=None, freq=None, axis=0, sort=False, dropna=True + self, + key=None, + level=None, + freq=None, + axis: int = 0, + sort: bool = False, + dropna: bool = True, ): self.key = key self.level = level @@ -281,11 +291,11 @@ def __init__( def ax(self): return self.grouper - def _get_grouper(self, obj, validate: bool = True): + def _get_grouper(self, obj: FrameOrSeries, validate: bool = True): """ Parameters ---------- - obj : the subject object + obj : Series or DataFrame validate : bool, default True if True, validate the grouper @@ -296,7 +306,9 @@ def _get_grouper(self, obj, validate: bool = True): self._set_grouper(obj) # error: Value of type variable "FrameOrSeries" of "get_grouper" cannot be # "Optional[Any]" - self.grouper, _, self.obj = get_grouper( # type: ignore[type-var] + # error: Incompatible types in assignment (expression has type "BaseGrouper", + # variable has type "None") + self.grouper, _, self.obj = get_grouper( # type: ignore[type-var,assignment] self.obj, [self.key], axis=self.axis, @@ -375,15 +387,19 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): ax = ax.take(indexer) obj = obj.take(indexer, axis=self.axis) - self.obj = obj - self.grouper = ax + # error: Incompatible types in assignment (expression has type + # "FrameOrSeries", variable has type "None") + self.obj = obj # type: ignore[assignment] + # error: Incompatible types in assignment (expression has type "Index", + # variable has type "None") + self.grouper = ax # type: ignore[assignment] return self.grouper @final @property def groups(self): - # error: Item "None" of "Optional[Any]" has no attribute "groups" - return self.grouper.groups # type: ignore[union-attr] + # error: "None" has no attribute "groups" + return self.grouper.groups # type: ignore[attr-defined] @final def __repr__(self) -> str: @@ -428,7 +444,7 @@ def __init__( index: Index, grouper=None, obj: FrameOrSeries | None = None, - name=None, + name: Hashable = None, level=None, sort: bool = True, observed: bool = False, @@ -478,7 +494,12 @@ def __init__( # what key/level refer to exactly, don't need to # check again as we have by this point converted these # to an actual value (rather than a pd.Grouper) - _, grouper, _ = self.grouper._get_grouper(self.obj, validate=False) + _, grouper, _ = self.grouper._get_grouper( + # error: Value of type variable "FrameOrSeries" of "_get_grouper" + # of "Grouper" cannot be "Optional[FrameOrSeries]" + self.obj, # type: ignore[type-var] + validate=False, + ) if self.name is None: self.name = grouper.result_index.name self.obj = self.grouper.obj diff --git a/pandas/core/groupby/numba_.py b/pandas/core/groupby/numba_.py index 3ba70baec1561..26070fcb5e89c 100644 --- a/pandas/core/groupby/numba_.py +++ b/pandas/core/groupby/numba_.py @@ -1,11 +1,10 @@ """Common utilities for Numba operations with 
groupby ops""" +from __future__ import annotations + import inspect from typing import ( Any, Callable, - Dict, - Optional, - Tuple, ) import numpy as np @@ -57,10 +56,10 @@ def f(values, index, ...): def generate_numba_agg_func( - args: Tuple, - kwargs: Dict[str, Any], + args: tuple, + kwargs: dict[str, Any], func: Callable[..., Scalar], - engine_kwargs: Optional[Dict[str, bool]], + engine_kwargs: dict[str, bool] | None, ) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]: """ Generate a numba jitted agg function specified by values from engine_kwargs. @@ -117,10 +116,10 @@ def group_agg( def generate_numba_transform_func( - args: Tuple, - kwargs: Dict[str, Any], + args: tuple, + kwargs: dict[str, Any], func: Callable[..., np.ndarray], - engine_kwargs: Optional[Dict[str, bool]], + engine_kwargs: dict[str, bool] | None, ) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]: """ Generate a numba jitted transform function specified by values from engine_kwargs. diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 96f5b2cfb9d81..26812a07b4be3 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -14,6 +14,7 @@ Hashable, Iterator, Sequence, + overload, ) import numpy as np @@ -47,23 +48,34 @@ is_categorical_dtype, is_complex_dtype, is_datetime64_any_dtype, - is_datetime64tz_dtype, is_extension_array_dtype, - is_float_dtype, is_integer_dtype, is_numeric_dtype, - is_period_dtype, is_sparse, is_timedelta64_dtype, needs_i8_conversion, ) -from pandas.core.dtypes.generic import ABCCategoricalIndex +from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.missing import ( isna, maybe_fill, ) -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import ( + DatetimeArray, + ExtensionArray, + PeriodArray, + TimedeltaArray, +) +from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.floating import ( + Float64Dtype, + FloatingDtype, +) +from pandas.core.arrays.integer import ( + Int64Dtype, + _IntegerDtype, +) from pandas.core.arrays.masked import ( BaseMaskedArray, BaseMaskedDtype, @@ -76,6 +88,7 @@ grouper, ) from pandas.core.indexes.api import ( + CategoricalIndex, Index, MultiIndex, ensure_index, @@ -194,7 +207,7 @@ def get_cython_func_and_vals(self, values: np.ndarray, is_numeric: bool): return func, values - def disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False): + def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False): """ Check if we can do this operation with our cython functions. @@ -230,7 +243,7 @@ def disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False): if how in ["prod", "cumprod"]: raise TypeError(f"timedelta64 type does not support {how} operations") - def get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape: + def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape: how = self.how kind = self.kind @@ -261,7 +274,15 @@ def get_out_dtype(self, dtype: np.dtype) -> np.dtype: out_dtype = "object" return np.dtype(out_dtype) - def get_result_dtype(self, dtype: DtypeObj) -> DtypeObj: + @overload + def _get_result_dtype(self, dtype: np.dtype) -> np.dtype: + ... + + @overload + def _get_result_dtype(self, dtype: ExtensionDtype) -> ExtensionDtype: + ... + + def _get_result_dtype(self, dtype: DtypeObj) -> DtypeObj: """ Get the desired dtype of a result based on the input dtype and how it was computed. 
@@ -276,13 +297,6 @@ def get_result_dtype(self, dtype: DtypeObj) -> DtypeObj: np.dtype or ExtensionDtype The desired dtype of the result. """ - from pandas.core.arrays.boolean import BooleanDtype - from pandas.core.arrays.floating import Float64Dtype - from pandas.core.arrays.integer import ( - Int64Dtype, - _IntegerDtype, - ) - how = self.how if how in ["add", "cumsum", "sum", "prod"]: @@ -315,15 +329,12 @@ def _ea_wrap_cython_operation( # TODO: general case implementation overridable by EAs. orig_values = values - if is_datetime64tz_dtype(values.dtype) or is_period_dtype(values.dtype): + if isinstance(orig_values, (DatetimeArray, PeriodArray)): # All of the functions implemented here are ordinal, so we can # operate on the tz-naive equivalents - npvalues = values.view("M8[ns]") + npvalues = orig_values._ndarray.view("M8[ns]") res_values = self._cython_op_ndim_compat( - # error: Argument 1 to "_cython_op_ndim_compat" of - # "WrappedCythonOp" has incompatible type - # "Union[ExtensionArray, ndarray]"; expected "ndarray" - npvalues, # type: ignore[arg-type] + npvalues, min_count=min_count, ngroups=ngroups, comp_ids=comp_ids, @@ -336,14 +347,30 @@ def _ea_wrap_cython_operation( # preserve float64 dtype return res_values - res_values = res_values.astype("i8", copy=False) - # error: Too many arguments for "ExtensionArray" - result = type(orig_values)( # type: ignore[call-arg] - res_values, dtype=orig_values.dtype - ) + res_values = res_values.view("i8") + result = type(orig_values)(res_values, dtype=orig_values.dtype) return result - elif is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype): + elif isinstance(orig_values, TimedeltaArray): + # We have an ExtensionArray but not ExtensionDtype + res_values = self._cython_op_ndim_compat( + orig_values._ndarray, + min_count=min_count, + ngroups=ngroups, + comp_ids=comp_ids, + mask=None, + **kwargs, + ) + if self.how in ["rank"]: + # i.e. 
how in WrappedCythonOp.cast_blocklist, since + # other cast_blocklist methods dont go through cython_operation + # preserve float64 dtype + return res_values + + # otherwise res_values has the same dtype as original values + return type(orig_values)(res_values) + + elif isinstance(values.dtype, (BooleanDtype, _IntegerDtype)): # IntegerArray or BooleanArray npvalues = values.to_numpy("float64", na_value=np.nan) res_values = self._cython_op_ndim_compat( @@ -359,17 +386,14 @@ def _ea_wrap_cython_operation( # other cast_blocklist methods dont go through cython_operation return res_values - dtype = self.get_result_dtype(orig_values.dtype) - # error: Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]" - # has no attribute "construct_array_type" - cls = dtype.construct_array_type() # type: ignore[union-attr] + dtype = self._get_result_dtype(orig_values.dtype) + cls = dtype.construct_array_type() return cls._from_sequence(res_values, dtype=dtype) - elif is_float_dtype(values.dtype): + elif isinstance(values.dtype, FloatingDtype): # FloatingArray - # error: "ExtensionDtype" has no attribute "numpy_dtype" npvalues = values.to_numpy( - values.dtype.numpy_dtype, # type: ignore[attr-defined] + values.dtype.numpy_dtype, na_value=np.nan, ) res_values = self._cython_op_ndim_compat( @@ -385,10 +409,8 @@ def _ea_wrap_cython_operation( # other cast_blocklist methods dont go through cython_operation return res_values - dtype = self.get_result_dtype(orig_values.dtype) - # error: Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]" - # has no attribute "construct_array_type" - cls = dtype.construct_array_type() # type: ignore[union-attr] + dtype = self._get_result_dtype(orig_values.dtype) + cls = dtype.construct_array_type() return cls._from_sequence(res_values, dtype=dtype) raise NotImplementedError( @@ -422,12 +444,13 @@ def _masked_ea_wrap_cython_operation( mask=mask, **kwargs, ) - dtype = self.get_result_dtype(orig_values.dtype) + dtype = self._get_result_dtype(orig_values.dtype) assert isinstance(dtype, BaseMaskedDtype) cls = dtype.construct_array_type() return cls(res_values.astype(dtype.type, copy=False), mask) + @final def _cython_op_ndim_compat( self, values: np.ndarray, @@ -500,7 +523,7 @@ def _call_cython_op( if mask is not None: mask = mask.reshape(values.shape, order="C") - out_shape = self.get_output_shape(ngroups, values) + out_shape = self._get_output_shape(ngroups, values) func, values = self.get_cython_func_and_vals(values, is_numeric) out_dtype = self.get_out_dtype(values.dtype) @@ -550,12 +573,8 @@ def _call_cython_op( if self.how not in self.cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cast_blocklist we get here - res_dtype = self.get_result_dtype(orig_values.dtype) - # error: Argument 2 to "maybe_downcast_to_dtype" has incompatible type - # "Union[dtype[Any], ExtensionDtype]"; expected "Union[str, dtype[Any]]" - op_result = maybe_downcast_to_dtype( - result, res_dtype # type: ignore[arg-type] - ) + res_dtype = self._get_result_dtype(orig_values.dtype) + op_result = maybe_downcast_to_dtype(result, res_dtype) else: op_result = result @@ -563,6 +582,62 @@ def _call_cython_op( # expected "ndarray") return op_result # type: ignore[return-value] + @final + def cython_operation( + self, + *, + values: ArrayLike, + axis: int, + min_count: int = -1, + comp_ids: np.ndarray, + ngroups: int, + **kwargs, + ) -> ArrayLike: + """ + Call our cython function, with appropriate pre- and post- processing. 
+ """ + if values.ndim > 2: + raise NotImplementedError("number of dimensions is currently limited to 2") + elif values.ndim == 2: + # Note: it is *not* the case that axis is always 0 for 1-dim values, + # as we can have 1D ExtensionArrays that we need to treat as 2D + assert axis == 1, axis + + dtype = values.dtype + is_numeric = is_numeric_dtype(dtype) + + # can we do this operation with our cython functions + # if not raise NotImplementedError + self._disallow_invalid_ops(dtype, is_numeric) + + if not isinstance(values, np.ndarray): + # i.e. ExtensionArray + if isinstance(values, BaseMaskedArray) and self.uses_mask(): + return self._masked_ea_wrap_cython_operation( + values, + min_count=min_count, + ngroups=ngroups, + comp_ids=comp_ids, + **kwargs, + ) + else: + return self._ea_wrap_cython_operation( + values, + min_count=min_count, + ngroups=ngroups, + comp_ids=comp_ids, + **kwargs, + ) + + return self._cython_op_ndim_compat( + values, + min_count=min_count, + ngroups=ngroups, + comp_ids=comp_ids, + mask=None, + **kwargs, + ) + class BaseGrouper: """ @@ -586,6 +661,8 @@ class BaseGrouper: """ + axis: Index + def __init__( self, axis: Index, @@ -598,7 +675,6 @@ def __init__( ): assert isinstance(axis, Index), axis - self._filter_empty_groups = self.compressed = len(groupings) != 1 self.axis = axis self._groupings: list[grouper.Grouping] = list(groupings) self.sort = sort @@ -647,8 +723,8 @@ def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> DataSplitter: __finalize__ has not been called for the subsetted objects returned. """ - comp_ids, _, ngroups = self.group_info - return get_splitter(data, comp_ids, ngroups, axis=axis) + ids, _, ngroups = self.group_info + return get_splitter(data, ids, ngroups, axis=axis) def _get_grouper(self): """ @@ -664,10 +740,10 @@ def _get_group_keys(self): if len(self.groupings) == 1: return self.levels[0] else: - comp_ids, _, ngroups = self.group_info + ids, _, ngroups = self.group_info # provide "flattened" iterator for multi-group setting - return get_flattened_list(comp_ids, ngroups, self.levels, self.codes) + return get_flattened_list(ids, ngroups, self.levels, self.codes) @final def apply(self, f: F, data: FrameOrSeries, axis: int = 0): @@ -744,9 +820,7 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): @cache_readonly def indices(self): """ dict {group name -> group indices} """ - if len(self.groupings) == 1 and isinstance( - self.result_index, ABCCategoricalIndex - ): + if len(self.groupings) == 1 and isinstance(self.result_index, CategoricalIndex): # This shows unused categories in indices GH#38642 return self.groupings[0].indices codes_list = [ping.codes for ping in self.groupings] @@ -770,9 +844,9 @@ def size(self) -> Series: """ Compute group sizes. 
""" - ids, _, ngroup = self.group_info - if ngroup: - out = np.bincount(ids[ids != -1], minlength=ngroup) + ids, _, ngroups = self.group_info + if ngroups: + out = np.bincount(ids[ids != -1], minlength=ngroups) else: out = [] return Series(out, index=self.result_index, dtype="int64") @@ -799,17 +873,18 @@ def group_info(self): ngroups = len(obs_group_ids) comp_ids = ensure_platform_int(comp_ids) + return comp_ids, obs_group_ids, ngroups @final @cache_readonly def codes_info(self) -> np.ndarray: # return the codes of items in original grouped axis - codes, _, _ = self.group_info + ids, _, _ = self.group_info if self.indexer is not None: - sorter = np.lexsort((codes, self.indexer)) - codes = codes[sorter] - return codes + sorter = np.lexsort((ids, self.indexer)) + ids = ids[sorter] + return ids @final def _get_compressed_codes(self) -> tuple[np.ndarray, np.ndarray]: @@ -829,12 +904,12 @@ def ngroups(self) -> int: @property def reconstructed_codes(self) -> list[np.ndarray]: codes = self.codes - comp_ids, obs_ids, _ = self.group_info - return decons_obs_group_ids(comp_ids, obs_ids, self.shape, codes, xnull=True) + ids, obs_ids, _ = self.group_info + return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True) @cache_readonly def result_index(self) -> Index: - if not self.compressed and len(self.groupings) == 1: + if len(self.groupings) == 1: return self.groupings[0].result_index.rename(self.names[0]) codes = self.reconstructed_codes @@ -845,7 +920,9 @@ def result_index(self) -> Index: @final def get_group_levels(self) -> list[Index]: - if not self.compressed and len(self.groupings) == 1: + # Note: only called from _insert_inaxis_grouper_inplace, which + # is only called for BaseGrouper, never for BinGrouper + if len(self.groupings) == 1: return [self.groupings[0].result_index] name_list = [] @@ -868,7 +945,6 @@ def _cython_operation( how: str, axis: int, min_count: int = -1, - mask: np.ndarray | None = None, **kwargs, ) -> ArrayLike: """ @@ -876,50 +952,16 @@ def _cython_operation( """ assert kind in ["transform", "aggregate"] - if values.ndim > 2: - raise NotImplementedError("number of dimensions is currently limited to 2") - elif values.ndim == 2: - # Note: it is *not* the case that axis is always 0 for 1-dim values, - # as we can have 1D ExtensionArrays that we need to treat as 2D - assert axis == 1, axis - - dtype = values.dtype - is_numeric = is_numeric_dtype(dtype) - cy_op = WrappedCythonOp(kind=kind, how=how) - # can we do this operation with our cython functions - # if not raise NotImplementedError - cy_op.disallow_invalid_ops(dtype, is_numeric) - - comp_ids, _, _ = self.group_info + ids, _, _ = self.group_info ngroups = self.ngroups - - func_uses_mask = cy_op.uses_mask() - if is_extension_array_dtype(dtype): - if isinstance(values, BaseMaskedArray) and func_uses_mask: - return cy_op._masked_ea_wrap_cython_operation( - values, - min_count=min_count, - ngroups=ngroups, - comp_ids=comp_ids, - **kwargs, - ) - else: - return cy_op._ea_wrap_cython_operation( - values, - min_count=min_count, - ngroups=ngroups, - comp_ids=comp_ids, - **kwargs, - ) - - return cy_op._cython_op_ndim_compat( - values, + return cy_op.cython_operation( + values=values, + axis=axis, min_count=min_count, - ngroups=self.ngroups, - comp_ids=comp_ids, - mask=mask, + comp_ids=ids, + ngroups=ngroups, **kwargs, ) @@ -928,30 +970,33 @@ def agg_series(self, obj: Series, func: F) -> tuple[ArrayLike, np.ndarray]: # Caller is responsible for checking ngroups != 0 assert self.ngroups != 0 + cast_back = True if 
len(obj) == 0: # SeriesGrouper would raise if we were to call _aggregate_series_fast - return self._aggregate_series_pure_python(obj, func) + result, counts = self._aggregate_series_pure_python(obj, func) elif is_extension_array_dtype(obj.dtype): # _aggregate_series_fast would raise TypeError when # calling libreduction.Slider # In the datetime64tz case it would incorrectly cast to tz-naive # TODO: can we get a performant workaround for EAs backed by ndarray? - return self._aggregate_series_pure_python(obj, func) + result, counts = self._aggregate_series_pure_python(obj, func) elif obj.index._has_complex_internals: # Preempt TypeError in _aggregate_series_fast - return self._aggregate_series_pure_python(obj, func) + result, counts = self._aggregate_series_pure_python(obj, func) - try: - return self._aggregate_series_fast(obj, func) - except ValueError as err: - if "Must produce aggregated value" in str(err): - # raised in libreduction - pass - else: - raise - return self._aggregate_series_pure_python(obj, func) + else: + result, counts = self._aggregate_series_fast(obj, func) + cast_back = False + + npvalues = lib.maybe_convert_objects(result, try_float=False) + if cast_back: + # TODO: Is there a documented reason why we dont always cast_back? + out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) + else: + out = npvalues + return out, counts def _aggregate_series_fast( self, obj: Series, func: F @@ -963,27 +1008,28 @@ def _aggregate_series_fast( # - ngroups != 0 func = com.is_builtin_func(func) - group_index, _, ngroups = self.group_info + ids, _, ngroups = self.group_info # avoids object / Series creation overhead - indexer = get_group_index_sorter(group_index, ngroups) + indexer = get_group_index_sorter(ids, ngroups) obj = obj.take(indexer) - group_index = group_index.take(indexer) - grouper = libreduction.SeriesGrouper(obj, func, group_index, ngroups) - result, counts = grouper.get_result() + ids = ids.take(indexer) + sgrouper = libreduction.SeriesGrouper(obj, func, ids, ngroups) + result, counts = sgrouper.get_result() return result, counts @final def _aggregate_series_pure_python(self, obj: Series, func: F): - group_index, _, ngroups = self.group_info + ids, _, ngroups = self.group_info counts = np.zeros(ngroups, dtype=int) result = np.empty(ngroups, dtype="O") initialized = False - splitter = get_splitter(obj, group_index, ngroups, axis=0) + # equiv: splitter = self._get_splitter(obj, axis=0) + splitter = get_splitter(obj, ids, ngroups, axis=0) - for label, group in enumerate(splitter): + for i, group in enumerate(splitter): # Each step of this loop corresponds to # libreduction._BaseGrouper._apply_to_group @@ -992,16 +1038,13 @@ def _aggregate_series_pure_python(self, obj: Series, func: F): if not initialized: # We only do this validation on the first iteration - libreduction.check_result_array(res, 0) + libreduction.check_result_array(res, group.dtype) initialized = True - counts[label] = group.shape[0] - result[label] = res - - npvalues = lib.maybe_convert_objects(result, try_float=False) - out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) + counts[i] = group.shape[0] + result[i] = res - return out, counts + return result, counts class BinGrouper(BaseGrouper): @@ -1012,7 +1055,6 @@ class BinGrouper(BaseGrouper): ---------- bins : the split index of binlabels to group the item of axis binlabels : the label list - filter_empty : bool, default False mutated : bool, default False indexer : np.ndarray[np.intp] @@ -1034,17 +1076,19 @@ class 
BinGrouper(BaseGrouper): """ + bins: np.ndarray # np.ndarray[np.int64] + binlabels: Index + mutated: bool + def __init__( self, bins, binlabels, - filter_empty: bool = False, mutated: bool = False, indexer=None, ): self.bins = ensure_int64(bins) self.binlabels = ensure_index(binlabels) - self._filter_empty_groups = filter_empty self.mutated = mutated self.indexer = indexer @@ -1117,7 +1161,7 @@ def indices(self): @cache_readonly def group_info(self): ngroups = self.ngroups - obs_group_ids = np.arange(ngroups) + obs_group_ids = np.arange(ngroups, dtype=np.int64) rep = np.diff(np.r_[0, self.bins]) rep = ensure_platform_int(rep) @@ -1128,7 +1172,7 @@ def group_info(self): return ( ensure_platform_int(comp_ids), - obs_group_ids.astype("int64", copy=False), + obs_group_ids, ngroups, ) @@ -1154,10 +1198,9 @@ def names(self) -> list[Hashable]: @property def groupings(self) -> list[grouper.Grouping]: - return [ - grouper.Grouping(lvl, lvl, in_axis=False, level=None, name=name) - for lvl, name in zip(self.levels, self.names) - ] + lev = self.binlabels + ping = grouper.Grouping(lev, lev, in_axis=False, level=None, name=lev.name) + return [ping] def _aggregate_series_fast( self, obj: Series, func: F @@ -1167,8 +1210,8 @@ def _aggregate_series_fast( # - obj is backed by an ndarray, not ExtensionArray # - ngroups != 0 # - len(self.bins) > 0 - grouper = libreduction.SeriesBinGrouper(obj, func, self.bins) - return grouper.get_result() + sbg = libreduction.SeriesBinGrouper(obj, func, self.bins) + return sbg.get_result() def _is_indexed_like(obj, axes, axis: int) -> bool: diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index aa780787d58b6..4f3f536cd3290 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -164,7 +164,7 @@ def check_setitem_lengths(indexer, value, values) -> bool: # a) not necessarily 1-D indexers, e.g. tuple # b) boolean indexers e.g. 
BoolArray if is_list_like(value): - if len(indexer) != len(value): + if len(indexer) != len(value) and values.ndim == 1: # boolean with truth values == len of the value is ok too if not ( isinstance(indexer, np.ndarray) @@ -180,7 +180,8 @@ def check_setitem_lengths(indexer, value, values) -> bool: elif isinstance(indexer, slice): if is_list_like(value): - if len(value) != length_of_indexer(indexer, values): + if len(value) != length_of_indexer(indexer, values) and values.ndim == 1: + # In case of two dimensional value is used row-wise and broadcasted raise ValueError( "cannot set using a slice indexer with a " "different length than the value" diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9b3f2d191831d..84f1245299d53 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -907,9 +907,7 @@ def astype(self, dtype, copy=True): elif is_categorical_dtype(dtype): from pandas.core.indexes.category import CategoricalIndex - return CategoricalIndex( - self._values, name=self.name, dtype=dtype, copy=copy - ) + return CategoricalIndex(self, name=self.name, dtype=dtype, copy=copy) elif is_extension_array_dtype(dtype): return Index(np.asarray(self), name=self.name, dtype=dtype, copy=copy) @@ -2923,8 +2921,10 @@ def union(self, other, sort=None): other, result_name = self._convert_can_do_setop(other) if not is_dtype_equal(self.dtype, other.dtype): - if isinstance(self, ABCMultiIndex) and not is_object_dtype( - unpack_nested_dtype(other) + if ( + isinstance(self, ABCMultiIndex) + and not is_object_dtype(unpack_nested_dtype(other)) + and len(other) > 0 ): raise NotImplementedError( "Can only union MultiIndex with MultiIndex or Index of tuples, " @@ -3414,7 +3414,7 @@ def get_indexer( limit: int | None = None, tolerance=None, ) -> np.ndarray: - + # returned ndarray is np.intp method = missing.clean_reindex_fill_method(method) target = ensure_index(target) @@ -4099,7 +4099,10 @@ def _join_multi(self, other: Index, how: str_t): return result @final - def _join_non_unique(self, other, how="left"): + def _join_non_unique( + self, other: Index, how: str_t = "left" + ) -> tuple[Index, np.ndarray, np.ndarray]: + # returned ndarrays are np.intp from pandas.core.reshape.merge import get_join_indexers # We only get here if dtypes match @@ -4125,7 +4128,10 @@ def _join_non_unique(self, other, how="left"): return join_index, left_idx, right_idx @final - def _join_level(self, other, level, how="left", keep_order=True): + def _join_level( + self, other: Index, level, how: str_t = "left", keep_order: bool = True + ) -> tuple[MultiIndex, np.ndarray | None, np.ndarray | None]: + # Any returned ndarrays are np.intp """ The join method *only* affects the level of the resulting MultiIndex. Otherwise it just exactly aligns the Index data to the @@ -5874,7 +5880,7 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): return start_slice, end_slice - def delete(self, loc) -> Index: + def delete(self: _IndexT, loc) -> _IndexT: """ Make new Index with passed location(-s) deleted. 
@@ -6425,8 +6431,8 @@ def _maybe_cast_data_without_dtype(subarr): return tda elif inferred == "period": try: - data = PeriodArray._from_sequence(subarr) - return data + parr = PeriodArray._from_sequence(subarr) + return parr except IncompatibleFrequency: pass diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index b2d2c98c08f68..b2377f5b27966 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -1,16 +1,14 @@ """ Base and utility classes for tseries type pandas objects. """ +from __future__ import annotations + from datetime import datetime from typing import ( TYPE_CHECKING, Any, - List, - Optional, Sequence, - Tuple, TypeVar, - Union, cast, ) @@ -86,12 +84,12 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): """ _can_hold_strings = False - _data: Union[DatetimeArray, TimedeltaArray, PeriodArray] - freq: Optional[BaseOffset] - freqstr: Optional[str] + _data: DatetimeArray | TimedeltaArray | PeriodArray + freq: BaseOffset | None + freqstr: str | None _resolution_obj: Resolution - _bool_ops: List[str] = [] - _field_ops: List[str] = [] + _bool_ops: list[str] = [] + _field_ops: list[str] = [] # error: "Callable[[Any], Any]" has no attribute "fget" hasnans = cache_readonly( @@ -196,7 +194,7 @@ def _convert_tolerance(self, tolerance, target): tolerance = np.asarray(to_timedelta(tolerance).to_numpy()) return super()._convert_tolerance(tolerance, target) - def tolist(self) -> List: + def tolist(self) -> list: """ Return a list of the underlying data. """ @@ -322,10 +320,10 @@ def argmax(self, axis=None, skipna=True, *args, **kwargs): def format( self, name: bool = False, - formatter: Optional[Callable] = None, + formatter: Callable | None = None, na_rep: str = "NaT", - date_format: Optional[str] = None, - ) -> List[str]: + date_format: str | None = None, + ) -> list[str]: """ Render a string representation of the Index. """ @@ -343,8 +341,8 @@ def format( return self._format_with_header(header, na_rep=na_rep, date_format=date_format) def _format_with_header( - self, header: List[str], na_rep: str = "NaT", date_format: Optional[str] = None - ) -> List[str]: + self, header: list[str], na_rep: str = "NaT", date_format: str | None = None + ) -> list[str]: return header + list( self._format_native_types(na_rep=na_rep, date_format=date_format) ) @@ -506,7 +504,7 @@ def shift(self: _T, periods: int = 1, freq=None) -> _T: # -------------------------------------------------------------------- # List-like Methods - def _get_delete_freq(self, loc: Union[int, slice, Sequence[int]]): + def _get_delete_freq(self, loc: int | slice | Sequence[int]): """ Find the `freq` for self.delete(loc). 
""" @@ -613,6 +611,8 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin): but not PeriodIndex """ + _data: DatetimeArray | TimedeltaArray + # Compat for frequency inference, see GH#23789 _is_monotonic_increasing = Index.is_monotonic_increasing _is_monotonic_decreasing = Index.is_monotonic_decreasing @@ -828,6 +828,6 @@ def join( sort=sort, ) - def _maybe_utc_convert(self: _T, other: Index) -> Tuple[_T, Index]: + def _maybe_utc_convert(self: _T, other: Index) -> tuple[_T, Index]: # Overridden by DatetimeIndex return self, other diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0d85d891d2309..de976039639a7 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1612,7 +1612,7 @@ def _inferred_type_levels(self) -> list[str]: @doc(Index.duplicated) def duplicated(self, keep="first") -> np.ndarray: - shape = map(len, self.levels) + shape = tuple(len(lev) for lev in self.levels) ids = get_group_index(self.codes, shape, sort=False, xnull=False) return duplicated_int64(ids, keep) @@ -3598,7 +3598,7 @@ def _get_reconciled_name_object(self, other) -> MultiIndex: def _maybe_match_names(self, other): """ Try to find common names to attach to the result of an operation between - a and b. Return a consensus list of names if they match at least partly + a and b. Return a consensus list of names if they match at least partly or list of None if they have completely different names. """ if len(self.names) != len(other.names): diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 3e8b44dcee831..28f563764ef10 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -98,7 +98,7 @@ def _ensure_array(cls, data, dtype, copy: bool): return subarr @classmethod - def _validate_dtype(cls, dtype: Dtype) -> None: + def _validate_dtype(cls, dtype: Dtype | None) -> None: if dtype is None: return diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index ae6623e611180..0299f1403fcfb 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -8,6 +8,8 @@ Any, Callable, Hashable, + List, + cast, ) import warnings @@ -110,13 +112,7 @@ def __new__( copy: bool = False, name: Hashable = None, ) -> RangeIndex: - - # error: Argument 1 to "_validate_dtype" of "NumericIndex" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float], Type[int], - # Type[complex], Type[bool], Type[object], None]"; expected - # "Union[ExtensionDtype, Union[str, dtype[Any]], Type[str], Type[float], - # Type[int], Type[complex], Type[bool], Type[object]]" - cls._validate_dtype(dtype) # type: ignore[arg-type] + cls._validate_dtype(dtype) name = maybe_extract_name(name, start, cls) # RangeIndex @@ -159,13 +155,7 @@ def from_range( f"{cls.__name__}(...) 
must be called with object coercible to a " f"range, {repr(data)} was passed" ) - - # error: Argument 1 to "_validate_dtype" of "NumericIndex" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float], Type[int], - # Type[complex], Type[bool], Type[object], None]"; expected - # "Union[ExtensionDtype, Union[str, dtype[Any]], Type[str], Type[float], - # Type[int], Type[complex], Type[bool], Type[object]]" - cls._validate_dtype(dtype) # type: ignore[arg-type] + cls._validate_dtype(dtype) return cls._simple_new(data, name=name) @classmethod @@ -439,9 +429,8 @@ def _get_indexer( def repeat(self, repeats, axis=None) -> Int64Index: return self._int64index.repeat(repeats, axis=axis) - def delete(self, loc) -> Int64Index: - # error: Incompatible return value type (got "Index", expected "Int64Index") - return self._int64index.delete(loc) # type: ignore[return-value] + def delete(self, loc) -> Int64Index: # type: ignore[override] + return self._int64index.delete(loc) def take( self, indices, axis: int = 0, allow_fill: bool = True, fill_value=None, **kwargs @@ -762,7 +751,7 @@ def symmetric_difference(self, other, result_name: Hashable = None, sort=None): # -------------------------------------------------------------------- - def _concat(self, indexes: list[Index], name: Hashable): + def _concat(self, indexes: list[Index], name: Hashable) -> Index: """ Overriding parent method for the case of all RangeIndex instances. @@ -777,14 +766,15 @@ def _concat(self, indexes: list[Index], name: Hashable): elif len(indexes) == 1: return indexes[0] + rng_indexes = cast(List[RangeIndex], indexes) + start = step = next_ = None # Filter the empty indexes - non_empty_indexes = [obj for obj in indexes if len(obj)] + non_empty_indexes = [obj for obj in rng_indexes if len(obj)] for obj in non_empty_indexes: - # error: "Index" has no attribute "_range" - rng: range = obj._range # type: ignore[attr-defined] + rng = obj._range if start is None: # This is set by the first non-empty index @@ -794,7 +784,8 @@ def _concat(self, indexes: list[Index], name: Hashable): elif step is None: # First non-empty index had only one element if rng.start == start: - result = Int64Index(np.concatenate([x._values for x in indexes])) + values = np.concatenate([x._values for x in rng_indexes]) + result = Int64Index(values) return result.rename(name) step = rng.start - start @@ -803,7 +794,7 @@ def _concat(self, indexes: list[Index], name: Hashable): next_ is not None and rng.start != next_ ) if non_consecutive: - result = Int64Index(np.concatenate([x._values for x in indexes])) + result = Int64Index(np.concatenate([x._values for x in rng_indexes])) return result.rename(name) if step is not None: @@ -812,12 +803,7 @@ def _concat(self, indexes: list[Index], name: Hashable): if non_empty_indexes: # Get the stop value from "next" or alternatively # from the last non-empty index - # error: "Index" has no attribute "stop" - stop = ( - non_empty_indexes[-1].stop # type: ignore[attr-defined] - if next_ is None - else next_ - ) + stop = non_empty_indexes[-1].stop if next_ is None else next_ return RangeIndex(start, stop, step).rename(name) # Here all "indexes" had 0 length, i.e. were empty. 
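(Illustrative sketch, not part of the patch: the `_concat` override above keeps the `RangeIndex` type when the concatenated pieces form one consecutive range and otherwise falls back to `Int64Index`; the reprs below assume a pandas 1.x build where `Int64Index` still exists.)

>>> import pandas as pd
>>> pd.RangeIndex(3).append(pd.RangeIndex(3, 6))
RangeIndex(start=0, stop=6, step=1)
>>> pd.RangeIndex(3).append(pd.RangeIndex(4, 6))  # gap between the ranges
Int64Index([0, 1, 2, 4, 5], dtype='int64')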
diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index 2f8686fd38929..37e07af71213e 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -19,6 +19,7 @@ ) from pandas.core.arrays import DatetimeArray +from pandas.core.construction import extract_array from pandas.core.internals.blocks import ( Block, DatetimeTZBlock, @@ -49,7 +50,6 @@ def make_block( values, dtype = extract_pandas_array(values, dtype, ndim) - needs_reshape = False if klass is None: dtype = dtype or values.dtype klass = get_block_type(values, dtype) @@ -57,13 +57,14 @@ def make_block( elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values.dtype): # pyarrow calls get here values = DatetimeArray._simple_new(values, dtype=dtype) - needs_reshape = True if not isinstance(placement, BlockPlacement): placement = BlockPlacement(placement) ndim = maybe_infer_ndim(values, placement, ndim) - if needs_reshape: + if is_datetime64tz_dtype(values.dtype): + # GH#41168 ensure we can pass 1D dt64tz values + values = extract_array(values, extract_numpy=True) values = ensure_block_shape(values, ndim) check_ndim(values, placement, ndim) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index a25750e7e1eab..71e6d14e6a716 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -201,47 +201,6 @@ def __repr__(self) -> str: output += f"\n{arr.dtype}" return output - def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: - """ - Apply grouped reduction function columnwise, returning a new ArrayManager. - - Parameters - ---------- - func : grouped reduction function - ignore_failures : bool, default False - Whether to drop columns where func raises TypeError. - - Returns - ------- - ArrayManager - """ - result_arrays: list[np.ndarray] = [] - result_indices: list[int] = [] - - for i, arr in enumerate(self.arrays): - try: - res = func(arr) - except (TypeError, NotImplementedError): - if not ignore_failures: - raise - continue - result_arrays.append(res) - result_indices.append(i) - - if len(result_arrays) == 0: - index = Index([None]) # placeholder - else: - index = Index(range(result_arrays[0].shape[0])) - - if ignore_failures: - columns = self.items[np.array(result_indices, dtype="int64")] - else: - columns = self.items - - # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]"; - # expected "List[Union[ndarray, ExtensionArray]]" - return type(self)(result_arrays, [index, columns]) # type: ignore[arg-type] - def apply( self: T, f, @@ -322,25 +281,6 @@ def apply( # expected "List[Union[ndarray, ExtensionArray]]" return type(self)(result_arrays, new_axes) # type: ignore[arg-type] - def apply_2d(self: T, f, ignore_failures: bool = False, **kwargs) -> T: - """ - Variant of `apply`, but where the function should not be applied to - each column independently, but to the full data as a 2D array. 
- """ - values = self.as_array() - try: - result = f(values, **kwargs) - except (TypeError, NotImplementedError): - if not ignore_failures: - raise - result_arrays = [] - new_axes = [self._axes[0], self.axes[1].take([])] - else: - result_arrays = [result[:, i] for i in range(len(self._axes[1]))] - new_axes = self._axes - - return type(self)(result_arrays, new_axes) - def apply_with_block(self: T, f, align_keys=None, swap_axis=True, **kwargs) -> T: # switch axis to follow BlockManager logic if swap_axis and "axis" in kwargs and self.ndim == 2: @@ -606,67 +546,6 @@ def copy_func(ax): new_arrays = self.arrays return type(self)(new_arrays, new_axes) - def as_array( - self, - transpose: bool = False, - dtype=None, - copy: bool = False, - na_value=lib.no_default, - ) -> np.ndarray: - """ - Convert the blockmanager data into an numpy array. - - Parameters - ---------- - transpose : bool, default False - If True, transpose the return array. - dtype : object, default None - Data type of the return array. - copy : bool, default False - If True then guarantee that a copy is returned. A value of - False does not guarantee that the underlying data is not - copied. - na_value : object, default lib.no_default - Value to be used as the missing value sentinel. - - Returns - ------- - arr : ndarray - """ - if len(self.arrays) == 0: - arr = np.empty(self.shape, dtype=float) - return arr.transpose() if transpose else arr - - # We want to copy when na_value is provided to avoid - # mutating the original object - copy = copy or na_value is not lib.no_default - - if not dtype: - dtype = interleaved_dtype([arr.dtype for arr in self.arrays]) - - if isinstance(dtype, SparseDtype): - dtype = dtype.subtype - elif isinstance(dtype, PandasDtype): - dtype = dtype.numpy_dtype - elif is_extension_array_dtype(dtype): - dtype = "object" - elif is_dtype_equal(dtype, str): - dtype = "object" - - result = np.empty(self.shape_proper, dtype=dtype) - - # error: Incompatible types in assignment (expression has type "Union[ndarray, - # ExtensionArray]", variable has type "ndarray") - for i, arr in enumerate(self.arrays): # type: ignore[assignment] - arr = arr.astype(dtype, copy=copy) - result[:, i] = arr - - if na_value is not lib.no_default: - result[isna(result)] = na_value - - return result - # return arr.transpose() if transpose else arr - def reindex_indexer( self: T, new_axis, @@ -1035,6 +914,55 @@ def idelete(self, indexer): # -------------------------------------------------------------------- # Array-wise Operation + def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: + """ + Apply grouped reduction function columnwise, returning a new ArrayManager. + + Parameters + ---------- + func : grouped reduction function + ignore_failures : bool, default False + Whether to drop columns where func raises TypeError. 
+ + Returns + ------- + ArrayManager + """ + result_arrays: list[np.ndarray] = [] + result_indices: list[int] = [] + + for i, arr in enumerate(self.arrays): + # grouped_reduce functions all expect 2D arrays + arr = ensure_block_shape(arr, ndim=2) + try: + res = func(arr) + except (TypeError, NotImplementedError): + if not ignore_failures: + raise + continue + + if res.ndim == 2: + # reverse of ensure_block_shape + assert res.shape[0] == 1 + res = res[0] + + result_arrays.append(res) + result_indices.append(i) + + if len(result_arrays) == 0: + index = Index([None]) # placeholder + else: + index = Index(range(result_arrays[0].shape[0])) + + if ignore_failures: + columns = self.items[np.array(result_indices, dtype="int64")] + else: + columns = self.items + + # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]"; + # expected "List[Union[ndarray, ExtensionArray]]" + return type(self)(result_arrays, [index, columns]) # type: ignore[arg-type] + def reduce( self: T, func: Callable, ignore_failures: bool = False ) -> tuple[T, np.ndarray]: @@ -1122,6 +1050,27 @@ def quantile( axes = [qs, self._axes[1]] return type(self)(new_arrs, axes) + def apply_2d( + self: ArrayManager, f, ignore_failures: bool = False, **kwargs + ) -> ArrayManager: + """ + Variant of `apply`, but where the function should not be applied to + each column independently, but to the full data as a 2D array. + """ + values = self.as_array() + try: + result = f(values, **kwargs) + except (TypeError, NotImplementedError): + if not ignore_failures: + raise + result_arrays = [] + new_axes = [self._axes[0], self.axes[1].take([])] + else: + result_arrays = [result[:, i] for i in range(len(self._axes[1]))] + new_axes = self._axes + + return type(self)(result_arrays, new_axes) + # ---------------------------------------------------------------- def unstack(self, unstacker, fill_value) -> ArrayManager: @@ -1166,6 +1115,67 @@ def unstack(self, unstacker, fill_value) -> ArrayManager: return type(self)(new_arrays, new_axes, verify_integrity=False) + def as_array( + self, + transpose: bool = False, + dtype=None, + copy: bool = False, + na_value=lib.no_default, + ) -> np.ndarray: + """ + Convert the blockmanager data into an numpy array. + + Parameters + ---------- + transpose : bool, default False + If True, transpose the return array. + dtype : object, default None + Data type of the return array. + copy : bool, default False + If True then guarantee that a copy is returned. A value of + False does not guarantee that the underlying data is not + copied. + na_value : object, default lib.no_default + Value to be used as the missing value sentinel. 
+ + Returns + ------- + arr : ndarray + """ + if len(self.arrays) == 0: + arr = np.empty(self.shape, dtype=float) + return arr.transpose() if transpose else arr + + # We want to copy when na_value is provided to avoid + # mutating the original object + copy = copy or na_value is not lib.no_default + + if not dtype: + dtype = interleaved_dtype([arr.dtype for arr in self.arrays]) + + if isinstance(dtype, SparseDtype): + dtype = dtype.subtype + elif isinstance(dtype, PandasDtype): + dtype = dtype.numpy_dtype + elif is_extension_array_dtype(dtype): + dtype = "object" + elif is_dtype_equal(dtype, str): + dtype = "object" + + result = np.empty(self.shape_proper, dtype=dtype) + + # error: Incompatible types in assignment (expression has type "Union[ndarray, + # ExtensionArray]", variable has type "ndarray") + for i, arr in enumerate(self.arrays): # type: ignore[assignment] + arr = arr.astype(dtype, copy=copy) + result[:, i] = arr + + if na_value is not lib.no_default: + result[isna(result)] = na_value + + return result + # return arr.transpose() if transpose else arr + class SingleArrayManager(BaseArrayManager, SingleDataManager): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 61396fdf372d5..d87e77043a713 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -970,12 +970,7 @@ def setitem(self, indexer, value): values[indexer] = value elif is_ea_value: - # GH#38952 - if values.ndim == 1: - values[indexer] = value - else: - # TODO(EA2D): special case not needed with 2D EA - values[indexer] = value.to_numpy(values.dtype).reshape(-1, 1) + values[indexer] = value else: # error: Argument 1 to "setitem_datetimelike_compat" has incompatible type diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index bf81ac529d678..73f463997c085 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -284,53 +284,6 @@ def __repr__(self) -> str: output += f"\n{block}" return output - def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: - """ - Apply grouped reduction function blockwise, returning a new BlockManager. - - Parameters - ---------- - func : grouped reduction function - ignore_failures : bool, default False - Whether to drop blocks where func raises TypeError. - - Returns - ------- - BlockManager - """ - result_blocks: list[Block] = [] - - for blk in self.blocks: - if blk.is_object: - # split on object-dtype blocks bc some columns may raise - # while others do not. - for sb in blk._split(): - try: - applied = sb.apply(func) - except (TypeError, NotImplementedError): - if not ignore_failures: - raise - continue - result_blocks = extend_blocks(applied, result_blocks) - else: - try: - applied = blk.apply(func) - except (TypeError, NotImplementedError): - if not ignore_failures: - raise - continue - result_blocks = extend_blocks(applied, result_blocks) - - if len(result_blocks) == 0: - index = Index([None]) # placeholder - else: - index = Index(range(result_blocks[0].values.shape[-1])) - - if ignore_failures: - return self._combine(result_blocks, index=index) - - return type(self).from_blocks(result_blocks, [self.axes[0], index]) - def apply( self: T, f, @@ -631,144 +584,6 @@ def copy_func(ax): res.axes = new_axes return res - def as_array( - self, - transpose: bool = False, - dtype: Dtype | None = None, - copy: bool = False, - na_value=lib.no_default, - ) -> np.ndarray: - """ - Convert the blockmanager data into an numpy array. 
- - Parameters - ---------- - transpose : bool, default False - If True, transpose the return array. - dtype : object, default None - Data type of the return array. - copy : bool, default False - If True then guarantee that a copy is returned. A value of - False does not guarantee that the underlying data is not - copied. - na_value : object, default lib.no_default - Value to be used as the missing value sentinel. - - Returns - ------- - arr : ndarray - """ - if len(self.blocks) == 0: - arr = np.empty(self.shape, dtype=float) - return arr.transpose() if transpose else arr - - # We want to copy when na_value is provided to avoid - # mutating the original object - copy = copy or na_value is not lib.no_default - - if self.is_single_block: - blk = self.blocks[0] - if blk.is_extension: - # Avoid implicit conversion of extension blocks to object - - # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no - # attribute "to_numpy" - arr = blk.values.to_numpy( # type: ignore[union-attr] - dtype=dtype, na_value=na_value - ).reshape(blk.shape) - else: - arr = np.asarray(blk.get_values()) - if dtype: - # error: Argument 1 to "astype" of "_ArrayOrScalarCommon" has - # incompatible type "Union[ExtensionDtype, str, dtype[Any], - # Type[object]]"; expected "Union[dtype[Any], None, type, - # _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int, - # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" - arr = arr.astype(dtype, copy=False) # type: ignore[arg-type] - else: - arr = self._interleave(dtype=dtype, na_value=na_value) - # The underlying data was copied within _interleave - copy = False - - if copy: - arr = arr.copy() - - if na_value is not lib.no_default: - arr[isna(arr)] = na_value - - return arr.transpose() if transpose else arr - - def _interleave( - self, dtype: Dtype | None = None, na_value=lib.no_default - ) -> np.ndarray: - """ - Return ndarray from blocks with specified item order - Items must be contained in the blocks - """ - if not dtype: - dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) - - # TODO: https://github.com/pandas-dev/pandas/issues/22791 - # Give EAs some input on what happens here. Sparse needs this. 
- if isinstance(dtype, SparseDtype): - dtype = dtype.subtype - elif isinstance(dtype, ExtensionDtype): - dtype = "object" - elif is_dtype_equal(dtype, str): - dtype = "object" - - # error: Argument "dtype" to "empty" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object], None]"; expected - # "Union[dtype[Any], None, type, _SupportsDType, str, Union[Tuple[Any, int], - # Tuple[Any, Union[int, Sequence[int]]], List[Any], _DTypeDict, - # Tuple[Any, Any]]]" - result = np.empty(self.shape, dtype=dtype) # type: ignore[arg-type] - - itemmask = np.zeros(self.shape[0]) - - for blk in self.blocks: - rl = blk.mgr_locs - if blk.is_extension: - # Avoid implicit conversion of extension blocks to object - - # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no - # attribute "to_numpy" - arr = blk.values.to_numpy( # type: ignore[union-attr] - dtype=dtype, na_value=na_value - ) - else: - # error: Argument 1 to "get_values" of "Block" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object], None]"; expected - # "Union[dtype[Any], ExtensionDtype, None]" - arr = blk.get_values(dtype) # type: ignore[arg-type] - result[rl.indexer] = arr - itemmask[rl.indexer] = 1 - - if not itemmask.all(): - raise AssertionError("Some items were not contained in blocks") - - return result - - def to_dict(self, copy: bool = True): - """ - Return a dict of str(dtype) -> BlockManager - - Parameters - ---------- - copy : bool, default True - - Returns - ------- - values : a dict of dtype -> BlockManager - """ - - bd: dict[str, list[Block]] = {} - for b in self.blocks: - bd.setdefault(str(b.dtype), []).append(b) - - # TODO(EA2D): the combine will be unnecessary with 2D EAs - return {dtype: self._combine(blocks, copy=copy) for dtype, blocks in bd.items()} - def consolidate(self: T) -> T: """ Join together blocks having same dtype @@ -1375,6 +1190,53 @@ def idelete(self, indexer) -> BlockManager: # ---------------------------------------------------------------- # Block-wise Operation + def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: + """ + Apply grouped reduction function blockwise, returning a new BlockManager. + + Parameters + ---------- + func : grouped reduction function + ignore_failures : bool, default False + Whether to drop blocks where func raises TypeError. + + Returns + ------- + BlockManager + """ + result_blocks: list[Block] = [] + + for blk in self.blocks: + if blk.is_object: + # split on object-dtype blocks bc some columns may raise + # while others do not. 
+ for sb in blk._split(): + try: + applied = sb.apply(func) + except (TypeError, NotImplementedError): + if not ignore_failures: + raise + continue + result_blocks = extend_blocks(applied, result_blocks) + else: + try: + applied = blk.apply(func) + except (TypeError, NotImplementedError): + if not ignore_failures: + raise + continue + result_blocks = extend_blocks(applied, result_blocks) + + if len(result_blocks) == 0: + index = Index([None]) # placeholder + else: + index = Index(range(result_blocks[0].values.shape[-1])) + + if ignore_failures: + return self._combine(result_blocks, index=index) + + return type(self).from_blocks(result_blocks, [self.axes[0], index]) + def reduce( self: T, func: Callable, ignore_failures: bool = False ) -> tuple[T, np.ndarray]: @@ -1506,6 +1368,144 @@ def unstack(self, unstacker, fill_value) -> BlockManager: bm = BlockManager(new_blocks, [new_columns, new_index]) return bm + def to_dict(self, copy: bool = True): + """ + Return a dict of str(dtype) -> BlockManager + + Parameters + ---------- + copy : bool, default True + + Returns + ------- + values : a dict of dtype -> BlockManager + """ + + bd: dict[str, list[Block]] = {} + for b in self.blocks: + bd.setdefault(str(b.dtype), []).append(b) + + # TODO(EA2D): the combine will be unnecessary with 2D EAs + return {dtype: self._combine(blocks, copy=copy) for dtype, blocks in bd.items()} + + def as_array( + self, + transpose: bool = False, + dtype: Dtype | None = None, + copy: bool = False, + na_value=lib.no_default, + ) -> np.ndarray: + """ + Convert the blockmanager data into an numpy array. + + Parameters + ---------- + transpose : bool, default False + If True, transpose the return array. + dtype : object, default None + Data type of the return array. + copy : bool, default False + If True then guarantee that a copy is returned. A value of + False does not guarantee that the underlying data is not + copied. + na_value : object, default lib.no_default + Value to be used as the missing value sentinel. 
+ + Returns + ------- + arr : ndarray + """ + if len(self.blocks) == 0: + arr = np.empty(self.shape, dtype=float) + return arr.transpose() if transpose else arr + + # We want to copy when na_value is provided to avoid + # mutating the original object + copy = copy or na_value is not lib.no_default + + if self.is_single_block: + blk = self.blocks[0] + if blk.is_extension: + # Avoid implicit conversion of extension blocks to object + + # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no + # attribute "to_numpy" + arr = blk.values.to_numpy( # type: ignore[union-attr] + dtype=dtype, na_value=na_value + ).reshape(blk.shape) + else: + arr = np.asarray(blk.get_values()) + if dtype: + # error: Argument 1 to "astype" of "_ArrayOrScalarCommon" has + # incompatible type "Union[ExtensionDtype, str, dtype[Any], + # Type[object]]"; expected "Union[dtype[Any], None, type, + # _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int, + # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" + arr = arr.astype(dtype, copy=False) # type: ignore[arg-type] + else: + arr = self._interleave(dtype=dtype, na_value=na_value) + # The underlying data was copied within _interleave + copy = False + + if copy: + arr = arr.copy() + + if na_value is not lib.no_default: + arr[isna(arr)] = na_value + + return arr.transpose() if transpose else arr + + def _interleave( + self, dtype: Dtype | None = None, na_value=lib.no_default + ) -> np.ndarray: + """ + Return ndarray from blocks with specified item order + Items must be contained in the blocks + """ + if not dtype: + dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) + + # TODO: https://github.com/pandas-dev/pandas/issues/22791 + # Give EAs some input on what happens here. Sparse needs this. 
+ if isinstance(dtype, SparseDtype): + dtype = dtype.subtype + elif isinstance(dtype, ExtensionDtype): + dtype = np.dtype("object") + elif is_dtype_equal(dtype, str): + dtype = np.dtype("object") + + # error: Argument "dtype" to "empty" has incompatible type + # "Union[ExtensionDtype, str, dtype[Any], Type[object], None]"; expected + # "Union[dtype[Any], None, type, _SupportsDType, str, Union[Tuple[Any, int], + # Tuple[Any, Union[int, Sequence[int]]], List[Any], _DTypeDict, + # Tuple[Any, Any]]]" + result = np.empty(self.shape, dtype=dtype) # type: ignore[arg-type] + + itemmask = np.zeros(self.shape[0]) + + for blk in self.blocks: + rl = blk.mgr_locs + if blk.is_extension: + # Avoid implicit conversion of extension blocks to object + + # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no + # attribute "to_numpy" + arr = blk.values.to_numpy( # type: ignore[union-attr] + dtype=dtype, na_value=na_value + ) + else: + # error: Argument 1 to "get_values" of "Block" has incompatible type + # "Union[ExtensionDtype, str, dtype[Any], Type[object], None]"; expected + # "Union[dtype[Any], ExtensionDtype, None]" + arr = blk.get_values(dtype) # type: ignore[arg-type] + result[rl.indexer] = arr + itemmask[rl.indexer] = 1 + + if not itemmask.all(): + raise AssertionError("Some items were not contained in blocks") + + return result + class SingleBlockManager(BaseBlockManager, SingleDataManager): """ manage a single block with """ diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 92618605e47cc..19fd48a772493 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1598,7 +1598,7 @@ def _ensure_numeric(x): elif not (is_float(x) or is_integer(x) or is_complex(x)): try: x = float(x) - except ValueError: + except (TypeError, ValueError): # e.g. "1+1j" or "foo" try: x = complex(x) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 14d065de7f77f..aae6314968695 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -6,6 +6,7 @@ from typing import ( TYPE_CHECKING, Callable, + Hashable, no_type_check, ) @@ -101,8 +102,8 @@ class Resampler(BaseGroupBy, PandasObject): Parameters ---------- - obj : pandas object - groupby : a TimeGrouper object + obj : Series or DataFrame + groupby : TimeGrouper axis : int, default 0 kind : str or None 'period', 'timestamp' to override default index treatment @@ -116,10 +117,8 @@ class Resampler(BaseGroupBy, PandasObject): After resampling, see aggregate, apply, and transform functions. 
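(Illustrative sketch, not part of the patch: the Resampler typing changes in this file — `obj`/`groupby` documented as Series-or-DataFrame and TimeGrouper, eager binner/grouper construction below — do not alter user-facing resampling; a typical call, assuming standard pandas reprs, still looks like this.)

>>> import pandas as pd
>>> ser = pd.Series(range(4), index=pd.date_range("2021-01-01", periods=4, freq="D"))
>>> ser.resample("2D").mean()
2021-01-01    0.5
2021-01-03    2.5
Freq: 2D, dtype: float64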
""" - # error: Incompatible types in assignment (expression has type - # "Optional[BinGrouper]", base class "BaseGroupBy" defined the type as - # "BaseGrouper") - grouper: BinGrouper | None # type: ignore[assignment] + grouper: BinGrouper + exclusions: frozenset[Hashable] = frozenset() # for SelectionMixin compat # to the groupby descriptor _attributes = [ @@ -134,7 +133,14 @@ class Resampler(BaseGroupBy, PandasObject): "offset", ] - def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): + def __init__( + self, + obj: FrameOrSeries, + groupby: TimeGrouper, + axis: int = 0, + kind=None, + **kwargs, + ): self.groupby = groupby self.keys = None self.sort = True @@ -143,12 +149,9 @@ def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): self.squeeze = False self.group_keys = True self.as_index = True - self.exclusions = set() - self.binner = None - self.grouper = None - if self.groupby is not None: - self.groupby._set_grouper(self._convert_obj(obj), sort=True) + self.groupby._set_grouper(self._convert_obj(obj), sort=True) + self.binner, self.grouper = self._get_binner() @final def _shallow_copy(self, obj, **kwargs): @@ -183,39 +186,17 @@ def __getattr__(self, attr: str): return object.__getattribute__(self, attr) - def __iter__(self): - """ - Resampler iterator. - - Returns - ------- - Generator yielding sequence of (name, subsetted object) - for each group. - - See Also - -------- - GroupBy.__iter__ : Generator yielding sequence for each group. - """ - self._set_binner() - return super().__iter__() - + # error: Signature of "obj" incompatible with supertype "BaseGroupBy" @property - def obj(self): - return self.groupby.obj + def obj(self) -> FrameOrSeries: # type: ignore[override] + # error: Incompatible return value type (got "Optional[Any]", + # expected "FrameOrSeries") + return self.groupby.obj # type: ignore[return-value] @property def ax(self): return self.groupby.ax - @property - def _typ(self) -> str: - """ - Masquerade for compat as a Series or a DataFrame. - """ - if isinstance(self._selected_obj, ABCSeries): - return "series" - return "dataframe" - @property def _from_selection(self) -> bool: """ @@ -227,32 +208,24 @@ def _from_selection(self) -> bool: self.groupby.key is not None or self.groupby.level is not None ) - def _convert_obj(self, obj): + def _convert_obj(self, obj: FrameOrSeries) -> FrameOrSeries: """ Provide any conversions for the object in order to correctly handle. Parameters ---------- - obj : the object to be resampled + obj : Series or DataFrame Returns ------- - obj : converted object + Series or DataFrame """ return obj._consolidate() def _get_binner_for_time(self): raise AbstractMethodError(self) - def _set_binner(self): - """ - Setup our binners. - - Cache these as we are an immutable object - """ - if self.binner is None: - self.binner, self.grouper = self._get_binner() - + @final def _get_binner(self): """ Create the BinGrouper, assume that self.set_grouper(obj) @@ -263,12 +236,6 @@ def _get_binner(self): bin_grouper = BinGrouper(bins, binlabels, indexer=self.groupby.indexer) return binner, bin_grouper - def _assure_grouper(self): - """ - Make sure that we are creating our binner & grouper. 
- """ - self._set_binner() - @Substitution( klass="Resampler", examples=""" @@ -358,7 +325,6 @@ def pipe( ) def aggregate(self, func, *args, **kwargs): - self._set_binner() result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() if result is None: how = func @@ -409,7 +375,6 @@ def _gotitem(self, key, ndim: int, subset=None): subset : object, default None subset to act on """ - self._set_binner() grouper = self.grouper if subset is None: subset = self.obj @@ -426,7 +391,6 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): Re-evaluate the obj with a groupby aggregation. """ if grouper is None: - self._set_binner() grouper = self.grouper obj = self._selected_obj @@ -439,21 +403,23 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): result = grouped._aggregate_item_by_item(how, *args, **kwargs) else: result = grouped.aggregate(how, *args, **kwargs) - except (DataError, AttributeError, KeyError): + except DataError: + # got TypeErrors on aggregation + result = grouped.apply(how, *args, **kwargs) + except (AttributeError, KeyError): # we have a non-reducing function; try to evaluate # alternatively we want to evaluate only a column of the input + + # test_apply_to_one_column_of_df the function being applied references + # a DataFrame column, but aggregate_item_by_item operates column-wise + # on Series, raising AttributeError or KeyError + # (depending on whether the column lookup uses getattr/__getitem__) result = grouped.apply(how, *args, **kwargs) + except ValueError as err: if "Must produce aggregated value" in str(err): # raised in _aggregate_named - pass - elif "len(index) != len(labels)" in str(err): - # raised in libgroupby validation - pass - elif "No objects to concatenate" in str(err): - # raised in concat call - # In tests this is reached via either - # _apply_to_column_groupbys (ohlc) or DataFrameGroupBy.nunique + # see test_apply_without_aggregation, test_apply_with_mutated_index pass else: raise @@ -1046,11 +1012,9 @@ class _GroupByMixin(PandasObject): _attributes: list[str] # in practice the same as Resampler._attributes - def __init__(self, obj, **kwargs): + def __init__(self, obj, parent=None, groupby=None, **kwargs): # reached via ._gotitem and _get_resampler_for_grouping - parent = kwargs.pop("parent", None) - groupby = kwargs.pop("groupby", None) if parent is None: parent = obj @@ -1059,8 +1023,8 @@ def __init__(self, obj, **kwargs): for attr in self._attributes: setattr(self, attr, kwargs.get(attr, getattr(parent, attr))) - # error: Too many arguments for "__init__" of "object" - super().__init__(None) # type: ignore[call-arg] + self.binner = parent.binner + self._groupby = groupby self._groupby.mutated = True self._groupby.grouper.mutated = True @@ -1146,7 +1110,6 @@ def _downsample(self, how, **kwargs): how : string / cython mapped function **kwargs : kw args passed to how function """ - self._set_binner() how = com.get_cython_func(how) or how ax = self.ax obj = self._selected_obj @@ -1163,7 +1126,7 @@ def _downsample(self, how, **kwargs): # error: Item "None" of "Optional[Any]" has no attribute "binlabels" if ( (ax.freq is not None or ax.inferred_freq is not None) - and len(self.grouper.binlabels) > len(ax) # type: ignore[union-attr] + and len(self.grouper.binlabels) > len(ax) and how is None ): @@ -1205,7 +1168,6 @@ def _upsample(self, method, limit=None, fill_value=None): .fillna: Fill NA/NaN values using the specified method. 
""" - self._set_binner() if self.axis: raise AssertionError("axis must be 0") if self._from_selection: @@ -1266,7 +1228,7 @@ def _get_binner_for_time(self): return super()._get_binner_for_time() return self.groupby._get_period_bins(self.ax) - def _convert_obj(self, obj): + def _convert_obj(self, obj: FrameOrSeries) -> FrameOrSeries: obj = super()._convert_obj(obj) if self._from_selection: @@ -1345,7 +1307,6 @@ def _upsample(self, method, limit=None, fill_value=None): if self.kind == "timestamp": return super()._upsample(method, limit=limit, fill_value=fill_value) - self._set_binner() ax = self.ax obj = self.obj new_index = self.binner @@ -1358,9 +1319,7 @@ def _upsample(self, method, limit=None, fill_value=None): new_obj = _take_new_index( obj, indexer, - # error: Argument 3 to "_take_new_index" has incompatible type - # "Optional[Any]"; expected "Index" - new_index, # type: ignore[arg-type] + new_index, axis=self.axis, ) return self._wrap_result(new_obj) @@ -1416,15 +1375,13 @@ def get_resampler(obj, kind=None, **kwds): def get_resampler_for_grouping( - groupby, rule, how=None, fill_method=None, limit=None, kind=None, **kwargs + groupby, rule, how=None, fill_method=None, limit=None, kind=None, on=None, **kwargs ): """ Return our appropriate resampler when grouping as well. """ # .resample uses 'on' similar to how .groupby uses 'key' - kwargs["key"] = kwargs.pop("on", None) - - tg = TimeGrouper(freq=rule, **kwargs) + tg = TimeGrouper(freq=rule, key=on, **kwargs) resampler = tg._get_resampler(groupby.obj, kind=kind) return resampler._get_resampler_for_grouping(groupby=groupby) @@ -1522,20 +1479,20 @@ def __init__( else: try: self.origin = Timestamp(origin) - except Exception as e: + except (ValueError, TypeError) as err: raise ValueError( "'origin' should be equal to 'epoch', 'start', 'start_day', " "'end', 'end_day' or " f"should be a Timestamp convertible type. Got '{origin}' instead." - ) from e + ) from err try: self.offset = Timedelta(offset) if offset is not None else None - except Exception as e: + except (ValueError, TypeError) as err: raise ValueError( "'offset' should be a Timedelta convertible type. " f"Got '{offset}' instead." 
- ) from e + ) from err # always sort time groupers kwargs["sort"] = True @@ -1596,10 +1553,9 @@ def _get_resampler(self, obj, kind=None): def _get_grouper(self, obj, validate: bool = True): # create the resampler and return our binner r = self._get_resampler(obj) - r._set_binner() return r.binner, r.grouper, r.obj - def _get_time_bins(self, ax): + def _get_time_bins(self, ax: DatetimeIndex): if not isinstance(ax, DatetimeIndex): raise TypeError( "axis must be a DatetimeIndex, but got " @@ -1778,7 +1734,8 @@ def _get_period_bins(self, ax: PeriodIndex): # Get offset for bin edge (not label edge) adjustment start_offset = Period(start, self.freq) - Period(p_start, self.freq) - bin_shift = start_offset.n % freq_mult + # error: Item "Period" of "Union[Period, Any]" has no attribute "n" + bin_shift = start_offset.n % freq_mult # type: ignore[union-attr] start = p_start labels = binner = period_range( @@ -1947,17 +1904,17 @@ def _get_period_range_edges( raise TypeError("'first' and 'last' must be instances of type Period") # GH 23882 - first = first.to_timestamp() - last = last.to_timestamp() - adjust_first = not freq.is_on_offset(first) - adjust_last = freq.is_on_offset(last) + first_ts = first.to_timestamp() + last_ts = last.to_timestamp() + adjust_first = not freq.is_on_offset(first_ts) + adjust_last = freq.is_on_offset(last_ts) - first, last = _get_timestamp_range_edges( - first, last, freq, closed=closed, origin=origin, offset=offset + first_ts, last_ts = _get_timestamp_range_edges( + first_ts, last_ts, freq, closed=closed, origin=origin, offset=offset ) - first = (first + int(adjust_first) * freq).to_period(freq) - last = (last - int(adjust_last) * freq).to_period(freq) + first = (first_ts + int(adjust_first) * freq).to_period(freq) + last = (last_ts - int(adjust_last) * freq).to_period(freq) return first, last @@ -1975,13 +1932,13 @@ def _insert_nat_bin( def _adjust_dates_anchored( - first, - last, - freq, + first: Timestamp, + last: Timestamp, + freq: Tick, closed: Literal["right", "left"] = "right", origin="start_day", offset: Timedelta | None = None, -): +) -> tuple[Timestamp, Timestamp]: # First and last offsets should be calculated from the start day to fix an # error cause by resampling across multiple days when a one day period is # not a multiple of the frequency. 
See GH 8683 diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 13e977fc1b33c..a2616c49801fa 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2140,6 +2140,7 @@ def _factorize_keys( # "_values_for_factorize" rk, _ = rk._values_for_factorize() # type: ignore[union-attr,assignment] + klass: type[libhashtable.Factorizer] | type[libhashtable.Int64Factorizer] if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype): # GH#23917 TODO: needs tests for case where lk is integer-dtype # and rk is datetime-dtype diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 1a4d8dbe2885e..037fe5366255a 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -131,19 +131,23 @@ def __init__(self, index: MultiIndex, level=-1, constructor=None): self._make_selectors() @cache_readonly - def _indexer_and_to_sort(self): + def _indexer_and_to_sort( + self, + ) -> tuple[ + np.ndarray, # np.ndarray[np.intp] + list[np.ndarray], # each has _some_ signed integer dtype + ]: v = self.level codes = list(self.index.codes) levs = list(self.index.levels) to_sort = codes[:v] + codes[v + 1 :] + [codes[v]] - sizes = [len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]] + sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]) comp_index, obs_ids = get_compressed_ids(to_sort, sizes) ngroups = len(obs_ids) indexer = get_group_index_sorter(comp_index, ngroups) - indexer = ensure_platform_int(indexer) return indexer, to_sort @cache_readonly @@ -162,7 +166,7 @@ def _make_selectors(self): # make the mask remaining_labels = self.sorted_labels[:-1] - level_sizes = [len(x) for x in new_levels] + level_sizes = tuple(len(x) for x in new_levels) comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes) ngroups = len(obs_ids) @@ -349,7 +353,7 @@ def _unstack_multiple(data, clocs, fill_value=None): rcodes = [index.codes[i] for i in rlocs] rnames = [index.names[i] for i in rlocs] - shape = [len(x) for x in clevels] + shape = tuple(len(x) for x in clevels) group_index = get_group_index(ccodes, shape, sort=False, xnull=False) comp_ids, obs_ids = compress_group_index(group_index, sort=False) diff --git a/pandas/core/series.py b/pandas/core/series.py index 195759d836f3b..f214127410170 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -113,15 +113,15 @@ from pandas.core.indexes.accessors import CombinedDatetimelikeProperties from pandas.core.indexes.api import ( CategoricalIndex, + DatetimeIndex, Float64Index, Index, MultiIndex, + PeriodIndex, + TimedeltaIndex, ensure_index, ) import pandas.core.indexes.base as ibase -from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.core.indexes.period import PeriodIndex -from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexing import check_bool_indexer from pandas.core.internals import ( SingleArrayManager, @@ -3087,7 +3087,7 @@ def combine(self, other, func, fill_value=None) -> Series: new_values[:] = [func(lv, other) for lv in self._values] new_name = self.name - # try_float=False is to match _aggregate_series_pure_python + # try_float=False is to match agg_series npvalues = lib.maybe_convert_objects(new_values, try_float=False) res_values = maybe_cast_pointwise_result(npvalues, self.dtype, same_dtype=False) return self._constructor(res_values, index=new_index, name=new_name) @@ -4191,7 +4191,7 @@ def apply( Notes ----- Functions that mutate the passed object can produce unexpected - behavior or errors and are 
not supported. See :ref:`udf-mutation` + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` for more details. Examples diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index a4ee4bb636450..a3fa24c7ee1e0 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -42,7 +42,7 @@ `agg` is an alias for `aggregate`. Use the alias. Functions that mutate the passed object can produce unexpected -behavior or errors and are not supported. See :ref:`udf-mutation` +behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` for more details. A passed user-defined-function will be passed a Series for evaluation. @@ -303,7 +303,7 @@ Notes ----- Functions that mutate the passed object can produce unexpected -behavior or errors and are not supported. See :ref:`udf-mutation` +behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` for more details. Examples diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index f5cd390f077a6..f6c1afbde0bd9 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -18,7 +18,10 @@ lib, ) from pandas._libs.hashtable import unique_label_indices -from pandas._typing import IndexKeyFunc +from pandas._typing import ( + IndexKeyFunc, + Shape, +) from pandas.core.dtypes.common import ( ensure_int64, @@ -93,7 +96,7 @@ def get_indexer_indexer( return indexer -def get_group_index(labels, shape, sort: bool, xnull: bool): +def get_group_index(labels, shape: Shape, sort: bool, xnull: bool): """ For the particular label_list, gets the offsets into the hypothetical list representing the totally ordered cartesian product of all possible label @@ -108,7 +111,7 @@ def get_group_index(labels, shape, sort: bool, xnull: bool): ---------- labels : sequence of arrays Integers identifying levels at each location - shape : sequence of ints + shape : tuple[int, ...] Number of unique levels at each location sort : bool If the ranks of returned ids should match lexical ranks of labels @@ -134,33 +137,36 @@ def _int64_cut_off(shape) -> int: return i return len(shape) - def maybe_lift(lab, size): + def maybe_lift(lab, size) -> tuple[np.ndarray, int]: # promote nan values (assigned -1 label in lab array) # so that all output values are non-negative return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) - labels = map(ensure_int64, labels) + labels = [ensure_int64(x) for x in labels] + lshape = list(shape) if not xnull: - labels, shape = map(list, zip(*map(maybe_lift, labels, shape))) + for i, (lab, size) in enumerate(zip(labels, shape)): + lab, size = maybe_lift(lab, size) + labels[i] = lab + lshape[i] = size labels = list(labels) - shape = list(shape) # Iteratively process all the labels in chunks sized so less # than _INT64_MAX unique int ids will be required for each chunk while True: # how many levels can be done without overflow: - nlev = _int64_cut_off(shape) + nlev = _int64_cut_off(lshape) # compute flat ids for the first `nlev` levels - stride = np.prod(shape[1:nlev], dtype="i8") + stride = np.prod(lshape[1:nlev], dtype="i8") out = stride * labels[0].astype("i8", subok=False, copy=False) for i in range(1, nlev): - if shape[i] == 0: - stride = 0 + if lshape[i] == 0: + stride = np.int64(0) else: - stride //= shape[i] + stride //= lshape[i] out += labels[i] * stride if xnull: # exclude nulls @@ -169,7 +175,7 @@ def maybe_lift(lab, size): mask |= lab == -1 out[mask] = -1 - if nlev == len(shape): # all levels done! + if nlev == len(lshape): # all levels done! 
break # compress what has been done so far in order to avoid overflow @@ -177,12 +183,12 @@ def maybe_lift(lab, size): comp_ids, obs_ids = compress_group_index(out, sort=sort) labels = [comp_ids] + labels[nlev:] - shape = [len(obs_ids)] + shape[nlev:] + lshape = [len(obs_ids)] + lshape[nlev:] return out -def get_compressed_ids(labels, sizes) -> tuple[np.ndarray, np.ndarray]: +def get_compressed_ids(labels, sizes: Shape) -> tuple[np.ndarray, np.ndarray]: """ Group_index is offsets into cartesian product of all possible labels. This space can be huge, so this function compresses it, by computing offsets @@ -191,7 +197,7 @@ def get_compressed_ids(labels, sizes) -> tuple[np.ndarray, np.ndarray]: Parameters ---------- labels : list of label arrays - sizes : list of size of the levels + sizes : tuple[int] of size of the levels Returns ------- @@ -252,12 +258,11 @@ def decons_obs_group_ids(comp_ids: np.ndarray, obs_ids, shape, labels, xnull: bo return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)] # TODO: unique_label_indices only used here, should take ndarray[np.intp] - i = unique_label_indices(ensure_int64(comp_ids)) - i8copy = lambda a: a.astype("i8", subok=False, copy=True) - return [i8copy(lab[i]) for lab in labels] + indexer = unique_label_indices(ensure_int64(comp_ids)) + return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels] -def indexer_from_factorized(labels, shape, compress: bool = True) -> np.ndarray: +def indexer_from_factorized(labels, shape: Shape, compress: bool = True) -> np.ndarray: # returned ndarray is np.intp ids = get_group_index(labels, shape, sort=True, xnull=False) @@ -334,7 +339,7 @@ def lexsort_indexer( shape.append(n) labels.append(codes) - return indexer_from_factorized(labels, shape) + return indexer_from_factorized(labels, tuple(shape)) def nargsort( @@ -576,7 +581,7 @@ def get_indexer_dict( """ shape = [len(x) for x in keys] - group_index = get_group_index(label_list, shape, sort=True, xnull=True) + group_index = get_group_index(label_list, tuple(shape), sort=True, xnull=True) if np.all(group_index == -1): # Short-circuit, lib.indices_fast will return the same return {} diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 85a58d3d99795..f8df05a7022d1 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -19,6 +19,7 @@ is_categorical_dtype, is_integer, is_list_like, + is_re, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -154,11 +155,10 @@ class StringMethods(NoNewAttributesMixin): def __init__(self, data): from pandas.core.arrays.string_ import StringDtype - from pandas.core.arrays.string_arrow import ArrowStringDtype self._inferred_dtype = self._validate(data) self._is_categorical = is_categorical_dtype(data.dtype) - self._is_string = isinstance(data.dtype, (StringDtype, ArrowStringDtype)) + self._is_string = isinstance(data.dtype, StringDtype) self._data = data self._index = self._name = None @@ -195,8 +195,6 @@ def _validate(data): ------- dtype : inferred dtype of data """ - from pandas import StringDtype - if isinstance(data, ABCMultiIndex): raise AttributeError( "Can only use .str accessor with Index, not MultiIndex" @@ -208,10 +206,6 @@ def _validate(data): values = getattr(data, "values", data) # Series / Index values = getattr(values, "categories", values) # categorical / normal - # explicitly allow StringDtype - if isinstance(values.dtype, StringDtype): - return "string" - inferred_dtype = lib.infer_dtype(values, skipna=True) if 
inferred_dtype not in allowed_types: @@ -1132,6 +1126,14 @@ def contains(self, pat, case=True, flags=0, na=None, regex=True): 4 False dtype: bool """ + if regex and re.compile(pat).groups: + warnings.warn( + "This pattern has match groups. To actually get the " + "groups, use str.extract.", + UserWarning, + stacklevel=3, + ) + result = self._data.array._str_contains(pat, case, flags, na, regex) return self._wrap_result(result, fill_value=na, returns_string=False) @@ -1333,6 +1335,29 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None): ) warnings.warn(msg, FutureWarning, stacklevel=3) regex = True + + # Check whether repl is valid (GH 13438, GH 15055) + if not (isinstance(repl, str) or callable(repl)): + raise TypeError("repl must be a string or callable") + + is_compiled_re = is_re(pat) + if regex: + if is_compiled_re: + if (case is not None) or (flags != 0): + raise ValueError( + "case and flags cannot be set when pat is a compiled regex" + ) + elif case is None: + # not a compiled regex, set default case + case = True + + elif is_compiled_re: + raise ValueError( + "Cannot use a compiled regex as replacement pattern with regex=False" + ) + elif callable(repl): + raise ValueError("Cannot use a callable replacement when regex=False") + result = self._data.array._str_replace( pat, repl, n=n, case=case, flags=flags, regex=regex ) @@ -3002,9 +3027,8 @@ def _result_dtype(arr): # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails # when the list of values is empty. from pandas.core.arrays.string_ import StringDtype - from pandas.core.arrays.string_arrow import ArrowStringDtype - if isinstance(arr.dtype, (StringDtype, ArrowStringDtype)): + if isinstance(arr.dtype, StringDtype): return arr.dtype.name else: return object diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index b8033668aa18f..a77f8861a7c02 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -61,11 +61,7 @@ def _str_repeat(self, repeats): @abc.abstractmethod def _str_match( - self, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = np.nan, + self, pat: str, case: bool = True, flags: int = 0, na: Scalar = np.nan ): pass diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index b794690ccc5af..869eabc76b555 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -7,7 +7,6 @@ Union, ) import unicodedata -import warnings import numpy as np @@ -115,22 +114,14 @@ def _str_pad(self, width, side="left", fillchar=" "): raise ValueError("Invalid side") return self._str_map(f) - def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True): + def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True): if regex: if not case: flags |= re.IGNORECASE - regex = re.compile(pat, flags=flags) + pat = re.compile(pat, flags=flags) - if regex.groups > 0: - warnings.warn( - "This pattern has match groups. 
To actually get the " - "groups, use str.extract.", - UserWarning, - stacklevel=3, - ) - - f = lambda x: regex.search(x) is not None + f = lambda x: pat.search(x) is not None else: if case: f = lambda x: pat in x @@ -147,41 +138,20 @@ def _str_endswith(self, pat, na=None): f = lambda x: x.endswith(pat) return self._str_map(f, na_value=na, dtype=np.dtype(bool)) - def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): - # Check whether repl is valid (GH 13438, GH 15055) - if not (isinstance(repl, str) or callable(repl)): - raise TypeError("repl must be a string or callable") - + def _str_replace(self, pat, repl, n=-1, case: bool = True, flags=0, regex=True): is_compiled_re = is_re(pat) - if regex: - if is_compiled_re: - if (case is not None) or (flags != 0): - raise ValueError( - "case and flags cannot be set when pat is a compiled regex" - ) - else: - # not a compiled regex - # set default case - if case is None: - case = True - - # add case flag, if provided - if case is False: - flags |= re.IGNORECASE - if is_compiled_re or len(pat) > 1 or flags or callable(repl): - n = n if n >= 0 else 0 - compiled = re.compile(pat, flags=flags) - f = lambda x: compiled.sub(repl=repl, string=x, count=n) - else: - f = lambda x: x.replace(pat, repl, n) + + if case is False: + # add case flag, if provided + flags |= re.IGNORECASE + + if regex and (is_compiled_re or len(pat) > 1 or flags or callable(repl)): + if not is_compiled_re: + pat = re.compile(pat, flags=flags) + + n = n if n >= 0 else 0 + f = lambda x: pat.sub(repl=repl, string=x, count=n) else: - if is_compiled_re: - raise ValueError( - "Cannot use a compiled regex as replacement pattern with " - "regex=False" - ) - if callable(repl): - raise ValueError("Cannot use a callable replacement when regex=False") f = lambda x: x.replace(pat, repl, n) return self._str_map(f, dtype=str) @@ -216,11 +186,7 @@ def rep(x, r): return result def _str_match( - self, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = None, + self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None ): if not case: flags |= re.IGNORECASE diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index b7116ee95949b..6f5e8ab900dfd 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -180,7 +180,7 @@ def to_numeric(arg, errors="raise", downcast=None): values = ensure_object(values) coerce_numeric = errors not in ("ignore", "raise") try: - values = lib.maybe_convert_numeric( + values, _ = lib.maybe_convert_numeric( values, set(), coerce_numeric=coerce_numeric ) except (ValueError, TypeError): diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 4a210d8b47e9b..08a65964f278e 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -29,16 +29,18 @@ args_compat, create_section_header, kwargs_compat, + numba_notes, template_header, template_returns, template_see_also, + window_agg_numba_parameters, ) from pandas.core.window.indexers import ( BaseIndexer, ExponentialMovingWindowIndexer, GroupbyIndexer, ) -from pandas.core.window.numba_ import generate_numba_groupby_ewma_func +from pandas.core.window.numba_ import generate_numba_ewma_func from pandas.core.window.rolling import ( BaseWindow, BaseWindowGroupby, @@ -372,26 +374,41 @@ def aggregate(self, func, *args, **kwargs): template_header, create_section_header("Parameters"), args_compat, + window_agg_numba_parameters, kwargs_compat, create_section_header("Returns"), template_returns, 
create_section_header("See Also"), - template_see_also[:-1], + template_see_also, + create_section_header("Notes"), + numba_notes.replace("\n", "", 1), window_method="ewm", aggregation_description="(exponential weighted moment) mean", agg_method="mean", ) - def mean(self, *args, **kwargs): - nv.validate_window_func("mean", args, kwargs) - window_func = window_aggregations.ewma - window_func = partial( - window_func, - com=self._com, - adjust=self.adjust, - ignore_na=self.ignore_na, - deltas=self._deltas, - ) - return self._apply(window_func) + def mean(self, *args, engine=None, engine_kwargs=None, **kwargs): + if maybe_use_numba(engine): + ewma_func = generate_numba_ewma_func( + engine_kwargs, self._com, self.adjust, self.ignore_na, self._deltas + ) + return self._apply( + ewma_func, + numba_cache_key=(lambda x: x, "ewma"), + ) + elif engine in ("cython", None): + if engine_kwargs is not None: + raise ValueError("cython engine does not accept engine_kwargs") + nv.validate_window_func("mean", args, kwargs) + window_func = partial( + window_aggregations.ewma, + com=self._com, + adjust=self.adjust, + ignore_na=self.ignore_na, + deltas=self._deltas, + ) + return self._apply(window_func) + else: + raise ValueError("engine must be either 'numba' or 'cython'") @doc( template_header, @@ -635,45 +652,3 @@ def _get_window_indexer(self) -> GroupbyIndexer: window_indexer=ExponentialMovingWindowIndexer, ) return window_indexer - - def mean(self, engine=None, engine_kwargs=None): - """ - Parameters - ---------- - engine : str, default None - * ``'cython'`` : Runs mean through C-extensions from cython. - * ``'numba'`` : Runs mean through JIT compiled code from numba. - Only available when ``raw`` is set to ``True``. - * ``None`` : Defaults to ``'cython'`` or globally setting - ``compute.use_numba`` - - .. versionadded:: 1.2.0 - - engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{'nopython': True, 'nogil': False, 'parallel': False}``. - - .. versionadded:: 1.2.0 - - Returns - ------- - Series or DataFrame - Return type is determined by the caller. - """ - if maybe_use_numba(engine): - groupby_ewma_func = generate_numba_groupby_ewma_func( - engine_kwargs, self._com, self.adjust, self.ignore_na, self._deltas - ) - return self._apply( - groupby_ewma_func, - numba_cache_key=(lambda x: x, "groupby_ewma"), - ) - elif engine in ("cython", None): - if engine_kwargs is not None: - raise ValueError("cython engine does not accept engine_kwargs") - return super().mean() - else: - raise ValueError("engine must be either 'numba' or 'cython'") diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index d84dea7ee622c..9407efd0bef2b 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -80,7 +80,7 @@ def roll_apply( return roll_apply -def generate_numba_groupby_ewma_func( +def generate_numba_ewma_func( engine_kwargs: Optional[Dict[str, bool]], com: float, adjust: bool, @@ -88,7 +88,7 @@ def generate_numba_groupby_ewma_func( deltas: np.ndarray, ): """ - Generate a numba jitted groupby ewma function specified by values + Generate a numba jitted ewma function specified by values from engine_kwargs. 
Parameters @@ -106,14 +106,14 @@ def generate_numba_groupby_ewma_func( """ nopython, nogil, parallel = get_jit_arguments(engine_kwargs) - cache_key = (lambda x: x, "groupby_ewma") + cache_key = (lambda x: x, "ewma") if cache_key in NUMBA_FUNC_CACHE: return NUMBA_FUNC_CACHE[cache_key] numba = import_optional_dependency("numba") @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def groupby_ewma( + def ewma( values: np.ndarray, begin: np.ndarray, end: np.ndarray, @@ -121,15 +121,15 @@ def groupby_ewma( ) -> np.ndarray: result = np.empty(len(values)) alpha = 1.0 / (1.0 + com) + old_wt_factor = 1.0 - alpha + new_wt = 1.0 if adjust else alpha + for i in numba.prange(len(begin)): start = begin[i] stop = end[i] window = values[start:stop] sub_result = np.empty(len(window)) - old_wt_factor = 1.0 - alpha - new_wt = 1.0 if adjust else alpha - weighted_avg = window[0] nobs = int(not np.isnan(weighted_avg)) sub_result[0] = weighted_avg if nobs >= minimum_periods else np.nan @@ -166,7 +166,7 @@ def groupby_ewma( return result - return groupby_ewma + return ewma def generate_numba_table_func( diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 7c867487e92b0..b51875134c614 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -101,6 +101,7 @@ DataFrame, Series, ) + from pandas.core.groupby.ops import BaseGrouper from pandas.core.internals import Block # noqa:F401 @@ -381,10 +382,11 @@ def _apply_series( """ obj = self._create_data(self._selected_obj) - try: + if name == "count": # GH 12541: Special case for count where we support date-like types - input = obj.values if name != "count" else notna(obj.values).astype(int) - values = self._prep_values(input) + obj = notna(obj).astype(int) + try: + values = self._prep_values(obj._values) except (TypeError, NotImplementedError) as err: raise DataError("No numeric types to aggregate") from err @@ -537,18 +539,22 @@ class BaseWindowGroupby(BaseWindow): Provide the groupby windowing facilities. """ + _grouper: BaseGrouper + _as_index: bool _attributes = ["_grouper"] def __init__( self, obj: FrameOrSeries, *args, - _grouper=None, - _as_index=True, + _grouper: BaseGrouper, + _as_index: bool = True, **kwargs, ): - if _grouper is None: - raise ValueError("Must pass a Grouper object.") + from pandas.core.groupby.ops import BaseGrouper + + if not isinstance(_grouper, BaseGrouper): + raise ValueError("Must pass a BaseGrouper object.") self._grouper = _grouper self._as_index = _as_index # GH 32262: It's convention to keep the grouping column in @@ -658,7 +664,9 @@ def _apply_pairwise( # When we evaluate the pairwise=True result, repeat the groupby # labels by the number of columns in the original object groupby_codes = self._grouper.codes - groupby_levels = self._grouper.levels + # error: Incompatible types in assignment (expression has type + # "List[Index]", variable has type "List[Union[ndarray, Index]]") + groupby_levels = self._grouper.levels # type: ignore[assignment] group_indices = self._grouper.indices.values() if group_indices: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 3c9dd90c0a0cb..d26a991ba2820 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -707,30 +707,45 @@ class ExcelWriter(metaclass=abc.ABCMeta): -------- Default usage: - >>> with ExcelWriter('path_to_file.xlsx') as writer: + >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) + >>> with ExcelWriter("path_to_file.xlsx") as writer: ... 
df.to_excel(writer) To write to separate sheets in a single file: - >>> with ExcelWriter('path_to_file.xlsx') as writer: - ... df1.to_excel(writer, sheet_name='Sheet1') - ... df2.to_excel(writer, sheet_name='Sheet2') + >>> df1 = pd.DataFrame([["AAA", "BBB"]], columns=["Spam", "Egg"]) + >>> df2 = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) + >>> with ExcelWriter("path_to_file.xlsx") as writer: + ... df1.to_excel(writer, sheet_name="Sheet1") + ... df2.to_excel(writer, sheet_name="Sheet2") You can set the date format or datetime format: - >>> with ExcelWriter('path_to_file.xlsx', - ... date_format='YYYY-MM-DD', - ... datetime_format='YYYY-MM-DD HH:MM:SS') as writer: + >>> from datetime import date, datetime + >>> df = pd.DataFrame( + ... [ + ... [date(2014, 1, 31), date(1999, 9, 24)], + ... [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)], + ... ], + ... index=["Date", "Datetime"], + ... columns=["X", "Y"], + ... ) + >>> with ExcelWriter( + ... "path_to_file.xlsx", + ... date_format="YYYY-MM-DD", + ... datetime_format="YYYY-MM-DD HH:MM:SS" + ... ) as writer: ... df.to_excel(writer) You can also append to an existing Excel file: - >>> with ExcelWriter('path_to_file.xlsx', mode='a') as writer: - ... df.to_excel(writer, sheet_name='Sheet3') + >>> with ExcelWriter("path_to_file.xlsx", mode="a", engine="openpyxl") as writer: + ... df.to_excel(writer, sheet_name="Sheet3") You can store Excel file in RAM: >>> import io + >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) >>> buffer = io.BytesIO() >>> with pd.ExcelWriter(buffer) as writer: ... df.to_excel(writer) @@ -738,8 +753,9 @@ class ExcelWriter(metaclass=abc.ABCMeta): You can pack Excel file into zip archive: >>> import zipfile - >>> with zipfile.ZipFile('path_to_file.zip', 'w') as zf: - ... with zf.open('filename.xlsx', 'w') as buffer: + >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) + >>> with zipfile.ZipFile("path_to_file.zip", "w") as zf: + ... with zf.open("filename.xlsx", "w") as buffer: ... with pd.ExcelWriter(buffer) as writer: ... 
df.to_excel(writer) """ diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 0253852fbb39a..4aaf1eecde5e8 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -486,13 +486,8 @@ def format( formatter = {col: formatter for col in columns} for col in columns: - try: - format_func = formatter[col] - except KeyError: - format_func = None - format_func = _maybe_wrap_formatter( - format_func, + formatter.get(col), na_rep=na_rep, precision=precision, decimal=decimal, diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index b7493ebeadf34..259850e9a7233 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -526,9 +526,13 @@ def read_json( Encoding/decoding a Dataframe using ``'split'`` formatted JSON: >>> df.to_json(orient='split') - '{{"columns":["col 1","col 2"], - "index":["row 1","row 2"], - "data":[["a","b"],["c","d"]]}}' + '\ +{{\ +"columns":["col 1","col 2"],\ +"index":["row 1","row 2"],\ +"data":[["a","b"],["c","d"]]\ +}}\ +' >>> pd.read_json(_, orient='split') col 1 col 2 row 1 a b @@ -538,6 +542,7 @@ def read_json( >>> df.to_json(orient='index') '{{"row 1":{{"col 1":"a","col 2":"b"}},"row 2":{{"col 1":"c","col 2":"d"}}}}' + >>> pd.read_json(_, orient='index') col 1 col 2 row 1 a b @@ -556,13 +561,18 @@ def read_json( Encoding with Table Schema >>> df.to_json(orient='table') - '{{"schema": {{"fields": [{{"name": "index", "type": "string"}}, - {{"name": "col 1", "type": "string"}}, - {{"name": "col 2", "type": "string"}}], - "primaryKey": "index", - "pandas_version": "0.20.0"}}, - "data": [{{"index": "row 1", "col 1": "a", "col 2": "b"}}, - {{"index": "row 2", "col 1": "c", "col 2": "d"}}]}}' + '\ +{{"schema":{{"fields":[\ +{{"name":"index","type":"string"}},\ +{{"name":"col 1","type":"string"}},\ +{{"name":"col 2","type":"string"}}],\ +"primaryKey":["index"],\ +"pandas_version":"0.20.0"}},\ +"data":[\ +{{"index":"row 1","col 1":"a","col 2":"b"}},\ +{{"index":"row 2","col 1":"c","col 2":"d"}}]\ +}}\ +' """ if orient == "table" and dtype: raise ValueError("cannot pass both dtype and orient='table'") diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 3d07b9d98f9a9..5927d6482d3b0 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -70,15 +70,17 @@ def nested_to_record( Examples -------- - IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2), - nested=dict(e=dict(c=1,d=2),d=2))) - Out[52]: - {'dict1.c': 1, - 'dict1.d': 2, - 'flat1': 1, - 'nested.d': 2, - 'nested.e.c': 1, - 'nested.e.d': 2} + >>> nested_to_record( + ... dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2)) + ... ) + {\ +'flat1': 1, \ +'dict1.c': 1, \ +'dict1.d': 2, \ +'nested.e.c': 1, \ +'nested.e.d': 2, \ +'nested.d': 2\ +} """ singleton = False if isinstance(ds, dict): @@ -208,18 +210,21 @@ def _simple_json_normalize( Examples -------- - IN[52]: _simple_json_normalize({ - 'flat1': 1, - 'dict1': {'c': 1, 'd': 2}, - 'nested': {'e': {'c': 1, 'd': 2}, 'd': 2} - }) - Out[52]: - {'dict1.c': 1, - 'dict1.d': 2, - 'flat1': 1, - 'nested.d': 2, - 'nested.e.c': 1, - 'nested.e.d': 2} + >>> _simple_json_normalize( + ... { + ... "flat1": 1, + ... "dict1": {"c": 1, "d": 2}, + ... "nested": {"e": {"c": 1, "d": 2}, "d": 2}, + ... } + ... 
) + {\ +'flat1': 1, \ +'dict1.c': 1, \ +'dict1.d': 2, \ +'nested.e.c': 1, \ +'nested.e.d': 2, \ +'nested.d': 2\ +} """ normalised_json_object = {} @@ -283,22 +288,30 @@ def _json_normalize( Examples -------- - >>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}}, - ... {'name': {'given': 'Mark', 'family': 'Regner'}}, - ... {'id': 2, 'name': 'Faye Raker'}] + >>> data = [ + ... {"id": 1, "name": {"first": "Coleen", "last": "Volk"}}, + ... {"name": {"given": "Mark", "family": "Regner"}}, + ... {"id": 2, "name": "Faye Raker"}, + ... ] >>> pd.json_normalize(data) id name.first name.last name.given name.family name 0 1.0 Coleen Volk NaN NaN NaN 1 NaN NaN NaN Mark Regner NaN 2 2.0 NaN NaN NaN NaN Faye Raker - >>> data = [{'id': 1, - ... 'name': "Cole Volk", - ... 'fitness': {'height': 130, 'weight': 60}}, - ... {'name': "Mark Reg", - ... 'fitness': {'height': 130, 'weight': 60}}, - ... {'id': 2, 'name': 'Faye Raker', - ... 'fitness': {'height': 130, 'weight': 60}}] + >>> data = [ + ... { + ... "id": 1, + ... "name": "Cole Volk", + ... "fitness": {"height": 130, "weight": 60}, + ... }, + ... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}}, + ... { + ... "id": 2, + ... "name": "Faye Raker", + ... "fitness": {"height": 130, "weight": 60}, + ... }, + ... ] >>> pd.json_normalize(data, max_level=0) id name fitness 0 1.0 Cole Volk {'height': 130, 'weight': 60} @@ -307,32 +320,49 @@ def _json_normalize( Normalizes nested data up to level 1. - >>> data = [{'id': 1, - ... 'name': "Cole Volk", - ... 'fitness': {'height': 130, 'weight': 60}}, - ... {'name': "Mark Reg", - ... 'fitness': {'height': 130, 'weight': 60}}, - ... {'id': 2, 'name': 'Faye Raker', - ... 'fitness': {'height': 130, 'weight': 60}}] + >>> data = [ + ... { + ... "id": 1, + ... "name": "Cole Volk", + ... "fitness": {"height": 130, "weight": 60}, + ... }, + ... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}}, + ... { + ... "id": 2, + ... "name": "Faye Raker", + ... "fitness": {"height": 130, "weight": 60}, + ... }, + ... ] >>> pd.json_normalize(data, max_level=1) id name fitness.height fitness.weight 0 1.0 Cole Volk 130 60 1 NaN Mark Reg 130 60 2 2.0 Faye Raker 130 60 - >>> data = [{'state': 'Florida', - ... 'shortname': 'FL', - ... 'info': {'governor': 'Rick Scott'}, - ... 'counties': [{'name': 'Dade', 'population': 12345}, - ... {'name': 'Broward', 'population': 40000}, - ... {'name': 'Palm Beach', 'population': 60000}]}, - ... {'state': 'Ohio', - ... 'shortname': 'OH', - ... 'info': {'governor': 'John Kasich'}, - ... 'counties': [{'name': 'Summit', 'population': 1234}, - ... {'name': 'Cuyahoga', 'population': 1337}]}] - >>> result = pd.json_normalize(data, 'counties', ['state', 'shortname', - ... ['info', 'governor']]) + >>> data = [ + ... { + ... "state": "Florida", + ... "shortname": "FL", + ... "info": {"governor": "Rick Scott"}, + ... "counties": [ + ... {"name": "Dade", "population": 12345}, + ... {"name": "Broward", "population": 40000}, + ... {"name": "Palm Beach", "population": 60000}, + ... ], + ... }, + ... { + ... "state": "Ohio", + ... "shortname": "OH", + ... "info": {"governor": "John Kasich"}, + ... "counties": [ + ... {"name": "Summit", "population": 1234}, + ... {"name": "Cuyahoga", "population": 1337}, + ... ], + ... }, + ... ] + >>> result = pd.json_normalize( + ... data, "counties", ["state", "shortname", ["info", "governor"]] + ... 
) >>> result name population state shortname info.governor 0 Dade 12345 Florida FL Rick Scott @@ -341,8 +371,8 @@ def _json_normalize( 3 Summit 1234 Ohio OH John Kasich 4 Cuyahoga 1337 Ohio OH John Kasich - >>> data = {'A': [1, 2]} - >>> pd.json_normalize(data, 'A', record_prefix='Prefix.') + >>> data = {"A": [1, 2]} + >>> pd.json_normalize(data, "A", record_prefix="Prefix.") Prefix.0 0 1 1 2 diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index ea47dca4f079e..87ea109c20f43 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -155,21 +155,25 @@ def convert_json_field_to_pandas_type(field): Examples -------- - >>> convert_json_field_to_pandas_type({'name': 'an_int', - 'type': 'integer'}) + >>> convert_json_field_to_pandas_type({"name": "an_int", "type": "integer"}) 'int64' - >>> convert_json_field_to_pandas_type({'name': 'a_categorical', - 'type': 'any', - 'constraints': {'enum': [ - 'a', 'b', 'c']}, - 'ordered': True}) - 'CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)' - >>> convert_json_field_to_pandas_type({'name': 'a_datetime', - 'type': 'datetime'}) + + >>> convert_json_field_to_pandas_type( + ... { + ... "name": "a_categorical", + ... "type": "any", + ... "constraints": {"enum": ["a", "b", "c"]}, + ... "ordered": True, + ... } + ... ) + CategoricalDtype(categories=['a', 'b', 'c'], ordered=True) + + >>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"}) 'datetime64[ns]' - >>> convert_json_field_to_pandas_type({'name': 'a_datetime_with_tz', - 'type': 'datetime', - 'tz': 'US/Central'}) + + >>> convert_json_field_to_pandas_type( + ... {"name": "a_datetime_with_tz", "type": "datetime", "tz": "US/Central"} + ... ) 'datetime64[ns, US/Central]' """ typ = field["type"] @@ -245,12 +249,13 @@ def build_table_schema( ... 'C': pd.date_range('2016-01-01', freq='d', periods=3), ... 
}, index=pd.Index(range(3), name='idx')) >>> build_table_schema(df) - {'fields': [{'name': 'idx', 'type': 'integer'}, - {'name': 'A', 'type': 'integer'}, - {'name': 'B', 'type': 'string'}, - {'name': 'C', 'type': 'datetime'}], - 'pandas_version': '0.20.0', - 'primaryKey': ['idx']} + {'fields': \ +[{'name': 'idx', 'type': 'integer'}, \ +{'name': 'A', 'type': 'integer'}, \ +{'name': 'B', 'type': 'string'}, \ +{'name': 'C', 'type': 'datetime'}], \ +'primaryKey': ['idx'], \ +'pandas_version': '0.20.0'} """ if index is True: data = set_default_names(data) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 300d4bdace52d..9350c5c6e37f6 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -6,6 +6,7 @@ import itertools from typing import ( Any, + Callable, DefaultDict, Hashable, Iterable, @@ -24,6 +25,7 @@ from pandas._typing import ( DtypeArg, FilePathOrBuffer, + final, ) from pandas.errors import ( ParserError, @@ -111,10 +113,10 @@ class ParserBase: + _implicit_index: bool = False + _first_chunk: bool index_col: int | Sequence[int] | None index_names: list[Hashable] | None - _first_chunk: bool - _implicit_index: bool = False def __init__(self, kwds): @@ -276,15 +278,17 @@ def close(self): if self.handles is not None: self.handles.close() + @final @property - def _has_complex_date_col(self): + def _has_complex_date_col(self) -> bool: return isinstance(self.parse_dates, dict) or ( isinstance(self.parse_dates, list) and len(self.parse_dates) > 0 and isinstance(self.parse_dates[0], list) ) - def _should_parse_dates(self, i): + @final + def _should_parse_dates(self, i: int) -> bool: if isinstance(self.parse_dates, bool): return self.parse_dates else: @@ -307,8 +311,9 @@ def _should_parse_dates(self, i): name is not None and name in self.parse_dates ) + @final def _extract_multi_indexer_columns( - self, header, index_names, col_names, passed_names=False + self, header, index_names, col_names, passed_names: bool = False ): """ extract and return the names, index_names, col_names @@ -370,6 +375,7 @@ def extract(r): return names, index_names, col_names, passed_names + @final def _maybe_dedup_names(self, names): # see gh-7160 and gh-9424: this helps to provide # immediate alleviation of the duplicate names @@ -403,12 +409,14 @@ def _maybe_dedup_names(self, names): return names + @final def _maybe_make_multi_index_columns(self, columns, col_names=None): # possibly create a column mi here if _is_potential_multi_index(columns): columns = MultiIndex.from_tuples(columns, names=col_names) return columns + @final def _make_index(self, data, alldata, columns, indexnamerow=False): if not is_index_col(self.index_col) or not self.index_col: index = None @@ -436,6 +444,7 @@ def _make_index(self, data, alldata, columns, indexnamerow=False): return index, columns + @final def _get_simple_index(self, data, columns): def ix(col): if not isinstance(col, str): @@ -462,6 +471,7 @@ def ix(col): return index + @final def _get_complex_date_index(self, data, col_names): def _get_name(icol): if isinstance(icol, str): @@ -493,7 +503,8 @@ def _get_name(icol): return index - def _agg_index(self, index, try_parse_dates=True) -> Index: + @final + def _agg_index(self, index, try_parse_dates: bool = True) -> Index: arrays = [] for i, arr in enumerate(index): @@ -524,8 +535,15 @@ def _agg_index(self, index, try_parse_dates=True) -> Index: return index + @final def _convert_to_ndarrays( - self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None + 
self, + dct: dict, + na_values, + na_fvalues, + verbose: bool = False, + converters=None, + dtypes=None, ): result = {} for c, values in dct.items(): @@ -602,6 +620,7 @@ def _convert_to_ndarrays( print(f"Filled {na_count} NA values in column {c!s}") return result + @final def _set_noconvert_dtype_columns( self, col_indices: list[int], names: list[int | str | tuple] ) -> set[int]: @@ -703,7 +722,7 @@ def _infer_types(self, values, na_values, try_num_bool=True): if try_num_bool and is_object_dtype(values.dtype): # exclude e.g DatetimeIndex here try: - result = lib.maybe_convert_numeric(values, na_values, False) + result, _ = lib.maybe_convert_numeric(values, na_values, False) except (ValueError, TypeError): # e.g. encountering datetime string gets ValueError # TypeError can be raised in floatify @@ -717,7 +736,7 @@ def _infer_types(self, values, na_values, try_num_bool=True): na_count = parsers.sanitize_objects(values, na_values, False) if result.dtype == np.object_ and try_num_bool: - result = libops.maybe_convert_bool( + result, _ = libops.maybe_convert_bool( np.asarray(values), true_values=self.true_values, false_values=self.false_values, @@ -1037,12 +1056,12 @@ def converter(*date_cols): def _process_date_conversion( data_dict, - converter, + converter: Callable, parse_spec, index_col, index_names, columns, - keep_date_col=False, + keep_date_col: bool = False, ): def _isindex(colspec): return (isinstance(index_col, list) and colspec in index_col) or ( @@ -1104,7 +1123,7 @@ def _isindex(colspec): return data_dict, new_cols -def _try_convert_dates(parser, colspec, data_dict, columns): +def _try_convert_dates(parser: Callable, colspec, data_dict, columns): colset = set(columns) colnames = [] @@ -1158,19 +1177,9 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na): return na_values, na_fvalues -# Seems to be unused -def _get_col_names(colspec, columns): - colset = set(columns) - colnames = [] - for c in colspec: - if c in colset: - colnames.append(c) - elif isinstance(c, int): - colnames.append(columns[c]) - return colnames - - -def _is_potential_multi_index(columns, index_col: bool | Sequence[int] | None = None): +def _is_potential_multi_index( + columns, index_col: bool | Sequence[int] | None = None +) -> bool: """ Check whether or not the `columns` parameter could be converted into a MultiIndex. 
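Illustrative sketch (not part of the patch): ``_is_potential_multi_index`` is an internal parser helper, but the boolean contract it now advertises is easy to see in isolation: a potential MultiIndex is signalled when every non-index column label is a tuple. The direct import below is for illustration only:

    from pandas.io.parsers.base_parser import _is_potential_multi_index

    _is_potential_multi_index([("a", "x"), ("a", "y")])  # True: all labels are tuples
    _is_potential_multi_index(["a", "b"])                # False: plain string labels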
@@ -1184,12 +1193,12 @@ def _is_potential_multi_index(columns, index_col: bool | Sequence[int] | None = Returns ------- - boolean : Whether or not columns could become a MultiIndex + bool : Whether or not columns could become a MultiIndex """ if index_col is None or isinstance(index_col, bool): index_col = [] - return ( + return bool( len(columns) and not isinstance(columns, MultiIndex) and all(isinstance(c, tuple) for c in columns if c not in list(index_col)) @@ -1218,5 +1227,5 @@ def _validate_parse_dates_arg(parse_dates): return parse_dates -def is_index_col(col): +def is_index_col(col) -> bool: return col is not None and col is not False diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index a2d0a5a2b37f9..238012557b834 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -28,6 +28,7 @@ class CParserWrapper(ParserBase): low_memory: bool + _reader: parsers.TextReader def __init__(self, src: FilePathOrBuffer, **kwds): self.kwds = kwds @@ -55,6 +56,7 @@ def __init__(self, src: FilePathOrBuffer, **kwds): except Exception: self.handles.close() raise + self.unnamed_cols = self._reader.unnamed_cols # error: Cannot determine type of 'names' @@ -314,7 +316,7 @@ def _get_index_names(self): return names, idx_names - def _maybe_parse_dates(self, values, index: int, try_parse_dates=True): + def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True): if try_parse_dates and self._should_parse_dates(index): values = self._date_conv(values) return values diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 281dc33ae7bae..d2a626568ac82 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1189,7 +1189,7 @@ def count_empty_vals(vals) -> int: return sum(1 for v in vals if v == "" or v is None) -def _validate_skipfooter_arg(skipfooter): +def _validate_skipfooter_arg(skipfooter: int) -> int: """ Validate the 'skipfooter' parameter. diff --git a/pandas/io/sql.py b/pandas/io/sql.py index d797fa51984d6..04a7ccb538a67 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -27,6 +27,8 @@ import pandas._libs.lib as lib from pandas._typing import DtypeArg +from pandas.compat._optional import import_optional_dependency +from pandas.errors import AbstractMethodError from pandas.core.dtypes.common import ( is_datetime64tz_dtype, @@ -36,6 +38,7 @@ from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna +from pandas import get_option from pandas.core.api import ( DataFrame, Series, @@ -643,6 +646,8 @@ def to_sql( chunksize: int | None = None, dtype: DtypeArg | None = None, method: str | None = None, + engine: str = "auto", + **engine_kwargs, ) -> None: """ Write records stored in a DataFrame to a SQL database. @@ -689,6 +694,16 @@ def to_sql( section :ref:`insert method `. .. versionadded:: 0.24.0 + + engine : {'auto', 'sqlalchemy'}, default 'auto' + SQL engine library to use. If 'auto', then the option + ``io.sql.engine`` is used. The default ``io.sql.engine`` + behavior is 'sqlalchemy' + + .. versionadded:: 1.3.0 + + **engine_kwargs + Any additional kwargs are passed to the engine. 
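Illustrative sketch (not part of the patch): how the new ``engine`` keyword on the module-level ``to_sql`` helper might be exercised. SQLAlchemy being installed, the in-memory SQLite URL, and the table name are assumptions of the example:

    import pandas as pd
    from sqlalchemy import create_engine

    from pandas.io import sql as pd_sql

    con = create_engine("sqlite://")  # in-memory SQLite database
    df = pd.DataFrame({"a": [1, 2, 3]})

    # "auto" consults the io.sql.engine option; "sqlalchemy" forces the
    # SQLAlchemyEngine introduced further down in this patch.
    pd_sql.to_sql(df, "demo", con, if_exists="replace", engine="sqlalchemy")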
""" if if_exists not in ("fail", "replace", "append"): raise ValueError(f"'{if_exists}' is not valid for if_exists") @@ -712,6 +727,8 @@ def to_sql( chunksize=chunksize, dtype=dtype, method=method, + engine=engine, + **engine_kwargs, ) @@ -1283,6 +1300,91 @@ def to_sql( ) +class BaseEngine: + def insert_records( + self, + table: SQLTable, + con, + frame, + name, + index=True, + schema=None, + chunksize=None, + method=None, + **engine_kwargs, + ): + """ + Inserts data into already-prepared table + """ + raise AbstractMethodError(self) + + +class SQLAlchemyEngine(BaseEngine): + def __init__(self): + import_optional_dependency( + "sqlalchemy", extra="sqlalchemy is required for SQL support." + ) + + def insert_records( + self, + table: SQLTable, + con, + frame, + name, + index=True, + schema=None, + chunksize=None, + method=None, + **engine_kwargs, + ): + from sqlalchemy import exc + + try: + table.insert(chunksize=chunksize, method=method) + except exc.SQLAlchemyError as err: + # GH34431 + # https://stackoverflow.com/a/67358288/6067848 + msg = r"""(\(1054, "Unknown column 'inf(e0)?' in 'field list'"\))(?# + )|inf can not be used with MySQL""" + err_text = str(err.orig) + if re.search(msg, err_text): + raise ValueError("inf cannot be used with MySQL") from err + else: + raise err + + +def get_engine(engine: str) -> BaseEngine: + """ return our implementation """ + if engine == "auto": + engine = get_option("io.sql.engine") + + if engine == "auto": + # try engines in this order + engine_classes = [SQLAlchemyEngine] + + error_msgs = "" + for engine_class in engine_classes: + try: + return engine_class() + except ImportError as err: + error_msgs += "\n - " + str(err) + + raise ImportError( + "Unable to find a usable engine; " + "tried using: 'sqlalchemy'.\n" + "A suitable version of " + "sqlalchemy is required for sql I/O " + "support.\n" + "Trying to import the above resulted in these errors:" + f"{error_msgs}" + ) + + elif engine == "sqlalchemy": + return SQLAlchemyEngine() + + raise ValueError("engine must be one of 'auto', 'sqlalchemy'") + + class SQLDatabase(PandasSQL): """ This class enables conversion between DataFrame and SQL databases @@ -1504,7 +1606,7 @@ def read_query( read_sql = read_query - def to_sql( + def prep_table( self, frame, name, @@ -1512,50 +1614,10 @@ def to_sql( index=True, index_label=None, schema=None, - chunksize=None, dtype: DtypeArg | None = None, - method=None, - ): + ) -> SQLTable: """ - Write records stored in a DataFrame to a SQL database. - - Parameters - ---------- - frame : DataFrame - name : string - Name of SQL table. - if_exists : {'fail', 'replace', 'append'}, default 'fail' - - fail: If table exists, do nothing. - - replace: If table exists, drop it, recreate it, and insert data. - - append: If table exists, insert data. Create if does not exist. - index : bool, default True - Write DataFrame index as a column. - index_label : string or sequence, default None - Column label for index column(s). If None is given (default) and - `index` is True, then the index names are used. - A sequence should be given if the DataFrame uses MultiIndex. - schema : string, default None - Name of SQL schema in database to write to (if database flavor - supports this). If specified, this overwrites the default - schema of the SQLDatabase object. - chunksize : int, default None - If not None, then rows will be written in batches of this size at a - time. If None, all rows will be written at once. 
- dtype : single type or dict of column name to SQL type, default None - Optional specifying the datatype for columns. The SQL type should - be a SQLAlchemy type. If all columns are of the same type, one - single value can be used. - method : {None', 'multi', callable}, default None - Controls the SQL insertion clause used: - - * None : Uses standard SQL ``INSERT`` clause (one per row). - * 'multi': Pass multiple values in a single ``INSERT`` clause. - * callable with signature ``(pd_table, conn, keys, data_iter)``. - - Details and a sample callable implementation can be found in the - section :ref:`insert method `. - - .. versionadded:: 0.24.0 + Prepares table in the database for data insertion. Creates it if needed, etc. """ if dtype: if not is_dict_like(dtype): @@ -1589,15 +1651,17 @@ def to_sql( dtype=dtype, ) table.create() + return table - from sqlalchemy.exc import SQLAlchemyError - - try: - table.insert(chunksize, method=method) - except SQLAlchemyError as err: - # GH 34431 36465 - raise ValueError("inf cannot be used with MySQL") from err - + def check_case_sensitive( + self, + name, + schema, + ): + """ + Checks table name for issues with case-sensitivity. + Method is called after data is inserted. + """ if not name.isdigit() and not name.islower(): # check for potentially case sensitivity issues (GH7815) # Only check when name is not a number and name is not lower case @@ -1623,6 +1687,97 @@ def to_sql( ) warnings.warn(msg, UserWarning) + def to_sql( + self, + frame, + name, + if_exists="fail", + index=True, + index_label=None, + schema=None, + chunksize=None, + dtype: DtypeArg | None = None, + method=None, + engine="auto", + **engine_kwargs, + ): + """ + Write records stored in a DataFrame to a SQL database. + + Parameters + ---------- + frame : DataFrame + name : string + Name of SQL table. + if_exists : {'fail', 'replace', 'append'}, default 'fail' + - fail: If table exists, do nothing. + - replace: If table exists, drop it, recreate it, and insert data. + - append: If table exists, insert data. Create if does not exist. + index : boolean, default True + Write DataFrame index as a column. + index_label : string or sequence, default None + Column label for index column(s). If None is given (default) and + `index` is True, then the index names are used. + A sequence should be given if the DataFrame uses MultiIndex. + schema : string, default None + Name of SQL schema in database to write to (if database flavor + supports this). If specified, this overwrites the default + schema of the SQLDatabase object. + chunksize : int, default None + If not None, then rows will be written in batches of this size at a + time. If None, all rows will be written at once. + dtype : single type or dict of column name to SQL type, default None + Optional specifying the datatype for columns. The SQL type should + be a SQLAlchemy type. If all columns are of the same type, one + single value can be used. + method : {None', 'multi', callable}, default None + Controls the SQL insertion clause used: + + * None : Uses standard SQL ``INSERT`` clause (one per row). + * 'multi': Pass multiple values in a single ``INSERT`` clause. + * callable with signature ``(pd_table, conn, keys, data_iter)``. + + Details and a sample callable implementation can be found in the + section :ref:`insert method `. + + .. versionadded:: 0.24.0 + + engine : {'auto', 'sqlalchemy'}, default 'auto' + SQL engine library to use. If 'auto', then the option + ``io.sql.engine`` is used. 
The default ``io.sql.engine`` + behavior is 'sqlalchemy' + + .. versionadded:: 1.3.0 + + **engine_kwargs + Any additional kwargs are passed to the engine. + """ + sql_engine = get_engine(engine) + + table = self.prep_table( + frame=frame, + name=name, + if_exists=if_exists, + index=index, + index_label=index_label, + schema=schema, + dtype=dtype, + ) + + sql_engine.insert_records( + table=table, + con=self.connectable, + frame=frame, + name=name, + index=index, + schema=schema, + chunksize=chunksize, + method=method, + **engine_kwargs, + ) + + self.check_case_sensitive(name=name, schema=schema) + @property def tables(self): return self.meta.tables @@ -2008,6 +2163,7 @@ def to_sql( chunksize=None, dtype: DtypeArg | None = None, method=None, + **kwargs, ): """ Write records stored in a DataFrame to a SQL database. diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 56971af9bcd3f..c0287df1694e9 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -314,10 +314,14 @@ def test_validate_inplace_raises(self, value): cat.as_unordered(inplace=value) with pytest.raises(ValueError, match=msg): - cat.set_categories(["X", "Y", "Z"], rename=True, inplace=value) + with tm.assert_produces_warning(FutureWarning): + # issue #37643 inplace kwarg deprecated + cat.set_categories(["X", "Y", "Z"], rename=True, inplace=value) with pytest.raises(ValueError, match=msg): - cat.rename_categories(["X", "Y", "Z"], inplace=value) + with tm.assert_produces_warning(FutureWarning): + # issue #37643 inplace kwarg deprecated + cat.rename_categories(["X", "Y", "Z"], inplace=value) with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning): diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 2eeb502d36367..a063491cd08fa 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -82,7 +82,10 @@ def test_rename_categories(self): tm.assert_categorical_equal(result, expected) # and now inplace - res = cat.rename_categories([1, 2, 3], inplace=True) + with tm.assert_produces_warning(FutureWarning): + # issue #37643 inplace kwarg deprecated + res = cat.rename_categories([1, 2, 3], inplace=True) + assert res is None tm.assert_numpy_array_equal( cat.__array__(), np.array([1, 2, 3, 1], dtype=np.int64) @@ -114,7 +117,10 @@ def test_rename_categories_dict(self): tm.assert_index_equal(res.categories, expected) # Test for inplace - res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1}, inplace=True) + with tm.assert_produces_warning(FutureWarning): + # issue #37643 inplace kwarg deprecated + res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1}, inplace=True) + assert res is None tm.assert_index_equal(cat.categories, expected) @@ -223,7 +229,10 @@ def test_set_categories(self): exp_categories = Index(["c", "b", "a"]) exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_) - res = cat.set_categories(["c", "b", "a"], inplace=True) + with tm.assert_produces_warning(FutureWarning): + # issue #37643 inplace kwarg deprecated + res = cat.set_categories(["c", "b", "a"], inplace=True) + tm.assert_index_equal(cat.categories, exp_categories) tm.assert_numpy_array_equal(cat.__array__(), exp_values) assert res is None @@ -433,7 +442,11 @@ def test_describe(self): # check unused categories cat = self.factor.copy() - cat.set_categories(["a", "b", "c", "d"], 
inplace=True) + + with tm.assert_produces_warning(FutureWarning): + # issue #37643 inplace kwarg deprecated + cat.set_categories(["a", "b", "c", "d"], inplace=True) + desc = cat.describe() exp_index = CategoricalIndex( @@ -469,7 +482,11 @@ def test_describe(self): def test_set_categories_inplace(self): cat = self.factor.copy() - cat.set_categories(["a", "b", "c", "d"], inplace=True) + + with tm.assert_produces_warning(FutureWarning): + # issue #37643 inplace kwarg deprecated + cat.set_categories(["a", "b", "c", "d"], inplace=True) + tm.assert_index_equal(cat.categories, Index(["a", "b", "c", "d"])) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index e2d8e522abb35..43ba5667d4d93 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -566,3 +566,23 @@ def test_to_numpy_na_value(dtype, nulls_fixture): result = arr.to_numpy(na_value=na_value) expected = np.array(["a", na_value, "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) + + +def test_isin(dtype, request): + s = pd.Series(["a", "b", None], dtype=dtype) + + result = s.isin(["a", "c"]) + expected = pd.Series([True, False, False]) + tm.assert_series_equal(result, expected) + + result = s.isin(["a", pd.NA]) + expected = pd.Series([True, False, True]) + tm.assert_series_equal(result, expected) + + result = s.isin([]) + expected = pd.Series([False, False, False]) + tm.assert_series_equal(result, expected) + + result = s.isin(["a", pd.Timestamp.now()]) + expected = pd.Series([True, False, False]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 62d368264752b..771d60b000a7d 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -1150,7 +1150,7 @@ def test_array_interface(self, arr1d): tm.assert_numpy_array_equal(result, arr.asi8) # to other dtypes - msg = r"float\(\) argument must be a string or a number, not 'Period'" + msg = r"float\(\) argument must be a string or a( real)? 
number, not 'Period'" with pytest.raises(TypeError, match=msg): np.asarray(arr, dtype="float64") diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index d1e6409307915..076cc155f3626 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -24,6 +24,7 @@ from pandas._libs import ( lib, missing as libmissing, + ops as libops, ) import pandas.util._test_decorators as td @@ -61,7 +62,11 @@ Timestamp, ) import pandas._testing as tm -from pandas.core.arrays import IntegerArray +from pandas.core.arrays import ( + BooleanArray, + FloatingArray, + IntegerArray, +) @pytest.fixture(params=[True, False], ids=str) @@ -416,73 +421,116 @@ def test_isneginf_scalar(self, value, expected): result = libmissing.isneginf_scalar(value) assert result is expected + @pytest.mark.parametrize( + "convert_to_masked_nullable, exp", + [ + ( + True, + BooleanArray( + np.array([True, False], dtype="bool"), np.array([False, True]) + ), + ), + (False, np.array([True, np.nan], dtype="object")), + ], + ) + def test_maybe_convert_nullable_boolean(self, convert_to_masked_nullable, exp): + # GH 40687 + arr = np.array([True, np.NaN], dtype=object) + result = libops.maybe_convert_bool( + arr, set(), convert_to_masked_nullable=convert_to_masked_nullable + ) + if convert_to_masked_nullable: + tm.assert_extension_array_equal(BooleanArray(*result), exp) + else: + result = result[0] + tm.assert_numpy_array_equal(result, exp) + + @pytest.mark.parametrize("convert_to_masked_nullable", [True, False]) @pytest.mark.parametrize("coerce_numeric", [True, False]) @pytest.mark.parametrize( "infinity", ["inf", "inF", "iNf", "Inf", "iNF", "InF", "INf", "INF"] ) @pytest.mark.parametrize("prefix", ["", "-", "+"]) - def test_maybe_convert_numeric_infinities(self, coerce_numeric, infinity, prefix): + def test_maybe_convert_numeric_infinities( + self, coerce_numeric, infinity, prefix, convert_to_masked_nullable + ): # see gh-13274 - result = lib.maybe_convert_numeric( + result, _ = lib.maybe_convert_numeric( np.array([prefix + infinity], dtype=object), na_values={"", "NULL", "nan"}, coerce_numeric=coerce_numeric, + convert_to_masked_nullable=convert_to_masked_nullable, ) expected = np.array([np.inf if prefix in ["", "+"] else -np.inf]) tm.assert_numpy_array_equal(result, expected) - def test_maybe_convert_numeric_infinities_raises(self): + @pytest.mark.parametrize("convert_to_masked_nullable", [True, False]) + def test_maybe_convert_numeric_infinities_raises(self, convert_to_masked_nullable): msg = "Unable to parse string" with pytest.raises(ValueError, match=msg): lib.maybe_convert_numeric( np.array(["foo_inf"], dtype=object), na_values={"", "NULL", "nan"}, coerce_numeric=False, + convert_to_masked_nullable=convert_to_masked_nullable, ) - def test_maybe_convert_numeric_post_floatify_nan(self, coerce): + @pytest.mark.parametrize("convert_to_masked_nullable", [True, False]) + def test_maybe_convert_numeric_post_floatify_nan( + self, coerce, convert_to_masked_nullable + ): # see gh-13314 data = np.array(["1.200", "-999.000", "4.500"], dtype=object) expected = np.array([1.2, np.nan, 4.5], dtype=np.float64) nan_values = {-999, -999.0} - out = lib.maybe_convert_numeric(data, nan_values, coerce) - tm.assert_numpy_array_equal(out, expected) + out = lib.maybe_convert_numeric( + data, + nan_values, + coerce, + convert_to_masked_nullable=convert_to_masked_nullable, + ) + if convert_to_masked_nullable: + expected = FloatingArray(expected, np.isnan(expected)) + 
tm.assert_extension_array_equal(expected, FloatingArray(*out)) + else: + out = out[0] + tm.assert_numpy_array_equal(out, expected) def test_convert_infs(self): arr = np.array(["inf", "inf", "inf"], dtype="O") - result = lib.maybe_convert_numeric(arr, set(), False) + result, _ = lib.maybe_convert_numeric(arr, set(), False) assert result.dtype == np.float64 arr = np.array(["-inf", "-inf", "-inf"], dtype="O") - result = lib.maybe_convert_numeric(arr, set(), False) + result, _ = lib.maybe_convert_numeric(arr, set(), False) assert result.dtype == np.float64 def test_scientific_no_exponent(self): # See PR 12215 arr = np.array(["42E", "2E", "99e", "6e"], dtype="O") - result = lib.maybe_convert_numeric(arr, set(), False, True) + result, _ = lib.maybe_convert_numeric(arr, set(), False, True) assert np.all(np.isnan(result)) def test_convert_non_hashable(self): # GH13324 # make sure that we are handing non-hashables arr = np.array([[10.0, 2], 1.0, "apple"], dtype=object) - result = lib.maybe_convert_numeric(arr, set(), False, True) + result, _ = lib.maybe_convert_numeric(arr, set(), False, True) tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan])) def test_convert_numeric_uint64(self): arr = np.array([2 ** 63], dtype=object) exp = np.array([2 ** 63], dtype=np.uint64) - tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp) + tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set())[0], exp) arr = np.array([str(2 ** 63)], dtype=object) exp = np.array([2 ** 63], dtype=np.uint64) - tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp) + tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set())[0], exp) arr = np.array([np.uint64(2 ** 63)], dtype=object) exp = np.array([2 ** 63], dtype=np.uint64) - tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp) + tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set())[0], exp) @pytest.mark.parametrize( "arr", @@ -495,17 +543,33 @@ def test_convert_numeric_uint64(self): ) def test_convert_numeric_uint64_nan(self, coerce, arr): expected = arr.astype(float) if coerce else arr.copy() - result = lib.maybe_convert_numeric(arr, set(), coerce_numeric=coerce) + result, _ = lib.maybe_convert_numeric(arr, set(), coerce_numeric=coerce) tm.assert_almost_equal(result, expected) - def test_convert_numeric_uint64_nan_values(self, coerce): + @pytest.mark.parametrize("convert_to_masked_nullable", [True, False]) + def test_convert_numeric_uint64_nan_values( + self, coerce, convert_to_masked_nullable + ): arr = np.array([2 ** 63, 2 ** 63 + 1], dtype=object) na_values = {2 ** 63} expected = ( np.array([np.nan, 2 ** 63 + 1], dtype=float) if coerce else arr.copy() ) - result = lib.maybe_convert_numeric(arr, na_values, coerce_numeric=coerce) + result = lib.maybe_convert_numeric( + arr, + na_values, + coerce_numeric=coerce, + convert_to_masked_nullable=convert_to_masked_nullable, + ) + if convert_to_masked_nullable and coerce: + expected = IntegerArray( + np.array([0, 2 ** 63 + 1], dtype="u8"), + np.array([True, False], dtype="bool"), + ) + result = IntegerArray(*result) + else: + result = result[0] # discard mask tm.assert_almost_equal(result, expected) @pytest.mark.parametrize( @@ -519,16 +583,33 @@ def test_convert_numeric_uint64_nan_values(self, coerce): np.array([str(-1), str(2 ** 63)], dtype=object), ], ) - def test_convert_numeric_int64_uint64(self, case, coerce): + @pytest.mark.parametrize("convert_to_masked_nullable", [True, False]) + def test_convert_numeric_int64_uint64( + self, case, 
coerce, convert_to_masked_nullable + ): expected = case.astype(float) if coerce else case.copy() - result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce) + result, _ = lib.maybe_convert_numeric( + case, + set(), + coerce_numeric=coerce, + convert_to_masked_nullable=convert_to_masked_nullable, + ) + tm.assert_almost_equal(result, expected) - def test_convert_numeric_string_uint64(self): + @pytest.mark.parametrize("convert_to_masked_nullable", [True, False]) + def test_convert_numeric_string_uint64(self, convert_to_masked_nullable): # GH32394 result = lib.maybe_convert_numeric( - np.array(["uint64"], dtype=object), set(), coerce_numeric=True + np.array(["uint64"], dtype=object), + set(), + coerce_numeric=True, + convert_to_masked_nullable=convert_to_masked_nullable, ) + if convert_to_masked_nullable: + result = FloatingArray(*result) + else: + result = result[0] assert np.isnan(result) @pytest.mark.parametrize("value", [-(2 ** 63) - 1, 2 ** 64]) @@ -608,6 +689,54 @@ def test_maybe_convert_objects_nullable_integer(self, exp): tm.assert_extension_array_equal(result, exp) + @pytest.mark.parametrize( + "convert_to_masked_nullable, exp", + [ + (True, IntegerArray(np.array([2, 0], dtype="i8"), np.array([False, True]))), + (False, np.array([2, np.nan], dtype="float64")), + ], + ) + def test_maybe_convert_numeric_nullable_integer( + self, convert_to_masked_nullable, exp + ): + # GH 40687 + arr = np.array([2, np.NaN], dtype=object) + result = lib.maybe_convert_numeric( + arr, set(), convert_to_masked_nullable=convert_to_masked_nullable + ) + if convert_to_masked_nullable: + result = IntegerArray(*result) + tm.assert_extension_array_equal(result, exp) + else: + result = result[0] + tm.assert_numpy_array_equal(result, exp) + + @pytest.mark.parametrize( + "convert_to_masked_nullable, exp", + [ + ( + True, + FloatingArray( + np.array([2.0, 0.0], dtype="float64"), np.array([False, True]) + ), + ), + (False, np.array([2.0, np.nan], dtype="float64")), + ], + ) + def test_maybe_convert_numeric_floating_array( + self, convert_to_masked_nullable, exp + ): + # GH 40687 + arr = np.array([2.0, np.nan], dtype=object) + result = lib.maybe_convert_numeric( + arr, set(), convert_to_masked_nullable=convert_to_masked_nullable + ) + if convert_to_masked_nullable: + tm.assert_extension_array_equal(FloatingArray(*result), exp) + else: + result = result[0] + tm.assert_numpy_array_equal(result, exp) + def test_maybe_convert_objects_bool_nan(self): # GH32146 ind = Index([True, False, np.nan], dtype=object) diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 7c5ef5b3b27d3..99a5666926e10 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas.core.internals import ObjectBlock from pandas.tests.extension.base.base import BaseExtensionTests @@ -43,10 +45,21 @@ def test_astype_str(self, data): expected = pd.Series([str(x) for x in data[:5]], dtype=str) self.assert_series_equal(result, expected) - def test_astype_string(self, data): + @pytest.mark.parametrize( + "nullable_string_dtype", + [ + "string", + pytest.param( + "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + ), + ], + ) + def test_astype_string(self, data, nullable_string_dtype): # GH-33465 - result = pd.Series(data[:5]).astype("string") - expected = pd.Series([str(x) for x in data[:5]], dtype="string") + from 
pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + + result = pd.Series(data[:5]).astype(nullable_string_dtype) + expected = pd.Series([str(x) for x in data[:5]], dtype=nullable_string_dtype) self.assert_series_equal(result, expected) def test_to_numpy(self, data): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index ffe2769730f34..2eef828288e59 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -40,7 +40,6 @@ ExtensionDtype, ) from pandas.api.types import is_bool_dtype -from pandas.core.arrays.string_arrow import ArrowStringDtype class JSONDtype(ExtensionDtype): @@ -196,7 +195,7 @@ def astype(self, dtype, copy=True): if copy: return self.copy() return self - elif isinstance(dtype, (StringDtype, ArrowStringDtype)): + elif isinstance(dtype, StringDtype): value = self.astype(str) # numpy doesn'y like nested dicts return dtype.construct_array_type()._from_sequence(value, copy=False) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 3fa8295084718..b84ff38b43ae7 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -18,6 +18,7 @@ PeriodDtype, ) +import pandas as pd from pandas import ( Categorical, DataFrame, @@ -791,6 +792,41 @@ def test_setitem_slice_position(self): expected = DataFrame(arr) tm.assert_frame_equal(df, expected) + @pytest.mark.parametrize("indexer", [tm.setitem, tm.iloc]) + @pytest.mark.parametrize("box", [Series, np.array, list, pd.array]) + @pytest.mark.parametrize("n", [1, 2, 3]) + def test_setitem_slice_indexer_broadcasting_rhs(self, n, box, indexer): + # GH#40440 + df = DataFrame([[1, 3, 5]] + [[2, 4, 6]] * n, columns=["a", "b", "c"]) + indexer(df)[1:] = box([10, 11, 12]) + expected = DataFrame([[1, 3, 5]] + [[10, 11, 12]] * n, columns=["a", "b", "c"]) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize("box", [Series, np.array, list, pd.array]) + @pytest.mark.parametrize("n", [1, 2, 3]) + def test_setitem_list_indexer_broadcasting_rhs(self, n, box): + # GH#40440 + df = DataFrame([[1, 3, 5]] + [[2, 4, 6]] * n, columns=["a", "b", "c"]) + df.iloc[list(range(1, n + 1))] = box([10, 11, 12]) + expected = DataFrame([[1, 3, 5]] + [[10, 11, 12]] * n, columns=["a", "b", "c"]) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize("indexer", [tm.setitem, tm.iloc]) + @pytest.mark.parametrize("box", [Series, np.array, list, pd.array]) + @pytest.mark.parametrize("n", [1, 2, 3]) + def test_setitem_slice_broadcasting_rhs_mixed_dtypes(self, n, box, indexer): + # GH#40440 + df = DataFrame( + [[1, 3, 5], ["x", "y", "z"]] + [[2, 4, 6]] * n, columns=["a", "b", "c"] + ) + indexer(df)[1:] = box([10, 11, 12]) + expected = DataFrame( + [[1, 3, 5]] + [[10, 11, 12]] * (n + 1), + columns=["a", "b", "c"], + dtype="object", + ) + tm.assert_frame_equal(df, expected) + class TestDataFrameSetItemCallable: def test_setitem_callable(self): @@ -859,9 +895,11 @@ def test_setitem_mask_categorical(self): df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf) exp_fancy = exp_multi_row.copy() - return_value = exp_fancy["cats"].cat.set_categories( - ["a", "b", "c"], inplace=True - ) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # issue #37643 inplace kwarg deprecated + return_value = exp_fancy["cats"].cat.set_categories( + ["a", "b", "c"], inplace=True + ) assert return_value is None mask = df["cats"] == "c" diff --git 
a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index 823ce7435f229..6e5cb3add43df 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -52,33 +52,46 @@ def test_nonzero_single_element(self): s = Series([False]) assert not s.bool() - msg = "The truth value of a Series is ambiguous" + @pytest.mark.parametrize("data", [np.nan, pd.NaT, True, False]) + def test_nonzero_single_element_raise_1(self, data): # single item nan to raise - for s in [Series([np.nan]), Series([pd.NaT]), Series([True]), Series([False])]: - with pytest.raises(ValueError, match=msg): - bool(s) + series = Series([data]) + + msg = "The truth value of a Series is ambiguous" + with pytest.raises(ValueError, match=msg): + bool(series) + + @pytest.mark.parametrize("data", [np.nan, pd.NaT]) + def test_nonzero_single_element_raise_2(self, data): + series = Series([data]) msg = "bool cannot act on a non-boolean single element Series" - for s in [Series([np.nan]), Series([pd.NaT])]: - with pytest.raises(ValueError, match=msg): - s.bool() + with pytest.raises(ValueError, match=msg): + series.bool() + @pytest.mark.parametrize("data", [(True, True), (False, False)]) + def test_nonzero_multiple_element_raise(self, data): # multiple bool are still an error + series = Series([data]) + msg = "The truth value of a Series is ambiguous" - for s in [Series([True, True]), Series([False, False])]: - with pytest.raises(ValueError, match=msg): - bool(s) - with pytest.raises(ValueError, match=msg): - s.bool() + with pytest.raises(ValueError, match=msg): + bool(series) + with pytest.raises(ValueError, match=msg): + series.bool() + @pytest.mark.parametrize("data", [1, 0, "a", 0.0]) + def test_nonbool_single_element_raise(self, data): # single non-bool are an error - for s in [Series([1]), Series([0]), Series(["a"]), Series([0.0])]: - msg = "The truth value of a Series is ambiguous" - with pytest.raises(ValueError, match=msg): - bool(s) - msg = "bool cannot act on a non-boolean single element Series" - with pytest.raises(ValueError, match=msg): - s.bool() + series = Series([data]) + + msg = "The truth value of a Series is ambiguous" + with pytest.raises(ValueError, match=msg): + bool(series) + + msg = "bool cannot act on a non-boolean single element Series" + with pytest.raises(ValueError, match=msg): + series.bool() def test_metadata_propagation_indiv_resample(self): # resample diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 28344897a686f..b601ba92886d9 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -234,11 +234,10 @@ def test_aggregate_item_by_item(df): K = len(result.columns) # GH5782 - # odd comparisons can result here, so cast to make easy - exp = Series(np.array([foo] * K), index=list("BCD"), dtype=np.float64, name="foo") + exp = Series(np.array([foo] * K), index=list("BCD"), name="foo") tm.assert_series_equal(result.xs("foo"), exp) - exp = Series(np.array([bar] * K), index=list("BCD"), dtype=np.float64, name="bar") + exp = Series(np.array([bar] * K), index=list("BCD"), name="bar") tm.assert_almost_equal(result.xs("bar"), exp) def aggfun(ser): @@ -442,6 +441,57 @@ def test_bool_agg_dtype(op): assert is_integer_dtype(result) +@pytest.mark.parametrize( + "keys, agg_index", + [ + (["a"], Index([1], name="a")), + (["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])), + ], +) +@pytest.mark.parametrize( + "input_dtype", ["bool", 
"int32", "int64", "float32", "float64"] +) +@pytest.mark.parametrize( + "result_dtype", ["bool", "int32", "int64", "float32", "float64"] +) +@pytest.mark.parametrize("method", ["apply", "aggregate", "transform"]) +def test_callable_result_dtype_frame( + keys, agg_index, input_dtype, result_dtype, method +): + # GH 21240 + df = DataFrame({"a": [1], "b": [2], "c": [True]}) + df["c"] = df["c"].astype(input_dtype) + op = getattr(df.groupby(keys)[["c"]], method) + result = op(lambda x: x.astype(result_dtype).iloc[0]) + expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index + expected = DataFrame({"c": [df["c"].iloc[0]]}, index=expected_index).astype( + result_dtype + ) + if method == "apply": + expected.columns.names = [0] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "keys, agg_index", + [ + (["a"], Index([1], name="a")), + (["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])), + ], +) +@pytest.mark.parametrize("input", [True, 1, 1.0]) +@pytest.mark.parametrize("dtype", [bool, int, float]) +@pytest.mark.parametrize("method", ["apply", "aggregate", "transform"]) +def test_callable_result_dtype_series(keys, agg_index, input, dtype, method): + # GH 21240 + df = DataFrame({"a": [1], "b": [2], "c": [input]}) + op = getattr(df.groupby(keys)["c"], method) + result = op(lambda x: x.astype(dtype).iloc[0]) + expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index + expected = Series([df["c"].iloc[0]], index=expected_index, name="c").astype(dtype) + tm.assert_series_equal(result, expected) + + def test_order_aggregate_multiple_funcs(): # GH 25692 df = DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]}) @@ -849,7 +899,11 @@ def test_multiindex_custom_func(func): data = [[1, 4, 2], [5, 7, 1]] df = DataFrame(data, columns=MultiIndex.from_arrays([[1, 1, 2], [3, 4, 3]])) result = df.groupby(np.array([0, 1])).agg(func) - expected_dict = {(1, 3): {0: 1, 1: 5}, (1, 4): {0: 4, 1: 7}, (2, 3): {0: 2, 1: 1}} + expected_dict = { + (1, 3): {0: 1.0, 1: 5.0}, + (1, 4): {0: 4.0, 1: 7.0}, + (2, 3): {0: 2.0, 1: 1.0}, + } expected = DataFrame(expected_dict) tm.assert_frame_equal(result, expected) @@ -1105,6 +1159,11 @@ def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data): expected_df = DataFrame(data=exp_data, index=cat_index) + if "cat_ord" in expected_df: + # ordered categorical columns should be preserved + dtype = input_df["cat_ord"].dtype + expected_df["cat_ord"] = expected_df["cat_ord"].astype(dtype) + tm.assert_frame_equal(result_df, expected_df) @@ -1149,6 +1208,10 @@ def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data): multi_index = MultiIndex.from_tuples(tuple(multi_index_list)) expected_df = DataFrame(data=exp_data, columns=multi_index, index=cat_index) + for col in expected_df.columns: + if isinstance(col, tuple) and "cat_ord" in col: + # ordered categorical should be preserved + expected_df[col] = expected_df[col].astype(input_df["cat_ord"].dtype) tm.assert_frame_equal(result_df, expected_df) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 4a8aabe41b754..ded10ab11d5a8 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -196,6 +196,9 @@ def test_cython_agg_empty_buckets(op, targop, observed): g = df.groupby(pd.cut(df[0], grps), observed=observed) expected = g.agg(lambda x: targop(x)) + if observed and op not in ("min", "max"): + # TODO: GH 41137 + expected = expected.astype("int64") 
tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 13fddad30eeba..aa126ae801f1e 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -20,7 +20,7 @@ def test_series_grouper(): grouper = libreduction.SeriesGrouper(obj, np.mean, labels, 2) result, counts = grouper.get_result() - expected = np.array([obj[3:6].mean(), obj[6:].mean()]) + expected = np.array([obj[3:6].mean(), obj[6:].mean()], dtype=object) tm.assert_almost_equal(result, expected) exp_counts = np.array([3, 4], dtype=np.int64) @@ -36,7 +36,7 @@ def test_series_grouper_result_length_difference(): grouper = libreduction.SeriesGrouper(obj, lambda x: all(x > 0), labels, 2) result, counts = grouper.get_result() - expected = np.array([all(obj[3:6] > 0), all(obj[6:] > 0)]) + expected = np.array([all(obj[3:6] > 0), all(obj[6:] > 0)], dtype=object) tm.assert_equal(result, expected) exp_counts = np.array([3, 4], dtype=np.int64) @@ -61,7 +61,7 @@ def test_series_bin_grouper(): grouper = libreduction.SeriesBinGrouper(obj, np.mean, bins) result, counts = grouper.get_result() - expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()]) + expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()], dtype=object) tm.assert_almost_equal(result, expected) exp_counts = np.array([3, 3, 4], dtype=np.int64) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index da438826a939a..7349664614614 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -800,6 +800,12 @@ def test_preserve_on_ordered_ops(func, values): ).set_index("payload") tm.assert_frame_equal(result, expected) + # we should also preserve categorical for SeriesGroupBy + sgb = df.groupby("payload")["col"] + result = getattr(sgb, func)() + expected = expected["col"] + tm.assert_series_equal(result, expected) + def test_categorical_no_compress(): data = Series(np.random.randn(9)) @@ -1494,7 +1500,11 @@ def test_groupy_first_returned_categorical_instead_of_dataframe(func): df = DataFrame({"A": [1997], "B": Series(["b"], dtype="category").cat.as_ordered()}) df_grouped = df.groupby("A")["B"] result = getattr(df_grouped, func)() - expected = Series(["b"], index=Index([1997], name="A"), name="B") + + # ordered categorical dtype should be preserved + expected = Series( + ["b"], index=Index([1997], name="A"), name="B", dtype=df["B"].dtype + ) tm.assert_series_equal(result, expected) @@ -1561,7 +1571,15 @@ def test_agg_cython_category_not_implemented_fallback(): df["col_cat"] = df["col_num"].astype("category") result = df.groupby("col_num").col_cat.first() - expected = Series([1, 2, 3], index=Index([1, 2, 3], name="col_num"), name="col_cat") + + # ordered categorical dtype should definitely be preserved; + # this is unordered, so is less-clear case (if anything, it should raise) + expected = Series( + [1, 2, 3], + index=Index([1, 2, 3], name="col_num"), + name="col_cat", + dtype=df["col_cat"].dtype, + ) tm.assert_series_equal(result, expected) result = df.groupby("col_num").agg({"col_cat": "first"}) @@ -1576,6 +1594,10 @@ def test_aggregate_categorical_lost_index(func: str): df = DataFrame({"A": [1997], "B": ds}) result = df.groupby("A").agg({"B": func}) expected = DataFrame({"B": ["b"]}, index=Index([1997], name="A")) + + # ordered categorical dtype should be preserved + expected["B"] = expected["B"].astype(ds.dtype) + 
tm.assert_frame_equal(result, expected) @@ -1597,7 +1619,7 @@ def test_aggregate_categorical_with_isnan(): index = MultiIndex.from_arrays([[1, 1], [1, 2]], names=("A", "B")) expected = DataFrame( data={ - "numerical_col": [1.0, 0.0], + "numerical_col": [1, 0], "object_col": [0, 0], "categorical_col": [0, 0], }, @@ -1653,6 +1675,9 @@ def test_categorical_transform(): expected["status"] = expected["status"].astype(delivery_status_type) + # .transform(max) should preserve ordered categoricals + expected["last_status"] = expected["last_status"].astype(delivery_status_type) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 40f8135637292..3f43c34b6eb34 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -439,7 +439,8 @@ def test_median_empty_bins(observed): result = df.groupby(bins, observed=observed).median() expected = df.groupby(bins, observed=observed).agg(lambda x: x.median()) - tm.assert_frame_equal(result, expected) + # TODO: GH 41137 + tm.assert_frame_equal(result, expected, check_dtype=False) @pytest.mark.parametrize( @@ -619,7 +620,7 @@ def test_ops_general(op, targop): df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) - result = getattr(df.groupby(labels), op)().astype(float) + result = getattr(df.groupby(labels), op)() expected = df.groupby(labels).agg(targop) tm.assert_frame_equal(result, expected) @@ -1077,6 +1078,7 @@ def test_describe_with_duplicate_output_column_names(as_index): "c": [10, 20, 30, 40, 50, 60], }, columns=["a", "b", "b"], + copy=False, ) expected = ( diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3f6485be871f1..abfa2a23a4402 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -299,10 +299,9 @@ def f(x): return float(len(x)) agged = grouped.agg(f) - expected = Series([4, 2], index=["bar", "foo"]) + expected = Series([4.0, 2.0], index=["bar", "foo"]) - tm.assert_series_equal(agged, expected, check_dtype=False) - assert issubclass(agged.dtype.type, np.dtype(dtype).type) + tm.assert_series_equal(agged, expected) def test_indices_concatenation_order(): @@ -2020,6 +2019,12 @@ def test_groupby_crash_on_nunique(axis): tm.assert_frame_equal(result, expected) + # same thing, but empty columns + gb = df[[]].groupby(axis=axis_number, level=0) + res = gb.nunique() + exp = expected[[]] + tm.assert_frame_equal(res, exp) + def test_groupby_list_level(): # GH 9790 diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 14c117bf7257a..b22e4749bfdfc 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -242,7 +242,7 @@ def test_transform_bug(): # transforming on a datetime column df = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)}) result = df.groupby("A")["B"].transform(lambda x: x.rank(ascending=False)) - expected = Series(np.arange(5, 0, step=-1), name="B") + expected = Series(np.arange(5, 0, step=-1), name="B", dtype="float64") tm.assert_series_equal(result, expected) @@ -493,7 +493,7 @@ def test_groupby_transform_with_int(): ) with np.errstate(all="ignore"): result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) - expected = DataFrame({"B": np.nan, "C": [-1, 0, 1, -1, 0, 1]}) + expected = DataFrame({"B": np.nan, "C": [-1.0, 0.0, 1.0, -1.0, 0.0, 1.0]}) 
tm.assert_frame_equal(result, expected) # int that needs float conversion @@ -509,9 +509,9 @@ def test_groupby_transform_with_int(): expected = DataFrame({"B": np.nan, "C": concat([s1, s2])}) tm.assert_frame_equal(result, expected) - # int downcasting + # int doesn't get downcasted result = df.groupby("A").transform(lambda x: x * 2 / 2) - expected = DataFrame({"B": 1, "C": [2, 3, 4, 10, 5, -1]}) + expected = DataFrame({"B": 1.0, "C": [2.0, 3.0, 4.0, 10.0, 5.0, -1.0]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 678344f5b6909..40ab887d5bb5d 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -2,6 +2,7 @@ import pytest from pandas._libs import index as libindex +from pandas._libs.arrays import NDArrayBacked import pandas as pd from pandas import ( @@ -17,13 +18,17 @@ class TestCategoricalIndex(Base): - _holder = CategoricalIndex + _index_cls = CategoricalIndex + + @pytest.fixture + def simple_index(self) -> CategoricalIndex: + return self._index_cls(list("aabbca"), categories=list("cab"), ordered=False) @pytest.fixture def index(self, request): return tm.makeCategoricalIndex(100) - def create_index(self, categories=None, ordered=False): + def create_index(self, *, categories=None, ordered=False): if categories is None: categories = list("cab") return CategoricalIndex(list("aabbca"), categories=categories, ordered=ordered) @@ -33,9 +38,9 @@ def test_can_hold_identifiers(self): key = idx[0] assert idx._can_hold_identifiers_and_holds_name(key) is True - def test_insert(self): + def test_insert(self, simple_index): - ci = self.create_index() + ci = simple_index categories = ci.categories # test 0th element @@ -70,9 +75,9 @@ def test_insert_na_mismatched_dtype(self): expected = Index([pd.NaT, 0, 1, 1], dtype=object) tm.assert_index_equal(result, expected) - def test_delete(self): + def test_delete(self, simple_index): - ci = self.create_index() + ci = simple_index categories = ci.categories result = ci.delete(0) @@ -302,7 +307,8 @@ def test_engine_type(self, dtype, engine_type): # having 2**32 - 2**31 categories would be very memory-intensive, # so we cheat a bit with the dtype ci = CategoricalIndex(range(32768)) # == 2**16 - 2**(16 - 1) - ci.values._ndarray = ci.values._ndarray.astype("int64") + arr = ci.values._ndarray.astype("int64") + NDArrayBacked.__init__(ci._data, arr, ci.dtype) assert np.issubdtype(ci.codes.dtype, dtype) assert isinstance(ci._engine, engine_type) diff --git a/pandas/tests/indexes/categorical/test_constructors.py b/pandas/tests/indexes/categorical/test_constructors.py index 2acf79ee0bced..35620875d5a1a 100644 --- a/pandas/tests/indexes/categorical/test_constructors.py +++ b/pandas/tests/indexes/categorical/test_constructors.py @@ -108,8 +108,8 @@ def test_construction_with_dtype(self): tm.assert_index_equal(result, ci, exact=True) # make sure indexes are handled - expected = CategoricalIndex([0, 1, 2], categories=[0, 1, 2], ordered=True) idx = Index(range(3)) + expected = CategoricalIndex([0, 1, 2], categories=idx, ordered=True) result = CategoricalIndex(idx, categories=idx, ordered=True) tm.assert_index_equal(result, expected, exact=True) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index ab2b2db7eec53..45e1b615b1ade 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -1,3 +1,4 @@ +from datetime import datetime import gc from 
typing import Type @@ -5,6 +6,7 @@ import pytest from pandas._libs import iNaT +from pandas._libs.tslibs import Timestamp from pandas.core.dtypes.common import is_datetime64tz_dtype from pandas.core.dtypes.dtypes import CategoricalDtype @@ -13,6 +15,7 @@ from pandas import ( CategoricalIndex, DatetimeIndex, + Float64Index, Index, Int64Index, IntervalIndex, @@ -29,9 +32,15 @@ class Base: - """ base class for index sub-class tests """ + """ + Base class for index sub-class tests. + """ - _holder: Type[Index] + _index_cls: Type[Index] + + @pytest.fixture + def simple_index(self) -> Index: + raise NotImplementedError("Method not implemented") def create_index(self) -> Index: raise NotImplementedError("Method not implemented") @@ -45,12 +54,12 @@ def test_pickle_compat_construction(self): r"__new__\(\) takes at least 2 arguments \(1 given\)" ) with pytest.raises(TypeError, match=msg): - self._holder() + self._index_cls() @pytest.mark.parametrize("name", [None, "new_name"]) - def test_to_frame(self, name): + def test_to_frame(self, name, simple_index): # see GH-15230, GH-22580 - idx = self.create_index() + idx = simple_index if name: idx_name = name @@ -67,10 +76,10 @@ def test_to_frame(self, name): df = idx.to_frame(index=False, name=idx_name) assert df.index is not idx - def test_shift(self): + def test_shift(self, simple_index): # GH8083 test the base class for shift - idx = self.create_index() + idx = simple_index msg = ( f"This method is only implemented for DatetimeIndex, PeriodIndex and " f"TimedeltaIndex; Got type {type(idx).__name__}" @@ -80,18 +89,18 @@ def test_shift(self): with pytest.raises(NotImplementedError, match=msg): idx.shift(1, 2) - def test_constructor_name_unhashable(self): + def test_constructor_name_unhashable(self, simple_index): # GH#29069 check that name is hashable # See also same-named test in tests.series.test_constructors - idx = self.create_index() + idx = simple_index with pytest.raises(TypeError, match="Index.name must be a hashable type"): type(idx)(idx, name=[]) - def test_create_index_existing_name(self): + def test_create_index_existing_name(self, simple_index): # GH11193, when an existing index is passed, and a new name is not # specified, the new index should inherit the previous object name - expected = self.create_index() + expected = simple_index if not isinstance(expected, MultiIndex): expected.name = "foo" result = Index(expected) @@ -140,9 +149,9 @@ def test_create_index_existing_name(self): ), ) - def test_numeric_compat(self): + def test_numeric_compat(self, simple_index): - idx = self.create_index() + idx = simple_index # Check that this doesn't cover MultiIndex case, if/when it does, # we can remove multi.test_compat.test_numeric_compat assert not isinstance(idx, MultiIndex) @@ -183,21 +192,21 @@ def test_numeric_compat(self): with pytest.raises(TypeError, match=floordiv_err): 1 // idx - def test_logical_compat(self): - idx = self.create_index() + def test_logical_compat(self, simple_index): + idx = simple_index with pytest.raises(TypeError, match="cannot perform all"): idx.all() with pytest.raises(TypeError, match="cannot perform any"): idx.any() - def test_repr_roundtrip(self): + def test_repr_roundtrip(self, simple_index): - idx = self.create_index() + idx = simple_index tm.assert_index_equal(eval(repr(idx)), idx) - def test_repr_max_seq_item_setting(self): + def test_repr_max_seq_item_setting(self, simple_index): # GH10182 - idx = self.create_index() + idx = simple_index idx = idx.repeat(50) with pd.option_context("display.max_seq_items", 
None): repr(idx) @@ -331,42 +340,42 @@ def test_numpy_argsort(self, index): with pytest.raises(ValueError, match=msg): np.argsort(index, order=("a", "b")) - def test_repeat(self): + def test_repeat(self, simple_index): rep = 2 - i = self.create_index() - expected = Index(i.values.repeat(rep), name=i.name) - tm.assert_index_equal(i.repeat(rep), expected) + idx = simple_index.copy() + expected = Index(idx.values.repeat(rep), name=idx.name) + tm.assert_index_equal(idx.repeat(rep), expected) - i = self.create_index() - rep = np.arange(len(i)) - expected = Index(i.values.repeat(rep), name=i.name) - tm.assert_index_equal(i.repeat(rep), expected) + idx = simple_index + rep = np.arange(len(idx)) + expected = Index(idx.values.repeat(rep), name=idx.name) + tm.assert_index_equal(idx.repeat(rep), expected) - def test_numpy_repeat(self): + def test_numpy_repeat(self, simple_index): rep = 2 - i = self.create_index() - expected = i.repeat(rep) - tm.assert_index_equal(np.repeat(i, rep), expected) + idx = simple_index + expected = idx.repeat(rep) + tm.assert_index_equal(np.repeat(idx, rep), expected) msg = "the 'axis' parameter is not supported" with pytest.raises(ValueError, match=msg): - np.repeat(i, rep, axis=0) + np.repeat(idx, rep, axis=0) @pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) - def test_where(self, klass): - i = self.create_index() - if isinstance(i, (DatetimeIndex, TimedeltaIndex)): + def test_where(self, klass, simple_index): + idx = simple_index + if isinstance(idx, (DatetimeIndex, TimedeltaIndex)): # where does not preserve freq - i = i._with_freq(None) + idx = idx._with_freq(None) - cond = [True] * len(i) - result = i.where(klass(cond)) - expected = i + cond = [True] * len(idx) + result = idx.where(klass(cond)) + expected = idx tm.assert_index_equal(result, expected) - cond = [False] + [True] * len(i[1:]) - expected = Index([i._na_value] + i[1:].tolist(), dtype=i.dtype) - result = i.where(klass(cond)) + cond = [False] + [True] * len(idx[1:]) + expected = Index([idx._na_value] + idx[1:].tolist(), dtype=idx.dtype) + result = idx.where(klass(cond)) tm.assert_index_equal(result, expected) def test_insert_base(self, index): @@ -424,9 +433,9 @@ def test_equals(self, index): # do not test MultiIndex assert not index.equals(Series(index)) - def test_equals_op(self): + def test_equals_op(self, simple_index): # GH9947, GH10637 - index_a = self.create_index() + index_a = simple_index n = len(index_a) index_b = index_a[0:-1] @@ -487,15 +496,15 @@ def test_equals_op(self): # For RangeIndex we can convert to Int64Index tm.assert_series_equal(series_a == item, Series(expected3)) - def test_format(self): + def test_format(self, simple_index): # GH35439 - idx = self.create_index() + idx = simple_index expected = [str(x) for x in idx] assert idx.format() == expected def test_format_empty(self): # GH35712 - empty_idx = self._holder([]) + empty_idx = self._index_cls([]) assert empty_idx.format() == [] assert empty_idx.format(name=True) == [""] @@ -588,29 +597,29 @@ def test_nulls(self, index): tm.assert_numpy_array_equal(index.isna(), result) tm.assert_numpy_array_equal(index.notna(), ~result) - def test_empty(self): + def test_empty(self, simple_index): # GH 15270 - index = self.create_index() - assert not index.empty - assert index[:0].empty + idx = simple_index + assert not idx.empty + assert idx[:0].empty - def test_join_self_unique(self, join_type): - index = self.create_index() - if index.is_unique: - joined = index.join(index, how=join_type) - assert (index == joined).all() + def 
test_join_self_unique(self, join_type, simple_index): + idx = simple_index + if idx.is_unique: + joined = idx.join(idx, how=join_type) + assert (idx == joined).all() - def test_map(self): + def test_map(self, simple_index): # callable - index = self.create_index() + idx = simple_index # we don't infer UInt64 - if isinstance(index, UInt64Index): - expected = index.astype("int64") + if isinstance(idx, UInt64Index): + expected = idx.astype("int64") else: - expected = index + expected = idx - result = index.map(lambda x: x) + result = idx.map(lambda x: x) # For RangeIndex we convert to Int64Index tm.assert_index_equal(result, expected) @@ -621,66 +630,66 @@ def test_map(self): lambda values, index: Series(values, index), ], ) - def test_map_dictlike(self, mapper): + def test_map_dictlike(self, mapper, simple_index): - index = self.create_index() - if isinstance(index, CategoricalIndex): - pytest.skip(f"skipping tests for {type(index)}") + idx = simple_index + if isinstance(idx, CategoricalIndex): + pytest.skip(f"skipping tests for {type(idx)}") - identity = mapper(index.values, index) + identity = mapper(idx.values, idx) # we don't infer to UInt64 for a dict - if isinstance(index, UInt64Index) and isinstance(identity, dict): - expected = index.astype("int64") + if isinstance(idx, UInt64Index) and isinstance(identity, dict): + expected = idx.astype("int64") else: - expected = index + expected = idx - result = index.map(identity) + result = idx.map(identity) # For RangeIndex we convert to Int64Index tm.assert_index_equal(result, expected) # empty mappable - expected = Index([np.nan] * len(index)) - result = index.map(mapper(expected, index)) + expected = Index([np.nan] * len(idx)) + result = idx.map(mapper(expected, idx)) tm.assert_index_equal(result, expected) - def test_map_str(self): + def test_map_str(self, simple_index): # GH 31202 - index = self.create_index() - result = index.map(str) - expected = Index([str(x) for x in index], dtype=object) + idx = simple_index + result = idx.map(str) + expected = Index([str(x) for x in idx], dtype=object) tm.assert_index_equal(result, expected) @pytest.mark.parametrize("copy", [True, False]) @pytest.mark.parametrize("name", [None, "foo"]) @pytest.mark.parametrize("ordered", [True, False]) - def test_astype_category(self, copy, name, ordered): + def test_astype_category(self, copy, name, ordered, simple_index): # GH 18630 - index = self.create_index() + idx = simple_index if name: - index = index.rename(name) + idx = idx.rename(name) # standard categories dtype = CategoricalDtype(ordered=ordered) - result = index.astype(dtype, copy=copy) - expected = CategoricalIndex(index.values, name=name, ordered=ordered) - tm.assert_index_equal(result, expected) + result = idx.astype(dtype, copy=copy) + expected = CategoricalIndex(idx, name=name, ordered=ordered) + tm.assert_index_equal(result, expected, exact=True) # non-standard categories - dtype = CategoricalDtype(index.unique().tolist()[:-1], ordered) - result = index.astype(dtype, copy=copy) - expected = CategoricalIndex(index.values, name=name, dtype=dtype) - tm.assert_index_equal(result, expected) + dtype = CategoricalDtype(idx.unique().tolist()[:-1], ordered) + result = idx.astype(dtype, copy=copy) + expected = CategoricalIndex(idx, name=name, dtype=dtype) + tm.assert_index_equal(result, expected, exact=True) if ordered is False: # dtype='category' defaults to ordered=False, so only test once - result = index.astype("category", copy=copy) - expected = CategoricalIndex(index.values, name=name) - 
tm.assert_index_equal(result, expected) + result = idx.astype("category", copy=copy) + expected = CategoricalIndex(idx, name=name) + tm.assert_index_equal(result, expected, exact=True) - def test_is_unique(self): + def test_is_unique(self, simple_index): # initialize a unique index - index = self.create_index().drop_duplicates() + index = simple_index.drop_duplicates() assert index.is_unique is True # empty index should be unique @@ -700,32 +709,32 @@ def test_is_unique(self): assert index_na_dup.is_unique is False @pytest.mark.arm_slow - def test_engine_reference_cycle(self): + def test_engine_reference_cycle(self, simple_index): # GH27585 - index = self.create_index() + index = simple_index nrefs_pre = len(gc.get_referrers(index)) index._engine assert len(gc.get_referrers(index)) == nrefs_pre - def test_getitem_2d_deprecated(self): + def test_getitem_2d_deprecated(self, simple_index): # GH#30588 - idx = self.create_index() + idx = simple_index with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): res = idx[:, None] assert isinstance(res, np.ndarray), type(res) - def test_copy_shares_cache(self): + def test_copy_shares_cache(self, simple_index): # GH32898, GH36840 - idx = self.create_index() + idx = simple_index idx.get_loc(idx[0]) # populates the _cache. copy = idx.copy() assert copy._cache is idx._cache - def test_shallow_copy_shares_cache(self): + def test_shallow_copy_shares_cache(self, simple_index): # GH32669, GH36840 - idx = self.create_index() + idx = simple_index idx.get_loc(idx[0]) # populates the _cache. shallow_copy = idx._view() @@ -734,3 +743,91 @@ def test_shallow_copy_shares_cache(self): shallow_copy = idx._shallow_copy(idx._data) assert shallow_copy._cache is not idx._cache assert shallow_copy._cache == {} + + def test_index_groupby(self, simple_index): + idx = simple_index[:5] + to_groupby = np.array([1, 2, np.nan, 2, 1]) + tm.assert_dict_equal( + idx.groupby(to_groupby), {1.0: idx[[0, 4]], 2.0: idx[[1, 3]]} + ) + + to_groupby = DatetimeIndex( + [ + datetime(2011, 11, 1), + datetime(2011, 12, 1), + pd.NaT, + datetime(2011, 12, 1), + datetime(2011, 11, 1), + ], + tz="UTC", + ).values + + ex_keys = [Timestamp("2011-11-01"), Timestamp("2011-12-01")] + expected = {ex_keys[0]: idx[[0, 4]], ex_keys[1]: idx[[1, 3]]} + tm.assert_dict_equal(idx.groupby(to_groupby), expected) + + +class NumericBase(Base): + """ + Base class for numeric index (incl. RangeIndex) sub-class tests. 
+ """ + + def test_where(self): + # Tested in numeric.test_indexing + pass + + def test_can_hold_identifiers(self, simple_index): + idx = simple_index + key = idx[0] + assert idx._can_hold_identifiers_and_holds_name(key) is False + + def test_format(self, simple_index): + # GH35439 + idx = simple_index + max_width = max(len(str(x)) for x in idx) + expected = [str(x).ljust(max_width) for x in idx] + assert idx.format() == expected + + def test_numeric_compat(self): + pass # override Base method + + def test_insert_na(self, nulls_fixture, simple_index): + # GH 18295 (test missing) + index = simple_index + na_val = nulls_fixture + + if na_val is pd.NaT: + expected = Index([index[0], pd.NaT] + list(index[1:]), dtype=object) + else: + expected = Float64Index([index[0], np.nan] + list(index[1:])) + + result = index.insert(1, na_val) + tm.assert_index_equal(result, expected) + + def test_arithmetic_explicit_conversions(self): + # GH 8608 + # add/sub are overridden explicitly for Float/Int Index + index_cls = self._index_cls + if index_cls is RangeIndex: + idx = RangeIndex(5) + else: + idx = index_cls(np.arange(5, dtype="int64")) + + # float conversions + arr = np.arange(5, dtype="int64") * 3.2 + expected = Float64Index(arr) + fidx = idx * 3.2 + tm.assert_index_equal(fidx, expected) + fidx = 3.2 * idx + tm.assert_index_equal(fidx, expected) + + # interops with numpy arrays + expected = Float64Index(arr) + a = np.zeros(5, dtype="float64") + result = fidx - a + tm.assert_index_equal(result, expected) + + expected = Float64Index(-arr) + a = np.zeros(5, dtype="float64") + result = a - fidx + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index 4c8ab27d2c824..a8f8406e24fef 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -9,33 +9,33 @@ class DatetimeLike(Base): - def test_argsort_matches_array(self): - rng = self.create_index() - rng = rng.insert(1, pd.NaT) + def test_argsort_matches_array(self, simple_index): + idx = simple_index + idx = idx.insert(1, pd.NaT) - result = rng.argsort() - expected = rng._data.argsort() + result = idx.argsort() + expected = idx._data.argsort() tm.assert_numpy_array_equal(result, expected) - def test_can_hold_identifiers(self): - idx = self.create_index() + def test_can_hold_identifiers(self, simple_index): + idx = simple_index key = idx[0] assert idx._can_hold_identifiers_and_holds_name(key) is False - def test_shift_identity(self): + def test_shift_identity(self, simple_index): - idx = self.create_index() + idx = simple_index tm.assert_index_equal(idx, idx.shift(0)) - def test_shift_empty(self): + def test_shift_empty(self, simple_index): # GH#14811 - idx = self.create_index()[:0] + idx = simple_index[:0] tm.assert_index_equal(idx, idx.shift(1)) - def test_str(self): + def test_str(self, simple_index): # test the string repr - idx = self.create_index() + idx = simple_index idx.name = "foo" assert not (f"length={len(idx)}" in str(idx)) assert "'foo'" in str(idx) @@ -47,19 +47,19 @@ def test_str(self): if hasattr(idx, "freq"): assert f"freq='{idx.freqstr}'" in str(idx) - def test_view(self): - i = self.create_index() + def test_view(self, simple_index): + idx = simple_index - i_view = i.view("i8") - result = self._holder(i) - tm.assert_index_equal(result, i) + idx_view = idx.view("i8") + result = self._index_cls(idx) + tm.assert_index_equal(result, idx) - i_view = i.view(self._holder) - result = self._holder(i) - tm.assert_index_equal(result, 
i_view) + idx_view = idx.view(self._index_cls) + result = self._index_cls(idx) + tm.assert_index_equal(result, idx_view) - def test_map_callable(self): - index = self.create_index() + def test_map_callable(self, simple_index): + index = simple_index expected = index + index.freq result = index.map(lambda x: x + x.freq) tm.assert_index_equal(result, expected) @@ -76,8 +76,8 @@ def test_map_callable(self): lambda values, index: pd.Series(values, index, dtype=object), ], ) - def test_map_dictlike(self, mapper): - index = self.create_index() + def test_map_dictlike(self, mapper, simple_index): + index = simple_index expected = index + index.freq # don't compare the freqs @@ -97,15 +97,15 @@ def test_map_dictlike(self, mapper): result = index.map(mapper([], [])) tm.assert_index_equal(result, expected) - def test_getitem_preserves_freq(self): - index = self.create_index() + def test_getitem_preserves_freq(self, simple_index): + index = simple_index assert index.freq is not None result = index[:] assert result.freq == index.freq - def test_where_cast_str(self): - index = self.create_index() + def test_where_cast_str(self, simple_index): + index = simple_index mask = np.ones(len(index), dtype=bool) mask[-1] = False diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index 94303359958b3..0a387fe3141e4 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -10,7 +10,11 @@ class TestDatetimeIndex(DatetimeLike): - _holder = DatetimeIndex + _index_cls = DatetimeIndex + + @pytest.fixture + def simple_index(self) -> DatetimeIndex: + return date_range("20130101", periods=5) @pytest.fixture( params=[tm.makeDateIndex(10), date_range("20130110", periods=10, freq="-1D")], @@ -19,12 +23,9 @@ class TestDatetimeIndex(DatetimeLike): def index(self, request): return request.param - def create_index(self) -> DatetimeIndex: - return date_range("20130101", periods=5) - - def test_format(self): + def test_format(self, simple_index): # GH35439 - idx = self.create_index() + idx = simple_index expected = [f"{x:%Y-%m-%d}" for x in idx] assert idx.format() == expected diff --git a/pandas/tests/indexes/interval/test_base.py b/pandas/tests/indexes/interval/test_base.py index 8bf418a2fc731..b14db459f996d 100644 --- a/pandas/tests/indexes/interval/test_base.py +++ b/pandas/tests/indexes/interval/test_base.py @@ -16,13 +16,17 @@ class TestBase(Base): in test_interval.py or the specific test file (e.g. 
test_astype.py) """ - _holder = IntervalIndex + _index_cls = IntervalIndex + + @pytest.fixture + def simple_index(self) -> IntervalIndex: + return self._index_cls.from_breaks(range(11), closed="right") @pytest.fixture def index(self): return tm.makeIntervalIndex(10) - def create_index(self, closed="right"): + def create_index(self, *, closed="right"): return IntervalIndex.from_breaks(range(11), closed=closed) def test_repr_max_seq_item_setting(self): @@ -44,8 +48,8 @@ def test_take(self, closed): tm.assert_index_equal(result, expected) @pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) - def test_where(self, closed, klass): - idx = self.create_index(closed=closed) + def test_where(self, simple_index, klass): + idx = simple_index cond = [True] * len(idx) expected = idx result = expected.where(klass(cond)) @@ -56,9 +60,9 @@ def test_where(self, closed, klass): result = idx.where(klass(cond)) tm.assert_index_equal(result, expected) - def test_getitem_2d_deprecated(self): + def test_getitem_2d_deprecated(self, simple_index): # GH#30588 multi-dim indexing is deprecated, but raising is also acceptable - idx = self.create_index() + idx = simple_index with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): idx[:, None] diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 4a170d9cd161f..0b59e832ce3a8 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -414,6 +414,18 @@ def test_union_empty_self_different_names(): tm.assert_index_equal(result, expected) +def test_union_multiindex_empty_rangeindex(): + # GH#41234 + mi = MultiIndex.from_arrays([[1, 2], [3, 4]], names=["a", "b"]) + ri = pd.RangeIndex(0) + + result_left = mi.union(ri) + tm.assert_index_equal(mi, result_left, check_names=False) + + result_right = ri.union(mi) + tm.assert_index_equal(mi, result_right, check_names=False) + + @pytest.mark.parametrize( "method", ["union", "intersection", "difference", "symmetric_difference"] ) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/numeric/test_numeric.py similarity index 56% rename from pandas/tests/indexes/test_numeric.py rename to pandas/tests/indexes/numeric/test_numeric.py index 99dadfba4e7aa..bfe06d74570da 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/numeric/test_numeric.py @@ -1,5 +1,3 @@ -from datetime import datetime - import numpy as np import pytest @@ -15,108 +13,17 @@ UInt64Index, ) import pandas._testing as tm -from pandas.tests.indexes.common import Base - - -class TestArithmetic: - @pytest.mark.parametrize( - "klass", [Float64Index, Int64Index, UInt64Index, RangeIndex] - ) - def test_arithmetic_explicit_conversions(self, klass): +from pandas.tests.indexes.common import NumericBase - # GH 8608 - # add/sub are overridden explicitly for Float/Int Index - if klass is RangeIndex: - idx = RangeIndex(5) - else: - idx = klass(np.arange(5, dtype="int64")) - - # float conversions - arr = np.arange(5, dtype="int64") * 3.2 - expected = Float64Index(arr) - fidx = idx * 3.2 - tm.assert_index_equal(fidx, expected) - fidx = 3.2 * idx - tm.assert_index_equal(fidx, expected) - - # interops with numpy arrays - expected = Float64Index(arr) - a = np.zeros(5, dtype="float64") - result = fidx - a - tm.assert_index_equal(result, expected) - - expected = Float64Index(-arr) - a = np.zeros(5, dtype="float64") - result = a - fidx - 
tm.assert_index_equal(result, expected) - - -class TestNumericIndex: - def test_index_groupby(self): - int_idx = Index(range(6)) - float_idx = Index(np.arange(0, 0.6, 0.1)) - obj_idx = Index("A B C D E F".split()) - dt_idx = pd.date_range("2013-01-01", freq="M", periods=6) - - for idx in [int_idx, float_idx, obj_idx, dt_idx]: - to_groupby = np.array([1, 2, np.nan, np.nan, 2, 1]) - tm.assert_dict_equal( - idx.groupby(to_groupby), {1.0: idx[[0, 5]], 2.0: idx[[1, 4]]} - ) - - to_groupby = pd.DatetimeIndex( - [ - datetime(2011, 11, 1), - datetime(2011, 12, 1), - pd.NaT, - pd.NaT, - datetime(2011, 12, 1), - datetime(2011, 11, 1), - ], - tz="UTC", - ).values - - ex_keys = [Timestamp("2011-11-01"), Timestamp("2011-12-01")] - expected = {ex_keys[0]: idx[[0, 5]], ex_keys[1]: idx[[1, 4]]} - tm.assert_dict_equal(idx.groupby(to_groupby), expected) - - -class Numeric(Base): - def test_where(self): - # Tested in numeric.test_indexing - pass - - def test_can_hold_identifiers(self): - idx = self.create_index() - key = idx[0] - assert idx._can_hold_identifiers_and_holds_name(key) is False - - def test_format(self): - # GH35439 - idx = self.create_index() - max_width = max(len(str(x)) for x in idx) - expected = [str(x).ljust(max_width) for x in idx] - assert idx.format() == expected - - def test_numeric_compat(self): - pass # override Base method - - def test_insert_na(self, nulls_fixture): - # GH 18295 (test missing) - index = self.create_index() - na_val = nulls_fixture - - if na_val is pd.NaT: - expected = Index([index[0], pd.NaT] + list(index[1:]), dtype=object) - else: - expected = Float64Index([index[0], np.nan] + list(index[1:])) - - result = index.insert(1, na_val) - tm.assert_index_equal(result, expected) +class TestFloat64Index(NumericBase): + _index_cls = Float64Index + _dtype = np.float64 -class TestFloat64Index(Numeric): - _holder = Float64Index + @pytest.fixture + def simple_index(self) -> Index: + values = np.arange(5, dtype=self._dtype) + return self._index_cls(values) @pytest.fixture( params=[ @@ -128,63 +35,73 @@ class TestFloat64Index(Numeric): ids=["mixed", "float", "mixed_dec", "float_dec"], ) def index(self, request): - return Float64Index(request.param) + return self._index_cls(request.param) @pytest.fixture def mixed_index(self): - return Float64Index([1.5, 2, 3, 4, 5]) + return self._index_cls([1.5, 2, 3, 4, 5]) @pytest.fixture def float_index(self): - return Float64Index([0.0, 2.5, 5.0, 7.5, 10.0]) - - def create_index(self) -> Float64Index: - return Float64Index(np.arange(5, dtype="float64")) + return self._index_cls([0.0, 2.5, 5.0, 7.5, 10.0]) def test_repr_roundtrip(self, index): tm.assert_index_equal(eval(repr(index)), index) - def check_is_index(self, i): - assert isinstance(i, Index) - assert not isinstance(i, Float64Index) + def check_is_index(self, idx): + assert isinstance(idx, Index) + assert not isinstance(idx, self._index_cls) def check_coerce(self, a, b, is_float_index=True): assert a.equals(b) tm.assert_index_equal(a, b, exact=False) if is_float_index: - assert isinstance(b, Float64Index) + assert isinstance(b, self._index_cls) else: self.check_is_index(b) def test_constructor(self): + index_cls = self._index_cls + dtype = self._dtype # explicit construction - index = Float64Index([1, 2, 3, 4, 5]) - assert isinstance(index, Float64Index) - expected = np.array([1, 2, 3, 4, 5], dtype="float64") + index = index_cls([1, 2, 3, 4, 5]) + + assert isinstance(index, index_cls) + assert index.dtype.type is dtype + + expected = np.array([1, 2, 3, 4, 5], dtype=dtype) 
tm.assert_numpy_array_equal(index.values, expected) - index = Float64Index(np.array([1, 2, 3, 4, 5])) - assert isinstance(index, Float64Index) - index = Float64Index([1.0, 2, 3, 4, 5]) - assert isinstance(index, Float64Index) - index = Float64Index(np.array([1.0, 2, 3, 4, 5])) - assert isinstance(index, Float64Index) - assert index.dtype == float - - index = Float64Index(np.array([1.0, 2, 3, 4, 5]), dtype=np.float32) - assert isinstance(index, Float64Index) - assert index.dtype == np.float64 - - index = Float64Index(np.array([1, 2, 3, 4, 5]), dtype=np.float32) - assert isinstance(index, Float64Index) - assert index.dtype == np.float64 + index = index_cls(np.array([1, 2, 3, 4, 5])) + assert isinstance(index, index_cls) + assert index.dtype == dtype + + index = index_cls([1.0, 2, 3, 4, 5]) + assert isinstance(index, index_cls) + assert index.dtype == dtype + + index = index_cls(np.array([1.0, 2, 3, 4, 5])) + assert isinstance(index, index_cls) + assert index.dtype == dtype + + index = index_cls(np.array([1.0, 2, 3, 4, 5]), dtype=np.float32) + assert isinstance(index, index_cls) + assert index.dtype == dtype + + index = index_cls(np.array([1, 2, 3, 4, 5]), dtype=np.float32) + assert isinstance(index, index_cls) + assert index.dtype == dtype # nan handling - result = Float64Index([np.nan, np.nan]) + result = index_cls([np.nan, np.nan]) assert pd.isna(result.values).all() - result = Float64Index(np.array([np.nan])) + + result = index_cls(np.array([np.nan])) assert pd.isna(result.values).all() + result = Index(np.array([np.nan])) + assert isinstance(result, index_cls) + assert result.dtype == dtype assert pd.isna(result.values).all() @pytest.mark.parametrize( @@ -205,14 +122,16 @@ def test_invalid_dtype(self, index, dtype): index([1, 2, 3], dtype=dtype) def test_constructor_invalid(self): + index_cls = self._index_cls + cls_name = index_cls.__name__ # invalid msg = ( - r"Float64Index\(\.\.\.\) must be called with a collection of " + rf"{cls_name}\(\.\.\.\) must be called with a collection of " r"some kind, 0\.0 was passed" ) with pytest.raises(TypeError, match=msg): - Float64Index(0.0) + index_cls(0.0) # 2021-02-1 we get ValueError in numpy 1.20, but not on all builds msg = "|".join( @@ -222,11 +141,13 @@ def test_constructor_invalid(self): ] ) with pytest.raises((TypeError, ValueError), match=msg): - Float64Index(["a", "b", 0.0]) + index_cls(["a", "b", 0.0]) - msg = r"float\(\) argument must be a string or a number, not 'Timestamp'" + msg = ( + r"float\(\) argument must be a string or a( real)? number, not 'Timestamp'" + ) with pytest.raises(TypeError, match=msg): - Float64Index([Timestamp("20130101")]) + index_cls([Timestamp("20130101")]) def test_constructor_coerce(self, mixed_index, float_index): @@ -255,24 +176,25 @@ def test_type_coercion_fail(self, any_int_dtype): def test_type_coercion_valid(self, float_dtype): # There is no Float32Index, so we always # generate Float64Index. 
- i = Index([1, 2, 3.5], dtype=float_dtype) - tm.assert_index_equal(i, Index([1, 2, 3.5])) + idx = Index([1, 2, 3.5], dtype=float_dtype) + tm.assert_index_equal(idx, Index([1, 2, 3.5])) def test_equals_numeric(self): + index_cls = self._index_cls - i = Float64Index([1.0, 2.0]) - assert i.equals(i) - assert i.identical(i) + idx = index_cls([1.0, 2.0]) + assert idx.equals(idx) + assert idx.identical(idx) - i2 = Float64Index([1.0, 2.0]) - assert i.equals(i2) + idx2 = index_cls([1.0, 2.0]) + assert idx.equals(idx2) - i = Float64Index([1.0, np.nan]) - assert i.equals(i) - assert i.identical(i) + idx = index_cls([1.0, np.nan]) + assert idx.equals(idx) + assert idx.identical(idx) - i2 = Float64Index([1.0, np.nan]) - assert i.equals(i2) + idx2 = index_cls([1.0, np.nan]) + assert idx.equals(idx2) @pytest.mark.parametrize( "other", @@ -283,9 +205,9 @@ def test_equals_numeric(self): ), ) def test_equals_numeric_other_index_type(self, other): - i = Float64Index([1.0, 2.0]) - assert i.equals(other) - assert other.equals(i) + idx = self._index_cls([1.0, 2.0]) + assert idx.equals(other) + assert other.equals(idx) @pytest.mark.parametrize( "vals", @@ -295,10 +217,12 @@ def test_equals_numeric_other_index_type(self, other): ], ) def test_lookups_datetimelike_values(self, vals): + dtype = self._dtype + # If we have datetime64 or timedelta64 values, make sure they are # wrappped correctly GH#31163 ser = Series(vals, index=range(3, 6)) - ser.index = ser.index.astype("float64") + ser.index = ser.index.astype(dtype) expected = vals[1] @@ -332,19 +256,21 @@ def test_lookups_datetimelike_values(self, vals): assert isinstance(result, type(expected)) and result == expected def test_doesnt_contain_all_the_things(self): - i = Float64Index([np.nan]) - assert not i.isin([0]).item() - assert not i.isin([1]).item() - assert i.isin([np.nan]).item() + idx = self._index_cls([np.nan]) + assert not idx.isin([0]).item() + assert not idx.isin([1]).item() + assert idx.isin([np.nan]).item() def test_nan_multiple_containment(self): - i = Float64Index([1.0, np.nan]) - tm.assert_numpy_array_equal(i.isin([1.0]), np.array([True, False])) - tm.assert_numpy_array_equal(i.isin([2.0, np.pi]), np.array([False, False])) - tm.assert_numpy_array_equal(i.isin([np.nan]), np.array([False, True])) - tm.assert_numpy_array_equal(i.isin([1.0, np.nan]), np.array([True, True])) - i = Float64Index([1.0, 2.0]) - tm.assert_numpy_array_equal(i.isin([np.nan]), np.array([False, False])) + index_cls = self._index_cls + + idx = index_cls([1.0, np.nan]) + tm.assert_numpy_array_equal(idx.isin([1.0]), np.array([True, False])) + tm.assert_numpy_array_equal(idx.isin([2.0, np.pi]), np.array([False, False])) + tm.assert_numpy_array_equal(idx.isin([np.nan]), np.array([False, True])) + tm.assert_numpy_array_equal(idx.isin([1.0, np.nan]), np.array([True, True])) + idx = index_cls([1.0, 2.0]) + tm.assert_numpy_array_equal(idx.isin([np.nan]), np.array([False, False])) def test_fillna_float64(self): # GH 11343 @@ -354,7 +280,7 @@ def test_fillna_float64(self): tm.assert_index_equal(idx.fillna(0.1), exp) # downcast - exp = Float64Index([1.0, 2.0, 3.0], name="x") + exp = self._index_cls([1.0, 2.0, 3.0], name="x") tm.assert_index_equal(idx.fillna(2), exp) # object @@ -362,32 +288,36 @@ def test_fillna_float64(self): tm.assert_index_equal(idx.fillna("obj"), exp) -class NumericInt(Numeric): +class NumericInt(NumericBase): def test_view(self): - i = self._holder([], name="Foo") - i_view = i.view() - assert i_view.name == "Foo" + index_cls = self._index_cls - i_view = 
i.view(self._dtype) - tm.assert_index_equal(i, self._holder(i_view, name="Foo")) + idx = index_cls([], name="Foo") + idx_view = idx.view() + assert idx_view.name == "Foo" - i_view = i.view(self._holder) - tm.assert_index_equal(i, self._holder(i_view, name="Foo")) + idx_view = idx.view(self._dtype) + tm.assert_index_equal(idx, index_cls(idx_view, name="Foo")) + + idx_view = idx.view(index_cls) + tm.assert_index_equal(idx, index_cls(idx_view, name="Foo")) def test_is_monotonic(self): - index = self._holder([1, 2, 3, 4]) + index_cls = self._index_cls + + index = index_cls([1, 2, 3, 4]) assert index.is_monotonic is True assert index.is_monotonic_increasing is True assert index._is_strictly_monotonic_increasing is True assert index.is_monotonic_decreasing is False assert index._is_strictly_monotonic_decreasing is False - index = self._holder([4, 3, 2, 1]) + index = index_cls([4, 3, 2, 1]) assert index.is_monotonic is False assert index._is_strictly_monotonic_increasing is False assert index._is_strictly_monotonic_decreasing is True - index = self._holder([1]) + index = index_cls([1]) assert index.is_monotonic is True assert index.is_monotonic_increasing is True assert index.is_monotonic_decreasing is True @@ -395,40 +325,43 @@ def test_is_monotonic(self): assert index._is_strictly_monotonic_decreasing is True def test_is_strictly_monotonic(self): - index = self._holder([1, 1, 2, 3]) + index_cls = self._index_cls + + index = index_cls([1, 1, 2, 3]) assert index.is_monotonic_increasing is True assert index._is_strictly_monotonic_increasing is False - index = self._holder([3, 2, 1, 1]) + index = index_cls([3, 2, 1, 1]) assert index.is_monotonic_decreasing is True assert index._is_strictly_monotonic_decreasing is False - index = self._holder([1, 1]) + index = index_cls([1, 1]) assert index.is_monotonic_increasing assert index.is_monotonic_decreasing assert not index._is_strictly_monotonic_increasing assert not index._is_strictly_monotonic_decreasing - def test_logical_compat(self): - idx = self.create_index() + def test_logical_compat(self, simple_index): + idx = simple_index assert idx.all() == idx.values.all() assert idx.any() == idx.values.any() - def test_identical(self): - index = self.create_index() - i = Index(index.copy()) - assert i.identical(index) + def test_identical(self, simple_index): + index = simple_index + + idx = Index(index.copy()) + assert idx.identical(index) - same_values_different_type = Index(i, dtype=object) - assert not i.identical(same_values_different_type) + same_values_different_type = Index(idx, dtype=object) + assert not idx.identical(same_values_different_type) - i = index.astype(dtype=object) - i = i.rename("foo") - same_values = Index(i, dtype=object) - assert same_values.identical(i) + idx = index.astype(dtype=object) + idx = idx.rename("foo") + same_values = Index(idx, dtype=object) + assert same_values.identical(idx) - assert not i.identical(index) - assert Index(same_values, name="foo", dtype=object).identical(i) + assert not idx.identical(index) + assert Index(same_values, name="foo", dtype=object).identical(idx) assert not index.astype(dtype=object).identical(index.astype(dtype=self._dtype)) @@ -440,58 +373,61 @@ def test_cant_or_shouldnt_cast(self): # can't data = ["foo", "bar", "baz"] with pytest.raises(TypeError, match=msg): - self._holder(data) + self._index_cls(data) # shouldn't data = ["0", "1", "2"] with pytest.raises(TypeError, match=msg): - self._holder(data) + self._index_cls(data) - def test_view_index(self): - index = self.create_index() + def 
test_view_index(self, simple_index): + index = simple_index index.view(Index) - def test_prevent_casting(self): - index = self.create_index() + def test_prevent_casting(self, simple_index): + index = simple_index result = index.astype("O") assert result.dtype == np.object_ class TestInt64Index(NumericInt): - _dtype = "int64" - _holder = Int64Index + _index_cls = Int64Index + _dtype = np.int64 + + @pytest.fixture + def simple_index(self) -> Index: + return self._index_cls(range(0, 20, 2), dtype=self._dtype) @pytest.fixture( params=[range(0, 20, 2), range(19, -1, -1)], ids=["index_inc", "index_dec"] ) def index(self, request): - return Int64Index(request.param) - - def create_index(self) -> Int64Index: - # return Int64Index(np.arange(5, dtype="int64")) - return Int64Index(range(0, 20, 2)) + return self._index_cls(request.param) def test_constructor(self): + index_cls = self._index_cls + dtype = self._dtype + # pass list, coerce fine - index = Int64Index([-5, 0, 1, 2]) - expected = Index([-5, 0, 1, 2], dtype=np.int64) + index = index_cls([-5, 0, 1, 2]) + expected = Index([-5, 0, 1, 2], dtype=dtype) tm.assert_index_equal(index, expected) # from iterable - index = Int64Index(iter([-5, 0, 1, 2])) + index = index_cls(iter([-5, 0, 1, 2])) tm.assert_index_equal(index, expected) # scalar raise Exception msg = ( - r"Int64Index\(\.\.\.\) must be called with a collection of some " + rf"{index_cls.__name__}\(\.\.\.\) must be called with a collection of some " "kind, 5 was passed" ) with pytest.raises(TypeError, match=msg): - Int64Index(5) + index_cls(5) # copy arr = index.values - new_index = Int64Index(arr, copy=True) + new_index = index_cls(arr, copy=True) tm.assert_index_equal(new_index, index) val = arr[0] + 3000 @@ -500,29 +436,32 @@ def test_constructor(self): assert new_index[0] != val # interpret list-like - expected = Int64Index([5, 0]) - for cls in [Index, Int64Index]: + expected = index_cls([5, 0]) + for cls in [Index, index_cls]: for idx in [ - cls([5, 0], dtype="int64"), - cls(np.array([5, 0]), dtype="int64"), - cls(Series([5, 0]), dtype="int64"), + cls([5, 0], dtype=dtype), + cls(np.array([5, 0]), dtype=dtype), + cls(Series([5, 0]), dtype=dtype), ]: tm.assert_index_equal(idx, expected) def test_constructor_corner(self): + index_cls = self._index_cls + dtype = self._dtype + arr = np.array([1, 2, 3, 4], dtype=object) - index = Int64Index(arr) - assert index.values.dtype == np.int64 + index = index_cls(arr) + assert index.values.dtype == dtype tm.assert_index_equal(index, Index(arr)) # preventing casting arr = np.array([1, "2", 3, "4"], dtype=object) with pytest.raises(TypeError, match="casting"): - Int64Index(arr) + index_cls(arr) arr_with_floats = [0, 2, 3, 4, 5, 1.25, 3, -1] with pytest.raises(TypeError, match="casting"): - Int64Index(arr_with_floats) + index_cls(arr_with_floats) def test_constructor_coercion_signed_to_unsigned(self, uint_dtype): @@ -534,14 +473,14 @@ def test_constructor_coercion_signed_to_unsigned(self, uint_dtype): def test_constructor_unwraps_index(self): idx = Index([1, 2]) - result = Int64Index(idx) - expected = np.array([1, 2], dtype="int64") + result = self._index_cls(idx) + expected = np.array([1, 2], dtype=self._dtype) tm.assert_numpy_array_equal(result._data, expected) def test_coerce_list(self): # coerce things arr = Index([1, 2, 3, 4]) - assert isinstance(arr, Int64Index) + assert isinstance(arr, self._index_cls) # but not if explicit dtype passed arr = Index([1, 2, 3, 4], dtype=object) @@ -550,8 +489,13 @@ def test_coerce_list(self): class 
TestUInt64Index(NumericInt): - _dtype = "uint64" - _holder = UInt64Index + _index_cls = UInt64Index + _dtype = np.uint64 + + @pytest.fixture + def simple_index(self) -> Index: + # compat with shared Int64/Float64 tests + return self._index_cls(np.arange(5, dtype=self._dtype)) @pytest.fixture( params=[ @@ -561,22 +505,21 @@ class TestUInt64Index(NumericInt): ids=["index_inc", "index_dec"], ) def index(self, request): - return UInt64Index(request.param) - - def create_index(self) -> UInt64Index: - # compat with shared Int64/Float64 tests - return UInt64Index(np.arange(5, dtype="uint64")) + return self._index_cls(request.param) def test_constructor(self): - idx = UInt64Index([1, 2, 3]) - res = Index([1, 2, 3], dtype=np.uint64) + index_cls = self._index_cls + dtype = self._dtype + + idx = index_cls([1, 2, 3]) + res = Index([1, 2, 3], dtype=dtype) tm.assert_index_equal(res, idx) - idx = UInt64Index([1, 2 ** 63]) - res = Index([1, 2 ** 63], dtype=np.uint64) + idx = index_cls([1, 2 ** 63]) + res = Index([1, 2 ** 63], dtype=dtype) tm.assert_index_equal(res, idx) - idx = UInt64Index([1, 2 ** 63]) + idx = index_cls([1, 2 ** 63]) res = Index([1, 2 ** 63]) tm.assert_index_equal(res, idx) @@ -585,8 +528,8 @@ def test_constructor(self): tm.assert_index_equal(res, idx) # https://github.com/pandas-dev/pandas/issues/29526 - idx = UInt64Index([1, 2 ** 63 + 1], dtype=np.uint64) - res = Index([1, 2 ** 63 + 1], dtype=np.uint64) + idx = index_cls([1, 2 ** 63 + 1], dtype=dtype) + res = Index([1, 2 ** 63 + 1], dtype=dtype) tm.assert_index_equal(res, idx) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 032b376f6d6a9..b80e92b105dbd 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -19,7 +19,11 @@ class TestPeriodIndex(DatetimeLike): - _holder = PeriodIndex + _index_cls = PeriodIndex + + @pytest.fixture + def simple_index(self) -> Index: + return period_range("20130101", periods=5, freq="D") @pytest.fixture( params=[ @@ -31,9 +35,6 @@ class TestPeriodIndex(DatetimeLike): def index(self, request): return request.param - def create_index(self) -> PeriodIndex: - return period_range("20130101", periods=5, freq="D") - def test_pickle_compat_construction(self): pass @@ -357,7 +358,7 @@ def test_map(self): def test_format_empty(self): # GH35712 - empty_idx = self._holder([], freq="A") + empty_idx = self._index_cls([], freq="A") assert empty_idx.format() == [] assert empty_idx.format(name=True) == [""] diff --git a/pandas/tests/indexes/ranges/test_constructors.py b/pandas/tests/indexes/ranges/test_constructors.py index 9539b0ff7cdba..e306b6e67cf7f 100644 --- a/pandas/tests/indexes/ranges/test_constructors.py +++ b/pandas/tests/indexes/ranges/test_constructors.py @@ -119,7 +119,9 @@ def test_constructor_range(self): expected = RangeIndex(1, 5, 2) tm.assert_index_equal(result, expected, exact=True) - msg = r"^from_range\(\) got an unexpected keyword argument" + msg = ( + r"(RangeIndex.)?from_range\(\) got an unexpected keyword argument( 'copy')?" 
+ ) with pytest.raises(TypeError, match=msg): RangeIndex.from_range(range(10), copy=True) diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 3f77c2c974842..3a4aa29ea620e 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -11,7 +11,7 @@ RangeIndex, ) import pandas._testing as tm -from pandas.tests.indexes.test_numeric import Numeric +from pandas.tests.indexes.common import NumericBase # aliases to make some tests easier to read RI = RangeIndex @@ -20,8 +20,12 @@ OI = Index -class TestRangeIndex(Numeric): - _holder = RangeIndex +class TestRangeIndex(NumericBase): + _index_cls = RangeIndex + + @pytest.fixture + def simple_index(self) -> Index: + return self._index_cls(start=0, stop=20, step=2) @pytest.fixture( params=[ @@ -33,16 +37,13 @@ class TestRangeIndex(Numeric): def index(self, request): return request.param - def create_index(self) -> RangeIndex: - return RangeIndex(start=0, stop=20, step=2) - - def test_can_hold_identifiers(self): - idx = self.create_index() + def test_can_hold_identifiers(self, simple_index): + idx = simple_index key = idx[0] assert idx._can_hold_identifiers_and_holds_name(key) is False - def test_too_many_names(self): - index = self.create_index() + def test_too_many_names(self, simple_index): + index = simple_index with pytest.raises(ValueError, match="^Length"): index.names = ["roger", "harold"] @@ -62,9 +63,9 @@ def test_start_stop_step_attrs(self, index, start, stop, step): assert index.step == step @pytest.mark.parametrize("attr_name", ["_start", "_stop", "_step"]) - def test_deprecated_start_stop_step_attrs(self, attr_name): + def test_deprecated_start_stop_step_attrs(self, attr_name, simple_index): # GH 26581 - idx = self.create_index() + idx = simple_index with tm.assert_produces_warning(FutureWarning): getattr(idx, attr_name) @@ -140,8 +141,8 @@ def test_view(self): i_view = i.view(RangeIndex) tm.assert_index_equal(i, i_view) - def test_dtype(self): - index = self.create_index() + def test_dtype(self, simple_index): + index = simple_index assert index.dtype == np.int64 def test_cache(self): @@ -253,13 +254,13 @@ def test_equals_range(self): assert left.equals(right) assert right.equals(left) - def test_logical_compat(self): - idx = self.create_index() + def test_logical_compat(self, simple_index): + idx = simple_index assert idx.all() == idx.values.all() assert idx.any() == idx.values.any() - def test_identical(self): - index = self.create_index() + def test_identical(self, simple_index): + index = simple_index i = Index(index.copy()) assert i.identical(index) @@ -304,17 +305,17 @@ def test_cant_or_shouldnt_cast(self, start, stop, step): with pytest.raises(TypeError, match=msg): RangeIndex(start, stop, step) - def test_view_index(self): - index = self.create_index() + def test_view_index(self, simple_index): + index = simple_index index.view(Index) - def test_prevent_casting(self): - index = self.create_index() + def test_prevent_casting(self, simple_index): + index = simple_index result = index.astype("O") assert result.dtype == np.object_ - def test_repr_roundtrip(self): - index = self.create_index() + def test_repr_roundtrip(self, simple_index): + index = simple_index tm.assert_index_equal(eval(repr(index)), index) def test_slice_keep_name(self): @@ -325,8 +326,8 @@ def test_has_duplicates(self, index): assert index.is_unique assert not index.has_duplicates - def test_extended_gcd(self): - index = self.create_index() + def test_extended_gcd(self, 
simple_index): + index = simple_index result = index._extended_gcd(6, 10) assert result[0] == result[1] * 6 + result[2] * 10 assert 2 == result[0] @@ -375,8 +376,8 @@ def test_pickle_compat_construction(self): # RangeIndex() is a valid constructor pass - def test_slice_specialised(self): - index = self.create_index() + def test_slice_specialised(self, simple_index): + index = simple_index index.name = "foo" # scalar indexing @@ -506,7 +507,7 @@ def test_engineless_lookup(self): def test_format_empty(self): # GH35712 - empty_idx = self._holder(0) + empty_idx = self._index_cls(0) assert empty_idx.format() == [] assert empty_idx.format(name=True) == [""] diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 1e9348dc410d7..b5822b768fdde 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -46,13 +46,14 @@ class TestIndex(Base): - _holder = Index + _index_cls = Index - def create_index(self) -> Index: - return Index(list("abcde")) + @pytest.fixture + def simple_index(self) -> Index: + return self._index_cls(list("abcde")) - def test_can_hold_identifiers(self): - index = self.create_index() + def test_can_hold_identifiers(self, simple_index): + index = simple_index key = index[0] assert index._can_hold_identifiers_and_holds_name(key) is True @@ -77,8 +78,6 @@ def test_constructor_casting(self, index): @pytest.mark.parametrize("index", ["string"], indirect=True) def test_constructor_copy(self, index): - # copy - # index = self.create_index() arr = np.array(index) new_index = Index(arr, copy=True, name="name") assert isinstance(new_index, Index) @@ -600,8 +599,8 @@ def test_booleanindex(self, index): for i, val in enumerate(sub_index): assert sub_index.get_loc(val) == i - def test_fancy(self): - index = self.create_index() + def test_fancy(self, simple_index): + index = simple_index sl = index[[1, 2, 3]] for i in sl: assert i == sl[sl.get_loc(i)] @@ -628,9 +627,9 @@ def test_empty_fancy_raises(self, index): with pytest.raises(IndexError, match=msg): index[empty_farr] - def test_union_dt_as_obj(self, sort): + def test_union_dt_as_obj(self, sort, simple_index): # TODO: Replace with fixturesult - index = self.create_index() + index = simple_index date_index = date_range("2019-01-01", periods=10) first_cat = index.union(date_index) second_cat = index.union(index) @@ -754,9 +753,9 @@ def test_append_empty_preserve_name(self, name, expected): result = left.append(right) assert result.name == expected - def test_is_mixed_deprecated(self): + def test_is_mixed_deprecated(self, simple_index): # GH#32922 - index = self.create_index() + index = simple_index with tm.assert_produces_warning(FutureWarning): index.is_mixed() @@ -866,8 +865,8 @@ def test_format_datetime_with_time(self): assert result == expected @pytest.mark.parametrize("op", ["any", "all"]) - def test_logical_compat(self, op): - index = self.create_index() + def test_logical_compat(self, op, simple_index): + index = simple_index assert getattr(index, op)() == getattr(index.values, op)() @pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) @@ -973,9 +972,9 @@ def test_is_monotonic_incomparable(self, attr): index = Index([5, datetime.now(), 7]) assert not getattr(index, attr) - def test_set_value_deprecated(self): + def test_set_value_deprecated(self, simple_index): # GH 28621 - idx = self.create_index() + idx = simple_index arr = np.array([1, 2, 3]) with tm.assert_produces_warning(FutureWarning): idx.set_value(arr, idx[1], 80) @@ -1415,29 +1414,30 @@ 
class TestMixedIntIndex(Base): # Mostly the tests from common.py for which the results differ # in py2 and py3 because ints and strings are uncomparable in py3 # (GH 13514) - _holder = Index + _index_cls = Index + + @pytest.fixture + def simple_index(self) -> Index: + return self._index_cls([0, "a", 1, "b", 2, "c"]) @pytest.fixture(params=[[0, "a", 1, "b", 2, "c"]], ids=["mixedIndex"]) def index(self, request): return Index(request.param) - def create_index(self) -> Index: - return Index([0, "a", 1, "b", 2, "c"]) - - def test_argsort(self): - index = self.create_index() + def test_argsort(self, simple_index): + index = simple_index with pytest.raises(TypeError, match="'>|<' not supported"): index.argsort() - def test_numpy_argsort(self): - index = self.create_index() + def test_numpy_argsort(self, simple_index): + index = simple_index with pytest.raises(TypeError, match="'>|<' not supported"): np.argsort(index) - def test_copy_name(self): + def test_copy_name(self, simple_index): # Check that "name" argument passed at initialization is honoured # GH12309 - index = self.create_index() + index = simple_index first = type(index)(index, copy=True, name="mario") second = type(first)(first, copy=False) @@ -1482,8 +1482,8 @@ def test_unique_na(self): result = idx.unique() tm.assert_index_equal(result, expected) - def test_logical_compat(self): - index = self.create_index() + def test_logical_compat(self, simple_index): + index = simple_index assert index.all() == index.values.all() assert index.any() == index.values.any() diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index d0f4828e8c7bd..478697ed1a5be 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -20,19 +20,20 @@ class TestTimedeltaIndex(DatetimeLike): - _holder = TimedeltaIndex + _index_cls = TimedeltaIndex @pytest.fixture - def index(self): - return tm.makeTimedeltaIndex(10) - - def create_index(self) -> TimedeltaIndex: + def simple_index(self) -> TimedeltaIndex: index = pd.to_timedelta(range(5), unit="d")._with_freq("infer") assert index.freq == "D" ret = index + pd.offsets.Hour(1) assert ret.freq == "D" return ret + @pytest.fixture + def index(self): + return tm.makeTimedeltaIndex(10) + def test_numeric_compat(self): # Dummy method to override super's version; this test is now done # in test_arithmetic.py diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index ad0d4245d58c3..446b616111e9e 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -122,7 +122,11 @@ def test_iloc_setitem_ea_inplace(self, frame_or_series, box, using_array_manager else: values = obj[0].values - obj.iloc[:2] = box(arr[2:]) + if frame_or_series is Series: + obj.iloc[:2] = box(arr[2:]) + else: + obj.iloc[:2, 0] = box(arr[2:]) + expected = frame_or_series(np.array([3, 4, 3, 4], dtype="i8")) tm.assert_equal(obj, expected) diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index 0062d5aa34319..21299d76eaf5a 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -3,6 +3,7 @@ in core.internals """ +import pandas as pd from pandas.core import internals from pandas.core.internals import api @@ -44,3 +45,12 @@ def test_namespace(): result = [x for x in dir(internals) if not x.startswith("__")] assert set(result) == set(expected + modules) + + +def test_make_block_2d_with_dti(): + # GH#41168 + 
dti = pd.date_range("2012", periods=3, tz="UTC") + blk = api.make_block(dti, placement=[0]) + + assert blk.shape == (1, 3) + assert blk.values.shape == (1, 3) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 3299503dbc3a4..08dba5aa76a2f 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -823,7 +823,7 @@ def test_equals_block_order_different_dtypes(self, mgr_string): def test_single_mgr_ctor(self): mgr = create_single_mgr("f8", num_rows=5) - assert mgr.as_array().tolist() == [0.0, 1.0, 2.0, 3.0, 4.0] + assert mgr.external_values().tolist() == [0.0, 1.0, 2.0, 3.0, 4.0] @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) def test_validate_bool_args(self, value): @@ -837,6 +837,12 @@ def test_validate_bool_args(self, value): bm1.replace_list([1], [2], inplace=value) +def _as_array(mgr): + if mgr.ndim == 1: + return mgr.external_values() + return mgr.as_array() + + class TestIndexing: # Nosetests-style data-driven tests. # @@ -859,7 +865,7 @@ class TestIndexing: @pytest.mark.parametrize("mgr", MANAGERS) def test_get_slice(self, mgr): def assert_slice_ok(mgr, axis, slobj): - mat = mgr.as_array() + mat = _as_array(mgr) # we maybe using an ndarray to test slicing and # might not be the full length of the axis @@ -881,7 +887,7 @@ def assert_slice_ok(mgr, axis, slobj): mat_slobj = (slice(None),) * axis + (slobj,) tm.assert_numpy_array_equal( - mat[mat_slobj], sliced.as_array(), check_dtype=False + mat[mat_slobj], _as_array(sliced), check_dtype=False ) tm.assert_index_equal(mgr.axes[axis][slobj], sliced.axes[axis]) @@ -919,10 +925,10 @@ def assert_slice_ok(mgr, axis, slobj): @pytest.mark.parametrize("mgr", MANAGERS) def test_take(self, mgr): def assert_take_ok(mgr, axis, indexer): - mat = mgr.as_array() + mat = _as_array(mgr) taken = mgr.take(indexer, axis) tm.assert_numpy_array_equal( - np.take(mat, indexer, axis), taken.as_array(), check_dtype=False + np.take(mat, indexer, axis), _as_array(taken), check_dtype=False ) tm.assert_index_equal(mgr.axes[axis].take(indexer), taken.axes[axis]) @@ -940,13 +946,13 @@ def assert_take_ok(mgr, axis, indexer): @pytest.mark.parametrize("fill_value", [None, np.nan, 100.0]) def test_reindex_axis(self, fill_value, mgr): def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value): - mat = mgr.as_array() + mat = _as_array(mgr) indexer = mgr.axes[axis].get_indexer_for(new_labels) reindexed = mgr.reindex_axis(new_labels, axis, fill_value=fill_value) tm.assert_numpy_array_equal( algos.take_nd(mat, indexer, axis, fill_value=fill_value), - reindexed.as_array(), + _as_array(reindexed), check_dtype=False, ) tm.assert_index_equal(reindexed.axes[axis], new_labels) @@ -971,13 +977,13 @@ def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value): @pytest.mark.parametrize("fill_value", [None, np.nan, 100.0]) def test_reindex_indexer(self, fill_value, mgr): def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value): - mat = mgr.as_array() + mat = _as_array(mgr) reindexed_mat = algos.take_nd(mat, indexer, axis, fill_value=fill_value) reindexed = mgr.reindex_indexer( new_labels, indexer, axis, fill_value=fill_value ) tm.assert_numpy_array_equal( - reindexed_mat, reindexed.as_array(), check_dtype=False + reindexed_mat, _as_array(reindexed), check_dtype=False ) tm.assert_index_equal(reindexed.axes[axis], new_labels) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 
6731c481f8935..abc65f2f1eda1 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -192,7 +192,7 @@ def test_readjson_chunks_multiple_empty_lines(chunksize): def test_readjson_unicode(monkeypatch): with tm.ensure_clean("test.json") as path: - monkeypatch.setattr("_bootlocale.getpreferredencoding", lambda l: "cp949") + monkeypatch.setattr("locale.getpreferredencoding", lambda l: "cp949") with open(path, "w", encoding="utf-8") as f: f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}') diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index a44d47470eb5e..805f6b8dbe461 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -828,65 +828,65 @@ def test_0d_array(self): ( [{}, []], ValueError, - "nesting not supported for object or variable length dtypes", + r"nesting not supported for object or variable length dtypes", {}, ), ( [42, None], TypeError, - "int() argument must be a string, a bytes-like object or a number, " - "not 'NoneType'", + r"int\(\) argument must be a string, a bytes-like object or a( real)? " + r"number, not 'NoneType'", {}, ), ( [["a"], 42], ValueError, - "Cannot decode multidimensional arrays with variable length elements " - "to numpy", + r"Cannot decode multidimensional arrays with variable length elements " + r"to numpy", {}, ), ( [42, {}, "a"], TypeError, - "int() argument must be a string, a bytes-like object or a number, " - "not 'dict'", + r"int\(\) argument must be a string, a bytes-like object or a( real)? " + r"number, not 'dict'", {}, ), ( [42, ["a"], 42], ValueError, - "invalid literal for int() with base 10: 'a'", + r"invalid literal for int\(\) with base 10: 'a'", {}, ), ( ["a", "b", [], "c"], ValueError, - "nesting not supported for object or variable length dtypes", + r"nesting not supported for object or variable length dtypes", {}, ), ( [{"a": "b"}], ValueError, - "Cannot decode multidimensional arrays with variable length elements " - "to numpy", + r"Cannot decode multidimensional arrays with variable length elements " + r"to numpy", {"labelled": True}, ), ( {"a": {"b": {"c": 42}}}, ValueError, - "labels only supported up to 2 dimensions", + r"labels only supported up to 2 dimensions", {"labelled": True}, ), ( [{"a": 42, "b": 23}, {"c": 17}], ValueError, - "cannot reshape array of size 3 into shape (2,1)", + r"cannot reshape array of size 3 into shape \(2,1\)", {"labelled": True}, ), ], ) def test_array_numpy_except(self, bad_input, exc_type, err_msg, kwargs): - with pytest.raises(exc_type, match=re.escape(err_msg)): + with pytest.raises(exc_type, match=err_msg): ujson.decode(ujson.dumps(bad_input), numpy=True, **kwargs) def test_array_numpy_labelled(self): diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 04ddef57a9621..290e063a59be7 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -52,7 +52,9 @@ import pandas.io.sql as sql from pandas.io.sql import ( + SQLAlchemyEngine, _gt14, + get_engine, read_sql_query, read_sql_table, ) @@ -575,6 +577,23 @@ def sample(pd_table, conn, keys, data_iter): # Nuke table self.drop_table("test_frame1") + def _to_sql_with_sql_engine(self, engine="auto", **engine_kwargs): + """`to_sql` with the `engine` param""" + # mostly copied from this class's `_to_sql()` method + self.drop_table("test_frame1") + + self.pandasSQL.to_sql( + self.test_frame1, "test_frame1", engine=engine, **engine_kwargs + ) + assert self.pandasSQL.has_table("test_frame1") + + num_entries = 
len(self.test_frame1) + num_rows = self._count_rows("test_frame1") + assert num_rows == num_entries + + # Nuke table + self.drop_table("test_frame1") + def _roundtrip(self): self.drop_table("test_frame_roundtrip") self.pandasSQL.to_sql(self.test_frame1, "test_frame_roundtrip") @@ -2053,6 +2072,41 @@ class Temporary(Base): tm.assert_frame_equal(df, expected) + # -- SQL Engine tests (in the base class for now) + def test_invalid_engine(self): + msg = "engine must be one of 'auto', 'sqlalchemy'" + with pytest.raises(ValueError, match=msg): + self._to_sql_with_sql_engine("bad_engine") + + def test_options_sqlalchemy(self): + # use the set option + + with pd.option_context("io.sql.engine", "sqlalchemy"): + self._to_sql_with_sql_engine() + + def test_options_auto(self): + # use the set option + + with pd.option_context("io.sql.engine", "auto"): + self._to_sql_with_sql_engine() + + def test_options_get_engine(self): + assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) + + with pd.option_context("io.sql.engine", "sqlalchemy"): + assert isinstance(get_engine("auto"), SQLAlchemyEngine) + assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) + + with pd.option_context("io.sql.engine", "auto"): + assert isinstance(get_engine("auto"), SQLAlchemyEngine) + assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) + + def test_get_engine_auto_error_message(self): + # Expect different error messages from get_engine(engine="auto") + # if engines aren't installed vs. are installed but bad version + pass + # TODO fill this in when we add more engines + class _TestSQLAlchemyConn(_EngineToConnMixin, _TestSQLAlchemy): def test_transactions(self): diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index bbe9ac6fa8094..66cb2f2291e98 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1206,6 +1206,9 @@ def test_resample_median_bug_1688(): result = df.resample("T").apply(lambda x: x.mean()) exp = df.asfreq("T") + if dtype == "float32": + # TODO: Empty groups cause x.mean() to return float64 + exp = exp.astype("float64") tm.assert_frame_equal(result, exp) result = df.resample("T").median() @@ -1686,6 +1689,8 @@ def f(data, add_arg): df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10)) result = df.groupby("A").resample("D").agg(f, multiplier) expected = df.groupby("A").resample("D").mean().multiply(multiplier) + # TODO: GH 41137 + expected = expected.astype("float64") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 999d8a6c90ba2..3e78d6ebf4c0c 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -258,6 +258,8 @@ def f(x): return x.resample("2s").apply(lambda y: y.sum()) result = g.apply(f) + # y.sum() results in int64 instead of int32 on 32-bit architectures + expected = expected.astype("int64") tm.assert_frame_equal(result, expected) @@ -289,7 +291,7 @@ def test_apply_columns_multilevel(): agg_dict = {col: (np.sum if col[3] == "one" else np.mean) for col in df.columns} result = df.resample("H").apply(lambda x: agg_dict[x.name](x)) expected = DataFrame( - np.array([0] * 4).reshape(2, 2), + 2 * [[0, 0.0]], index=date_range(start="2017-01-01", freq="1H", periods=2), columns=pd.MultiIndex.from_tuples( [("A", "a", "", "one"), ("B", "b", "i", "two")] @@ -354,11 +356,15 @@ def 
test_apply_to_one_column_of_df():
         {"col": range(10), "col1": range(10, 20)},
         index=date_range("2012-01-01", periods=10, freq="20min"),
     )
+
+    # access "col" via getattr -> make sure we handle AttributeError
     result = df.resample("H").apply(lambda group: group.col.sum())
     expected = Series(
         [3, 12, 21, 9], index=date_range("2012-01-01", periods=4, freq="H")
     )
     tm.assert_series_equal(result, expected)
+
+    # access "col" via __getitem__ -> make sure we handle KeyError
     result = df.resample("H").apply(lambda group: group["col"].sum())
     tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py
index c6ee295208607..b1560623cd871 100644
--- a/pandas/tests/resample/test_timedelta.py
+++ b/pandas/tests/resample/test_timedelta.py
@@ -162,7 +162,7 @@ def test_resample_with_timedelta_yields_no_empty_groups():
     result = df.loc["1s":, :].resample("3s").apply(lambda x: len(x))
     expected = DataFrame(
-        [[768.0] * 4] * 12 + [[528.0] * 4],
+        [[768] * 4] * 12 + [[528] * 4],
         index=timedelta_range(start="1s", periods=13, freq="3s"),
     )
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index 2ed38670e88a6..96b88dc61cfed 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -627,3 +627,14 @@ def test_concat_null_object_with_dti():
         index=exp_index,
     )
     tm.assert_frame_equal(result, expected)
+
+
+def test_concat_multiindex_with_empty_rangeindex():
+    # GH#41234
+    mi = MultiIndex.from_tuples([("B", 1), ("C", 1)])
+    df1 = DataFrame([[1, 2]], columns=mi)
+    df2 = DataFrame(index=[1], columns=pd.RangeIndex(0))
+
+    result = concat([df1, df2])
+    expected = DataFrame([[1, 2], [np.nan, np.nan]], columns=mi)
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py
index 44299d51a878f..62fd93026d5e2 100644
--- a/pandas/tests/reshape/test_crosstab.py
+++ b/pandas/tests/reshape/test_crosstab.py
@@ -559,6 +559,8 @@ def test_crosstab_with_numpy_size(self):
         expected = DataFrame(
             expected_data, index=expected_index, columns=expected_column
         )
+        # aggfunc is np.size, resulting in integers
+        expected["All"] = expected["All"].astype("int64")
         tm.assert_frame_equal(result, expected)
     def test_crosstab_duplicate_names(self):
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 3d1c3b81c492f..2276281e3ecf8 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -986,7 +986,6 @@ def test_margins_dtype(self):
         tm.assert_frame_equal(expected, result)
-    @pytest.mark.xfail(reason="GH#17035 (len of floats is casted back to floats)")
     def test_margins_dtype_len(self):
         mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")]
         mi = MultiIndex.from_tuples(mi_val, names=("A", "B"))
diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py
index 98ec4de614a07..2340d154e9e10 100644
--- a/pandas/tests/scalar/timestamp/test_constructors.py
+++ b/pandas/tests/scalar/timestamp/test_constructors.py
@@ -10,6 +10,7 @@
 import pytest
 import pytz
+from pandas.compat import PY310
 from pandas.errors import OutOfBoundsDatetime
 from pandas import (
@@ -223,7 +224,11 @@ def test_constructor_tz_or_tzinfo(self):
     def test_constructor_positional(self):
         # see gh-10758
-        msg = "an integer is required"
+        msg = (
+            "'NoneType' object cannot be interpreted as an integer"
+            if PY310
+            else "an integer is required"
+        )
         with pytest.raises(TypeError, match=msg):
             Timestamp(2000, 1)
diff --git a/pandas/tests/series/accessors/test_cat_accessor.py b/pandas/tests/series/accessors/test_cat_accessor.py
index 8a4c4d56e264d..7aea45755f940 100644
--- a/pandas/tests/series/accessors/test_cat_accessor.py
+++ b/pandas/tests/series/accessors/test_cat_accessor.py
@@ -48,7 +48,11 @@ def test_cat_accessor(self):
         assert not ser.cat.ordered, False
         exp = Categorical(["a", "b", np.nan, "a"], categories=["b", "a"])
-        return_value = ser.cat.set_categories(["b", "a"], inplace=True)
+
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            # issue #37643 inplace kwarg deprecated
+            return_value = ser.cat.set_categories(["b", "a"], inplace=True)
+
         assert return_value is None
         tm.assert_categorical_equal(ser.values, exp)
diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py
index 442718d677101..6185fe6c54fa4 100644
--- a/pandas/tests/series/methods/test_clip.py
+++ b/pandas/tests/series/methods/test_clip.py
@@ -40,6 +40,26 @@ def test_clip_types_and_nulls(self):
             assert list(isna(s)) == list(isna(lower))
             assert list(isna(s)) == list(isna(upper))
+    def test_series_clipping_with_na_values(
+        self, any_nullable_numeric_dtype, nulls_fixture
+    ):
+        # Ensure that clipping method can handle NA values without failing
+        # GH#40581
+
+        s = Series([nulls_fixture, 1.0, 3.0], dtype=any_nullable_numeric_dtype)
+        s_clipped_upper = s.clip(upper=2.0)
+        s_clipped_lower = s.clip(lower=2.0)
+
+        expected_upper = Series(
+            [nulls_fixture, 1.0, 2.0], dtype=any_nullable_numeric_dtype
+        )
+        expected_lower = Series(
+            [nulls_fixture, 2.0, 3.0], dtype=any_nullable_numeric_dtype
+        )
+
+        tm.assert_series_equal(s_clipped_upper, expected_upper)
+        tm.assert_series_equal(s_clipped_lower, expected_lower)
+
     def test_clip_with_na_args(self):
         """Should process np.nan argument as None """
         # GH#17276
diff --git a/pandas/tests/strings/conftest.py b/pandas/tests/strings/conftest.py
index 4fedbee91f649..17703d970e29e 100644
--- a/pandas/tests/strings/conftest.py
+++ b/pandas/tests/strings/conftest.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
+import pandas.util._test_decorators as td
+
 from pandas import Series
 from pandas.core import strings as strings
@@ -173,3 +175,24 @@ def any_allowed_skipna_inferred_dtype(request):
     # correctness of inference tested in tests/dtypes/test_inference.py
     return inferred_dtype, values
+
+
+@pytest.fixture(
+    params=[
+        "object",
+        "string",
+        pytest.param(
+            "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
+        ),
+    ]
+)
+def any_string_dtype(request):
+    """
+    Parametrized fixture for string dtypes.
+    * 'object'
+    * 'string'
+    * 'arrow_string'
+    """
+    from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
+
+    return request.param
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
index 0c54042d983ad..06a7c6d56a61d 100644
--- a/pandas/tests/strings/test_find_replace.py
+++ b/pandas/tests/strings/test_find_replace.py
@@ -4,8 +4,6 @@
 import numpy as np
 import pytest
-import pandas.util._test_decorators as td
-
 import pandas as pd
 from pandas import (
     Index,
@@ -14,27 +12,6 @@
 )
-@pytest.fixture(
-    params=[
-        "object",
-        "string",
-        pytest.param(
-            "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
-        ),
-    ]
-)
-def any_string_dtype(request):
-    """
-    Parametrized fixture for string dtypes.
- * 'object' - * 'string' - * 'arrow_string' - """ - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - return request.param - - def test_contains(any_string_dtype): values = np.array( ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ @@ -266,144 +243,157 @@ def test_endswith_nullable_string_dtype(nullable_string_dtype, na): tm.assert_series_equal(result, exp) -def test_replace(): - values = Series(["fooBAD__barBAD", np.nan]) +def test_replace(any_string_dtype): + values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) result = values.str.replace("BAD[_]*", "", regex=True) - exp = Series(["foobar", np.nan]) - tm.assert_series_equal(result, exp) + expected = Series(["foobar", np.nan], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) result = values.str.replace("BAD[_]*", "", n=1, regex=True) - exp = Series(["foobarBAD", np.nan]) - tm.assert_series_equal(result, exp) + expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) - # mixed + +def test_replace_mixed_object(): mixed = Series( ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) - rs = Series(mixed).str.replace("BAD[_]*", "", regex=True) - xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) + result = Series(mixed).str.replace("BAD[_]*", "", regex=True) + expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) + assert isinstance(result, Series) + tm.assert_almost_equal(result, expected) + - # flags + unicode - values = Series([b"abcd,\xc3\xa0".decode("utf-8")]) - exp = Series([b"abcd, \xc3\xa0".decode("utf-8")]) +def test_replace_unicode(any_string_dtype): + values = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) + expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) result = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) - tm.assert_series_equal(result, exp) + tm.assert_series_equal(result, expected) + - # GH 13438 +@pytest.mark.parametrize("klass", [Series, Index]) +@pytest.mark.parametrize("repl", [None, 3, {"a": "b"}]) +@pytest.mark.parametrize("data", [["a", "b", None], ["a", "b", "c", "ad"]]) +def test_replace_raises(any_string_dtype, klass, repl, data): + # https://github.com/pandas-dev/pandas/issues/13438 msg = "repl must be a string or callable" - for klass in (Series, Index): - for repl in (None, 3, {"a": "b"}): - for data in (["a", "b", None], ["a", "b", "c", "ad"]): - values = klass(data) - with pytest.raises(TypeError, match=msg): - values.str.replace("a", repl) + values = klass(data, dtype=any_string_dtype) + with pytest.raises(TypeError, match=msg): + values.str.replace("a", repl) -def test_replace_callable(): +def test_replace_callable(any_string_dtype): # GH 15055 - values = Series(["fooBAD__barBAD", np.nan]) + values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) # test with callable repl = lambda m: m.group(0).swapcase() result = values.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) - exp = Series(["foObaD__baRbaD", np.nan]) - tm.assert_series_equal(result, exp) + expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "repl", [lambda: None, lambda m, x: None, lambda m, x, y=None: None] +) +def test_replace_callable_raises(any_string_dtype, repl): + # GH 
15055 + values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) # test with wrong number of arguments, raising an error - p_err = ( + msg = ( r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " r"(?(3)required )positional arguments?" ) - - repl = lambda: None - with pytest.raises(TypeError, match=p_err): - values.str.replace("a", repl) - - repl = lambda m, x: None - with pytest.raises(TypeError, match=p_err): + with pytest.raises(TypeError, match=msg): values.str.replace("a", repl) - repl = lambda m, x, y=None: None - with pytest.raises(TypeError, match=p_err): - values.str.replace("a", repl) +def test_replace_callable_named_groups(any_string_dtype): # test regex named groups - values = Series(["Foo Bar Baz", np.nan]) + values = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype) pat = r"(?P\w+) (?P\w+) (?P\w+)" repl = lambda m: m.group("middle").swapcase() result = values.str.replace(pat, repl, regex=True) - exp = Series(["bAR", np.nan]) - tm.assert_series_equal(result, exp) + expected = Series(["bAR", np.nan], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) -def test_replace_compiled_regex(): +def test_replace_compiled_regex(any_string_dtype): # GH 15446 - values = Series(["fooBAD__barBAD", np.nan]) + values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) # test with compiled regex pat = re.compile(r"BAD_*") result = values.str.replace(pat, "", regex=True) - exp = Series(["foobar", np.nan]) - tm.assert_series_equal(result, exp) + expected = Series(["foobar", np.nan], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) result = values.str.replace(pat, "", n=1, regex=True) - exp = Series(["foobarBAD", np.nan]) - tm.assert_series_equal(result, exp) + expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) - # mixed + +def test_replace_compiled_regex_mixed_object(): + pat = re.compile(r"BAD_*") mixed = Series( ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) - rs = Series(mixed).str.replace(pat, "", regex=True) - xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) + result = Series(mixed).str.replace(pat, "", regex=True) + expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]) + assert isinstance(result, Series) + tm.assert_almost_equal(result, expected) + - # flags + unicode - values = Series([b"abcd,\xc3\xa0".decode("utf-8")]) - exp = Series([b"abcd, \xc3\xa0".decode("utf-8")]) +def test_replace_compiled_regex_unicode(any_string_dtype): + values = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) + expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) result = values.str.replace(pat, ", ") - tm.assert_series_equal(result, exp) + tm.assert_series_equal(result, expected) + +def test_replace_compiled_regex_raises(any_string_dtype): # case and flags provided to str.replace will have no effect # and will produce warnings - values = Series(["fooBAD__barBAD__bad", np.nan]) + values = Series(["fooBAD__barBAD__bad", np.nan], dtype=any_string_dtype) pat = re.compile(r"BAD_*") - with pytest.raises(ValueError, match="case and flags cannot be"): - result = values.str.replace(pat, "", flags=re.IGNORECASE) + msg = "case and flags cannot be set when pat is a compiled regex" + + with pytest.raises(ValueError, match=msg): + values.str.replace(pat, "", 
flags=re.IGNORECASE) + + with pytest.raises(ValueError, match=msg): + values.str.replace(pat, "", case=False) - with pytest.raises(ValueError, match="case and flags cannot be"): - result = values.str.replace(pat, "", case=False) + with pytest.raises(ValueError, match=msg): + values.str.replace(pat, "", case=True) - with pytest.raises(ValueError, match="case and flags cannot be"): - result = values.str.replace(pat, "", case=True) +def test_replace_compiled_regex_callable(any_string_dtype): # test with callable - values = Series(["fooBAD__barBAD", np.nan]) + values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) repl = lambda m: m.group(0).swapcase() pat = re.compile("[a-z][A-Z]{2}") result = values.str.replace(pat, repl, n=2) - exp = Series(["foObaD__baRbaD", np.nan]) - tm.assert_series_equal(result, exp) + expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) -def test_replace_literal(): +def test_replace_literal(any_string_dtype): # GH16808 literal replace (regex=False vs regex=True) - values = Series(["f.o", "foo", np.nan]) - exp = Series(["bao", "bao", np.nan]) + values = Series(["f.o", "foo", np.nan], dtype=any_string_dtype) + expected = Series(["bao", "bao", np.nan], dtype=any_string_dtype) result = values.str.replace("f.", "ba", regex=True) - tm.assert_series_equal(result, exp) + tm.assert_series_equal(result, expected) - exp = Series(["bao", "foo", np.nan]) + expected = Series(["bao", "foo", np.nan], dtype=any_string_dtype) result = values.str.replace("f.", "ba", regex=False) - tm.assert_series_equal(result, exp) + tm.assert_series_equal(result, expected) # Cannot do a literal replace if given a callable repl or compiled # pattern @@ -419,19 +409,39 @@ def test_replace_literal(): values.str.replace(compiled_pat, "", regex=False) -def test_match(): +def test_match(any_string_dtype): # New match behavior introduced in 0.13 - values = Series(["fooBAD__barBAD", np.nan, "foo"]) + expected_dtype = "object" if any_string_dtype == "object" else "boolean" + + values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype) result = values.str.match(".*(BAD[_]+).*(BAD)") - exp = Series([True, np.nan, False]) - tm.assert_series_equal(result, exp) + expected = Series([True, np.nan, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) - values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"]) + values = Series( + ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype + ) result = values.str.match(".*BAD[_]+.*BAD") - exp = Series([True, True, np.nan, False]) - tm.assert_series_equal(result, exp) + expected = Series([True, True, np.nan, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) - # mixed + result = values.str.match("BAD[_]+.*BAD") + expected = Series([False, True, np.nan, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + values = Series( + ["fooBAD__barBAD", "^BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype + ) + result = values.str.match("^BAD[_]+.*BAD") + expected = Series([False, False, np.nan, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + result = values.str.match("\\^BAD[_]+.*BAD") + expected = Series([False, True, np.nan, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + +def test_match_mixed_object(): mixed = Series( [ "aBAD_BAD", @@ -445,22 +455,34 @@ def test_match(): 2.0, ] ) - rs = 
Series(mixed).str.match(".*(BAD[_]+).*(BAD)") - xp = Series([True, np.nan, True, np.nan, np.nan, False, np.nan, np.nan, np.nan]) - assert isinstance(rs, Series) - tm.assert_series_equal(rs, xp) + result = Series(mixed).str.match(".*(BAD[_]+).*(BAD)") + expected = Series( + [True, np.nan, True, np.nan, np.nan, False, np.nan, np.nan, np.nan] + ) + assert isinstance(result, Series) + tm.assert_series_equal(result, expected) + + +def test_match_na_kwarg(any_string_dtype): + # GH #6609 + s = Series(["a", "b", np.nan], dtype=any_string_dtype) + + result = s.str.match("a", na=False) + expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected = Series([True, False, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + result = s.str.match("a") + expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected = Series([True, False, np.nan], dtype=expected_dtype) + tm.assert_series_equal(result, expected) - # na GH #6609 - res = Series(["a", 0, np.nan]).str.match("a", na=False) - exp = Series([True, False, False]) - tm.assert_series_equal(exp, res) - res = Series(["a", 0, np.nan]).str.match("a") - exp = Series([True, np.nan, np.nan]) - tm.assert_series_equal(exp, res) - values = Series(["ab", "AB", "abc", "ABC"]) +def test_match_case_kwarg(any_string_dtype): + values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) result = values.str.match("ab", case=False) - expected = Series([True, True, True, True]) + expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected = Series([True, True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -680,13 +702,17 @@ def test_contains_nan(any_string_dtype): tm.assert_series_equal(result, expected) -def test_replace_moar(): +def test_replace_moar(any_string_dtype): # PR #1179 - s = Series(["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"]) + s = Series( + ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], + dtype=any_string_dtype, + ) result = s.str.replace("A", "YYY") expected = Series( - ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"] + ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"], + dtype=any_string_dtype, ) tm.assert_series_equal(result, expected) @@ -703,7 +729,8 @@ def test_replace_moar(): "CYYYBYYY", "dog", "cYYYt", - ] + ], + dtype=any_string_dtype, ) tm.assert_series_equal(result, expected) @@ -720,7 +747,8 @@ def test_replace_moar(): "XX-XX BA", "XX-XX ", "XX-XX t", - ] + ], + dtype=any_string_dtype, ) tm.assert_series_equal(result, expected) @@ -751,6 +779,7 @@ def test_flags_kwarg(any_string_dtype): result = data.str.count(pat, flags=re.IGNORECASE) assert result[0] == 1 - with tm.assert_produces_warning(UserWarning): + msg = "This pattern has match groups" + with tm.assert_produces_warning(UserWarning, match=msg): result = data.str.contains(pat, flags=re.IGNORECASE) assert result[0] diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index f218d5333b415..5d8a63fe481f8 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -6,40 +6,16 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( DataFrame, Index, MultiIndex, Series, isna, - notna, ) import pandas._testing as tm -@pytest.fixture( - params=[ - "object", - "string", - pytest.param( - "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") - ), - 
] -) -def any_string_dtype(request): - """ - Parametrized fixture for string dtypes. - * 'object' - * 'string' - * 'arrow_string' - """ - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - return request.param - - def assert_series_or_index_equal(left, right): if isinstance(left, Series): tm.assert_series_equal(left, right) @@ -402,14 +378,19 @@ def test_join(): tm.assert_almost_equal(rs, xp) -def test_len(): - values = Series(["foo", "fooo", "fooooo", np.nan, "fooooooo"]) +def test_len(any_string_dtype): + values = Series( + ["foo", "fooo", "fooooo", np.nan, "fooooooo", "foo\n", "あ"], + dtype=any_string_dtype, + ) result = values.str.len() - exp = values.map(lambda x: len(x) if notna(x) else np.nan) - tm.assert_series_equal(result, exp) + expected_dtype = "float64" if any_string_dtype == "object" else "Int64" + expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype) + tm.assert_series_equal(result, expected) - # mixed + +def test_len_mixed(): mixed = Series( [ "a_b", diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c7af104f62770..964dd9bdd0e0a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1756,14 +1756,15 @@ def _check(arr): _check(np.array([np.nan, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 3, np.nan])) _check(np.array([4.0, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 4.0, np.nan])) - def test_basic(self, writable): + @pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) + def test_basic(self, writable, dtype): exp = np.array([1, 2], dtype=np.float64) - for dtype in np.typecodes["AllInteger"]: - data = np.array([1, 100], dtype=dtype) - data.setflags(write=writable) - s = Series(data) - tm.assert_numpy_array_equal(algos.rank(s), exp) + data = np.array([1, 100], dtype=dtype) + data.setflags(write=writable) + ser = Series(data) + result = algos.rank(ser) + tm.assert_numpy_array_equal(result, exp) def test_uint64_overflow(self): exp = np.array([1, 2], dtype=np.float64) diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index 82a3a223b442b..1778b6fb9d832 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -3,9 +3,11 @@ from pandas import ( Categorical, + CategoricalIndex, Index, MultiIndex, NaT, + RangeIndex, ) import pandas._testing as tm @@ -199,6 +201,28 @@ def test_index_equal_category_mismatch(check_categorical): tm.assert_index_equal(idx1, idx2, check_categorical=check_categorical) +@pytest.mark.parametrize("exact", [False, True]) +def test_index_equal_range_categories(check_categorical, exact): + # GH41263 + msg = """\ +Index are different + +Index classes are different +\\[left\\]: RangeIndex\\(start=0, stop=10, step=1\\) +\\[right\\]: Int64Index\\(\\[0, 1, 2, 3, 4, 5, 6, 7, 8, 9\\], dtype='int64'\\)""" + + rcat = CategoricalIndex(RangeIndex(10)) + icat = CategoricalIndex(list(range(10))) + + if check_categorical and exact: + with pytest.raises(AssertionError, match=msg): + tm.assert_index_equal(rcat, icat, check_categorical=True, exact=True) + else: + tm.assert_index_equal( + rcat, icat, check_categorical=check_categorical, exact=exact + ) + + def test_assert_index_equal_mixed_dtype(): # GH#39168 idx = Index(["foo", "bar", 42]) diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 06b34201e0dba..b79c367d482ae 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -123,30 +123,44 @@ def func_2(x): 
@td.skip_if_no("numba", "0.46.0") -class TestGroupbyEWMMean: - def test_invalid_engine(self): +class TestEWMMean: + @pytest.mark.parametrize( + "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"] + ) + def test_invalid_engine(self, grouper): df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)}) with pytest.raises(ValueError, match="engine must be either"): - df.groupby("A").ewm(com=1.0).mean(engine="foo") + grouper(df).ewm(com=1.0).mean(engine="foo") - def test_invalid_engine_kwargs(self): + @pytest.mark.parametrize( + "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"] + ) + def test_invalid_engine_kwargs(self, grouper): df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)}) with pytest.raises(ValueError, match="cython engine does not"): - df.groupby("A").ewm(com=1.0).mean( + grouper(df).ewm(com=1.0).mean( engine="cython", engine_kwargs={"nopython": True} ) - def test_cython_vs_numba(self, nogil, parallel, nopython, ignore_na, adjust): + @pytest.mark.parametrize( + "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"] + ) + def test_cython_vs_numba( + self, grouper, nogil, parallel, nopython, ignore_na, adjust + ): df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)}) - gb_ewm = df.groupby("A").ewm(com=1.0, adjust=adjust, ignore_na=ignore_na) + ewm = grouper(df).ewm(com=1.0, adjust=adjust, ignore_na=ignore_na) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - result = gb_ewm.mean(engine="numba", engine_kwargs=engine_kwargs) - expected = gb_ewm.mean(engine="cython") + result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) + expected = ewm.mean(engine="cython") tm.assert_frame_equal(result, expected) - def test_cython_vs_numba_times(self, nogil, parallel, nopython, ignore_na): + @pytest.mark.parametrize( + "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"] + ) + def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_na): # GH 40951 halflife = "23 days" times = to_datetime( @@ -160,13 +174,13 @@ def test_cython_vs_numba_times(self, nogil, parallel, nopython, ignore_na): ] ) df = DataFrame({"A": ["a", "b", "a", "b", "b", "a"], "B": [0, 0, 1, 1, 2, 2]}) - gb_ewm = df.groupby("A").ewm( + ewm = grouper(df).ewm( halflife=halflife, adjust=True, ignore_na=ignore_na, times=times ) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - result = gb_ewm.mean(engine="numba", engine_kwargs=engine_kwargs) - expected = gb_ewm.mean(engine="cython") + result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) + expected = ewm.mean(engine="cython") tm.assert_frame_equal(result, expected) From 04b1a943ac76c61da47064768470182a20196d6a Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 5 May 2021 16:27:57 -0700 Subject: [PATCH 08/10] mypy fixup --- pandas/_libs/tslibs/nattype.pyi | 28 ++++++++++++++++++++++++---- pandas/_typing.py | 4 ++-- pandas/core/arrays/masked.py | 2 +- pandas/core/common.py | 3 +-- pandas/core/dtypes/cast.py | 6 +++++- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/datetimes.py | 3 ++- 7 files changed, 36 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/tslibs/nattype.pyi b/pandas/_libs/tslibs/nattype.pyi index 233cd5688cb16..ee7299e22a9dd 100644 --- a/pandas/_libs/tslibs/nattype.pyi +++ b/pandas/_libs/tslibs/nattype.pyi @@ -141,7 +141,27 @@ class NaTType(datetime): def __eq__(self, other: Any) -> bool: ... def __ne__(self, other: Any) -> bool: ... 
- def __lt__(self, other: NaTType | datetime | timedelta | Period) -> bool: ... - def __le__(self, other: NaTType | datetime | timedelta | Period) -> bool: ... - def __gt__(self, other: NaTType | datetime | timedelta | Period) -> bool: ... - def __ge__(self, other: NaTType | datetime | timedelta | Period) -> bool: ... + # error: Argument 1 of "__lt__" is incompatible with supertype "date"; + # supertype defines the argument type as "date" + def __lt__( # type: ignore[override] + self, + other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 + ) -> bool: ... + # error: Argument 1 of "__le__" is incompatible with supertype "date"; + # supertype defines the argument type as "date" + def __le__( # type: ignore[override] + self, + other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 + ) -> bool: ... + # error: Argument 1 of "__gt__" is incompatible with supertype "date"; + # supertype defines the argument type as "date" + def __gt__( # type: ignore[override] + self, + other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 + ) -> bool: ... + # error: Argument 1 of "__ge__" is incompatible with supertype "date"; + # supertype defines the argument type as "date" + def __ge__( # type: ignore[override] + self, + other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 + ) -> bool: ... diff --git a/pandas/_typing.py b/pandas/_typing.py index afbe6614ff4ec..68e2a2250a1d0 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -86,10 +86,10 @@ # scalars -PythonScalar = Optional[Union[str, int, float, bool, date, time, timedelta]] +PythonScalar = Optional[Union[str, int, float, complex, bool, date, time, timedelta]] DatetimeLikeScalar = Union["Period", "Timestamp", "Timedelta"] PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"] -Scalar = Union[PythonScalar, PandasScalar, np.number] +Scalar = Union[PythonScalar, PandasScalar, np.number, np.datetime64, np.timedelta64] # timestamp and timedelta convertible types diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 11f9f645920ec..a0c68e1bbb072 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -215,7 +215,7 @@ def to_numpy( # type: ignore[override] self, dtype: NpDtype | None = None, copy: bool = False, - na_value: Scalar = lib.no_default, + na_value: Scalar | lib.NoDefault = lib.no_default, ) -> np.ndarray: """ Convert to a NumPy Array. diff --git a/pandas/core/common.py b/pandas/core/common.py index 04ff2d2c4618f..2ebcbf8829e6a 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -30,7 +30,6 @@ from pandas._typing import ( AnyArrayLike, NpDtype, - Scalar, T, ) from pandas.compat import np_version_under1p18 @@ -490,7 +489,7 @@ def f(x): def convert_to_list_like( - values: Scalar | Iterable | AnyArrayLike, + values: Any, ) -> list | AnyArrayLike: """ Convert list-like or scalar input to list-like. 
List, numpy and pandas array-like diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 46dc97214e2f6..b76a2fcd43c08 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -215,7 +215,11 @@ def maybe_unbox_datetimelike(value: Scalar, dtype: DtypeObj) -> Scalar: # GH#36541: can't fill array directly with pd.NaT # > np.empty(10, dtype="datetime64[64]").fill(pd.NaT) # ValueError: cannot convert float NaN to integer - value = dtype.type("NaT", "ns") + # error: Incompatible types in assignment (expression has type + # "Union[generic, Any]", variable has type "Union[Union[str, int, float, + # complex, bool, date, time, timedelta, None], Union[Period, Timestamp, + # Timedelta, Interval], number[Any], datetime64, timedelta64]") + value = dtype.type("NaT", "ns") # type: ignore[assignment] elif isinstance(value, Timestamp): if value.tz is None: value = value.to_datetime64() diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 84f1245299d53..93e1a9bbe5d80 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4410,7 +4410,7 @@ def _from_join_target(self, result: np.ndarray) -> ArrayLike: """ return result - @doc(IndexOpsMixin._memory_usage) + @Appender(IndexOpsMixin._memory_usage.__doc__) def memory_usage(self, deep: bool = False) -> int: result = self._memory_usage(deep=deep) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index ceb9a96366adc..5fd9156cddc1f 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -37,6 +37,7 @@ ) from pandas.errors import InvalidIndexError from pandas.util._decorators import ( + Appender, cache_readonly, doc, ) @@ -269,7 +270,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin): # methods that dispatch to DatetimeArray and wrap result # error: Cannot determine type of 'strftime' - @doc(DatetimeArray.strftime) # type: ignore[has-type] + @Appender(DatetimeArray.strftime.__doc__) # type: ignore[has-type] def strftime(self, date_format) -> Index: arr = self._data.strftime(date_format) return Index(arr, name=self.name) From 6bc8788567ccbbe911fb780365e1efbb85497422 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 6 May 2021 12:39:17 -0700 Subject: [PATCH 09/10] mypy fixup --- pandas/core/arrays/categorical.py | 5 ++++- pandas/core/arrays/datetimelike.py | 5 ++++- pandas/core/arrays/datetimes.py | 3 ++- pandas/core/arrays/numpy_.py | 3 ++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a82c75f4b2557..7ee17b26d0679 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -243,7 +243,10 @@ def contains(cat, key, container): return any(loc_ in container for loc_ in loc) -class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin): +# error: Cannot determine type of 'repeat' in base class 'ExtensionArray' +class Categorical( # type: ignore[misc] + NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin +): """ Represent a categorical variable in classic R / S-plus fashion. 
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 286fd8bf8ba4a..3550757e0271f 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -141,7 +141,10 @@ class InvalidComparison(Exception): pass -class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): +# error: Cannot determine type of 'repeat' in base class 'ExtensionArray' +class DatetimeLikeArrayMixin( # type: ignore[misc] + OpsMixin, NDArrayBackedExtensionArray +): """ Shared Base/Mixin class for DatetimeArray, TimedeltaArray, PeriodArray diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index f07a04b8087e0..0d17bfd5f30c2 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -147,7 +147,8 @@ def f(self): return property(f) -class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): +# error: Cannot determine type of 'repeat' in base class 'ExtensionArray' +class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): # type: ignore[misc] """ Pandas ExtensionArray for tz-naive or tz-aware datetime data. diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index e9d554200805e..fd30cf47666d8 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -27,7 +27,8 @@ from pandas.core.strings.object_array import ObjectStringArrayMixin -class PandasArray( +# error: Cannot determine type of 'repeat' in base class 'ExtensionArray' +class PandasArray( # type: ignore[misc] OpsMixin, NDArrayBackedExtensionArray, NDArrayOperatorsMixin, From 9904a42ecb587aa86b75ba7fe4fa1ae5d5e3e94d Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 6 May 2021 12:45:36 -0700 Subject: [PATCH 10/10] revert experiment --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 93e1a9bbe5d80..84f1245299d53 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4410,7 +4410,7 @@ def _from_join_target(self, result: np.ndarray) -> ArrayLike: """ return result - @Appender(IndexOpsMixin._memory_usage.__doc__) + @doc(IndexOpsMixin._memory_usage) def memory_usage(self, deep: bool = False) -> int: result = self._memory_usage(deep=deep)