From c8dc332937164e74c3e561954408826bb9065109 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 5 Dec 2020 13:45:16 +0100 Subject: [PATCH 1/2] Add stricter EA._from_scalars and use in maybe_cast_result --- pandas/core/algorithms.py | 2 +- pandas/core/arrays/base.py | 16 +++++++++++----- pandas/core/arrays/boolean.py | 7 +++++++ pandas/core/arrays/categorical.py | 8 ++++++++ pandas/core/arrays/datetimes.py | 7 +++++++ pandas/core/arrays/floating.py | 7 +++++++ pandas/core/arrays/integer.py | 10 ++++++++++ pandas/core/arrays/interval.py | 2 +- pandas/core/arrays/numpy_.py | 7 +++++++ pandas/core/dtypes/cast.py | 13 ++++++++++--- pandas/core/frame.py | 2 +- pandas/core/internals/managers.py | 2 +- pandas/core/series.py | 4 +++- pandas/tests/extension/decimal/array.py | 7 +++++++ 14 files changed, 81 insertions(+), 13 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9749297efd004..40de965a46d24 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -210,7 +210,7 @@ def _reconstruct_data( if isinstance(values, cls) and values.dtype == dtype: return values - values = cls._from_sequence(values) + values = cls._from_scalars(values, dtype=dtype) elif is_bool_dtype(dtype): values = values.astype(dtype, copy=False) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 77cd603cc6b8d..4bd6f9f0dc2d3 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -188,6 +188,12 @@ class ExtensionArray: # Constructors # ------------------------------------------------------------------------ + @classmethod + def _from_scalars(cls, data, dtype): + if not all(isinstance(v, dtype.type) or isna(v) for v in data): + raise TypeError("Requires dtype scalars") + return cls._from_sequence(data, dtype=dtype) + @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy=False): """ @@ -668,7 +674,7 @@ def fillna(self, value=None, method=None, limit=None): if method is not None: func = get_fill_func(method) new_values = func(self.astype(object), limit=limit, mask=mask) - new_values = self._from_sequence(new_values, dtype=self.dtype) + new_values = self._from_scalars(new_values, dtype=self.dtype) else: # fill with value new_values = self.copy() @@ -730,7 +736,7 @@ def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: if isna(fill_value): fill_value = self.dtype.na_value - empty = self._from_sequence( + empty = self._from_scalars( [fill_value] * min(abs(periods), len(self)), dtype=self.dtype ) if periods > 0: @@ -750,7 +756,7 @@ def unique(self): uniques : ExtensionArray """ uniques = unique(self.astype(object)) - return self._from_sequence(uniques, dtype=self.dtype) + return self._from_scalars(uniques, dtype=self.dtype) def searchsorted(self, value, side="left", sorter=None): """ @@ -1044,7 +1050,7 @@ def take(self, indices, allow_fill=False, fill_value=None): result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill) - return self._from_sequence(result, dtype=self.dtype) + return self._from_scalars(result, dtype=self.dtype) """ # Implementer note: The `fill_value` parameter should be a user-facing # value, an instance of self.dtype.type. When passed `fill_value=None`, @@ -1377,7 +1383,7 @@ def _maybe_convert(arr): # https://github.com/pandas-dev/pandas/issues/22850 # We catch all regular exceptions here, and fall back # to an ndarray. - res = maybe_cast_to_extension_array(type(self), arr) + res = maybe_cast_to_extension_array(type(self), arr, self.dtype) if not isinstance(res, type(self)): # exception raised in _from_sequence; ensure we have ndarray res = np.asarray(arr) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 44cc108ed9cfd..6476f04b8f244 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -272,6 +272,13 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): def dtype(self) -> BooleanDtype: return self._dtype + @classmethod + def _from_scalars(cls, data, dtype) -> "BooleanArray": + # override because dtype.type is only the numpy scalar + if not all(isinstance(v, (bool, np.bool_)) or isna(v) for v in data): + raise TypeError("Requires dtype scalars") + return cls._from_sequence(data, dtype=dtype) + @classmethod def _from_sequence( cls, scalars, *, dtype=None, copy: bool = False diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3995e7b251184..fc528e84ddae9 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -387,6 +387,14 @@ def dtype(self) -> CategoricalDtype: def _constructor(self) -> Type["Categorical"]: return Categorical + @classmethod + def _from_scalars(cls, data, dtype): + # if not all( + # isinstance(v, dtype.categories.dtype.type) or isna(v) for v in data + # ): + # raise TypeError("Requires dtype scalars") + return cls._from_sequence(data, dtype=dtype) + @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy=False): return Categorical(scalars, dtype=dtype) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index ce70f929cc79d..a180c253cb44f 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -301,6 +301,13 @@ def _simple_new( result._dtype = dtype return result + @classmethod + def _from_scalars(cls, data, dtype): + # override because dtype.type is not always Timestamp + if not all(isinstance(v, Timestamp) or isna(v) for v in data): + raise TypeError("Requires timestamp scalars") + return cls._from_sequence(data, dtype=dtype) + @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False): return cls._from_sequence_not_strict(scalars, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 1077538f6a21d..88d4fae6e1f8c 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -273,6 +273,13 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): ) super().__init__(values, mask, copy=copy) + @classmethod + def _from_scalars(cls, data, dtype): + # override because dtype.type is only the numpy scalar + if not all(isinstance(v, (float, dtype.type)) or isna(v) for v in data): + raise TypeError("Requires dtype scalars") + return cls._from_sequence(data, dtype=dtype) + @classmethod def _from_sequence( cls, scalars, *, dtype=None, copy: bool = False diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index fa427e94fe08f..3d6d92de88f3d 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -356,6 +356,16 @@ def __pos__(self): def __abs__(self): return type(self)(np.abs(self._data), self._mask) + @classmethod + def _from_scalars(cls, data, dtype): + # override because dtype.type is only the numpy scalar + # TODO accept float here? + if not all( + isinstance(v, (int, dtype.type, float, np.float_)) or isna(v) for v in data + ): + raise TypeError("Requires dtype scalars") + return cls._from_sequence(data, dtype=dtype) + @classmethod def _from_sequence( cls, scalars, *, dtype=None, copy: bool = False diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 53a98fc43becc..4b8234fabca9c 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -850,7 +850,7 @@ def shift(self, periods: int = 1, fill_value: object = None) -> "IntervalArray": fill_value = Index(self._left, copy=False)._na_value empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1)) else: - empty = self._from_sequence([fill_value] * empty_len) + empty = self._from_scalars([fill_value] * empty_len, self.dtype) if periods > 0: a = empty diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 50d12703c3a30..a880f6d90cfb1 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -170,6 +170,13 @@ def __init__(self, values: Union[np.ndarray, "PandasArray"], copy: bool = False) self._ndarray = values self._dtype = PandasDtype(values.dtype) + @classmethod + def _from_scalars(cls, data, dtype): + # doesn't work for object dtype + # if not all(isinstance(v, dtype.type) or isna(v) for v in data): + # raise TypeError("Requires dtype scalars") + return cls._from_sequence(data, dtype=dtype) + @classmethod def _from_sequence( cls, scalars, *, dtype=None, copy: bool = False diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 12974d56dacdc..481cece4086c7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -320,6 +320,12 @@ def maybe_cast_result( """ dtype = obj.dtype dtype = maybe_cast_result_dtype(dtype, how) + # result_dtype = maybe_cast_result_dtype(dtype, how) + # if result_dtype is not None: + # # we know what the result dtypes needs to be -> be more permissive in casting + # # (eg ints with nans became floats) + # cls = result_dtype.construct_array_type() + # return cls._from_sequence(obj, dtype=result_dtype) assert not is_scalar(result) @@ -364,19 +370,20 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: elif how in ["add", "cumsum", "sum"] and isinstance(dtype, BooleanDtype): return Int64Dtype() return dtype + # return None def maybe_cast_to_extension_array( cls: Type["ExtensionArray"], obj: ArrayLike, dtype: Optional[ExtensionDtype] = None ) -> ArrayLike: """ - Call to `_from_sequence` that returns the object unchanged on Exception. + Call to `_from_scalars` that returns the object unchanged on Exception. Parameters ---------- cls : class, subclass of ExtensionArray obj : arraylike - Values to pass to cls._from_sequence + Values to pass to cls._from_scalars dtype : ExtensionDtype, optional Returns @@ -398,7 +405,7 @@ def maybe_cast_to_extension_array( return obj try: - result = cls._from_sequence(obj, dtype=dtype) + result = cls._from_scalars(obj, dtype=dtype) except Exception: # We can't predict what downstream EA constructors may raise result = obj diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f710660d6ad8e..2b082a10f1949 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2912,7 +2912,7 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: arr_type = dtype.construct_array_type() values = self.values - new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values] + new_values = [arr_type._from_scalars(row, dtype=dtype) for row in values] result = self._constructor( dict(zip(self.index, new_values)), index=self.columns ) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 93ab207d8ce12..fbee7b0b36050 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -968,7 +968,7 @@ def fast_xs(self, loc: int) -> ArrayLike: result[rl] = blk.iget((i, loc)) if isinstance(dtype, ExtensionDtype): - result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) + result = dtype.construct_array_type()._from_scalars(result, dtype=dtype) return result diff --git a/pandas/core/series.py b/pandas/core/series.py index b20cf8eed9a2e..d1b2277a50ca6 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2919,7 +2919,9 @@ def combine(self, other, func, fill_value=None) -> "Series": # TODO: can we do this for only SparseDtype? # The function can return something of any type, so check # if the type is compatible with the calling EA. - new_values = maybe_cast_to_extension_array(type(self._values), new_values) + new_values = maybe_cast_to_extension_array( + type(self._values), new_values, self.dtype + ) return self._constructor(new_values, index=new_index, name=new_name) def combine_first(self, other) -> "Series": diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index a713550dafa5c..3b560c571b1d8 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -67,6 +67,13 @@ def __init__(self, values, dtype=None, copy=False, context=None): def dtype(self): return self._dtype + @classmethod + def _from_scalars(cls, data, dtype): + # TODO not needed if we keep the base class method + if not all(isinstance(v, dtype.type) or pd.isna(v) for v in data): + raise TypeError("Requires dtype scalars") + return cls._from_sequence(data, dtype=dtype) + @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): return cls(scalars) From 2bf2992858b008158f0b936750a9969a48499432 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 5 Dec 2020 16:15:20 +0100 Subject: [PATCH 2/2] silence interval astype(category) test --- pandas/tests/indexes/interval/test_astype.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index b4af1cb5859f0..ea157222e3da3 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -33,8 +33,15 @@ def test_astype_object(self, index): def test_astype_category(self, index): result = index.astype("category") - expected = CategoricalIndex(index.values) + # TODO astype doesn't preserve the exact interval dtype (eg uint64) + # while the CategoricalIndex constructor does -> temporarily also + # here convert to object dtype numpy array. + # Once this is fixed, the commented code can be uncommented + # -> https://github.com/pandas-dev/pandas/issues/38316 + # expected = CategoricalIndex(index.values) + expected = CategoricalIndex(np.asarray(index.values)) tm.assert_index_equal(result, expected) + # assert result.dtype.categories.dtype == index.dtype result = index.astype(CategoricalDtype()) tm.assert_index_equal(result, expected)