diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ae7e8191fc482..613c0ebfbfc77 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -213,7 +213,7 @@ def _reconstruct_data( if isinstance(values, cls) and values.dtype == dtype: return values - values = cls._from_sequence(values) + values = cls._from_scalars(values, dtype=dtype) elif is_bool_dtype(dtype): values = values.astype(dtype, copy=False) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index a62a5ec4ec7f7..eba5619ab9624 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -189,6 +189,12 @@ class ExtensionArray: # Constructors # ------------------------------------------------------------------------ + @classmethod + def _from_scalars(cls, data, dtype): + if not all(isinstance(v, dtype.type) or isna(v) for v in data): + raise TypeError("Requires dtype scalars") + return cls._from_sequence(data, dtype=dtype) + @classmethod def _from_sequence(cls, scalars, *, dtype: Optional[Dtype] = None, copy=False): """ @@ -688,7 +694,7 @@ def fillna(self, value=None, method=None, limit=None): if method is not None: func = get_fill_func(method) new_values = func(self.astype(object), limit=limit, mask=mask) - new_values = self._from_sequence(new_values, dtype=self.dtype) + new_values = self._from_scalars(new_values, dtype=self.dtype) else: # fill with value new_values = self.copy() @@ -750,7 +756,7 @@ def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: if isna(fill_value): fill_value = self.dtype.na_value - empty = self._from_sequence( + empty = self._from_scalars( [fill_value] * min(abs(periods), len(self)), dtype=self.dtype ) if periods > 0: @@ -770,7 +776,7 @@ def unique(self): uniques : ExtensionArray """ uniques = unique(self.astype(object)) - return self._from_sequence(uniques, dtype=self.dtype) + return self._from_scalars(uniques, dtype=self.dtype) def searchsorted(self, value, side="left", sorter=None): """ @@ -1080,7 +1086,7 @@ def take(self, indices, allow_fill=False, fill_value=None): result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill) - return self._from_sequence(result, dtype=self.dtype) + return self._from_scalars(result, dtype=self.dtype) """ # Implementer note: The `fill_value` parameter should be a user-facing # value, an instance of self.dtype.type. When passed `fill_value=None`, @@ -1420,7 +1426,7 @@ def _maybe_convert(arr): # https://github.com/pandas-dev/pandas/issues/22850 # We catch all regular exceptions here, and fall back # to an ndarray. - res = maybe_cast_to_extension_array(type(self), arr) + res = maybe_cast_to_extension_array(type(self), arr, self.dtype) if not isinstance(res, type(self)): # exception raised in _from_sequence; ensure we have ndarray res = np.asarray(arr) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index dd281a39907fd..35921dce4414f 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -275,6 +275,13 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): def dtype(self) -> BooleanDtype: return self._dtype + @classmethod + def _from_scalars(cls, data, dtype) -> BooleanArray: + # override because dtype.type is only the numpy scalar + if not all(isinstance(v, (bool, np.bool_)) or isna(v) for v in data): + raise TypeError("Requires dtype scalars") + return cls._from_sequence(data, dtype=dtype) + @classmethod def _from_sequence( cls, scalars, *, dtype: Optional[Dtype] = None, copy: bool = False diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 48316373a1140..3006b012c8f33 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -426,6 +426,13 @@ def _constructor(self) -> Type[Categorical]: return Categorical @classmethod + def _from_scalars(cls, data, dtype): + # if not all( + # isinstance(v, dtype.categories.dtype.type) or isna(v) for v in data + # ): + # raise TypeError("Requires dtype scalars") + return cls._from_sequence(data, dtype=dtype) + def _from_sequence(cls, scalars, *, dtype: Optional[Dtype] = None, copy=False): return Categorical(scalars, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 144a7186f5826..cda406f9cd035 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -305,6 +305,13 @@ def _simple_new( result._dtype = dtype return result + @classmethod + def _from_scalars(cls, data, dtype): + # override because dtype.type is not always Timestamp + if not all(isinstance(v, Timestamp) or isna(v) for v in data): + raise TypeError("Requires timestamp scalars") + return cls._from_sequence(data, dtype=dtype) + @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False): return cls._from_sequence_not_strict(scalars, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 2c3b3d3c2f0b4..7953043c2b614 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -241,6 +241,13 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): ) super().__init__(values, mask, copy=copy) + @classmethod + def _from_scalars(cls, data, dtype): + # override because dtype.type is only the numpy scalar + if not all(isinstance(v, (float, dtype.type)) or isna(v) for v in data): + raise TypeError("Requires dtype scalars") + return cls._from_sequence(data, dtype=dtype) + @classmethod def _from_sequence( cls, scalars, *, dtype=None, copy: bool = False diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index ff1af80f81ac6..fb5bcb20c6d9b 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -302,6 +302,16 @@ def __pos__(self): def __abs__(self): return type(self)(np.abs(self._data), self._mask) + @classmethod + def _from_scalars(cls, data, dtype): + # override because dtype.type is only the numpy scalar + # TODO accept float here? + if not all( + isinstance(v, (int, dtype.type, float, np.float_)) or isna(v) for v in data + ): + raise TypeError("Requires dtype scalars") + return cls._from_sequence(data, dtype=dtype) + @classmethod def _from_sequence( cls, scalars, *, dtype: Optional[Dtype] = None, copy: bool = False diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 882ca0955bc99..5982bb5c1c127 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -882,7 +882,7 @@ def shift(self, periods: int = 1, fill_value: object = None) -> IntervalArray: fill_value = Index(self._left, copy=False)._na_value empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1)) else: - empty = self._from_sequence([fill_value] * empty_len) + empty = self._from_scalars([fill_value] * empty_len, self.dtype) if periods > 0: a = empty diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 9999a9ed411d8..2dd7cc70dd50d 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -78,6 +78,13 @@ def __init__(self, values: Union[np.ndarray, PandasArray], copy: bool = False): self._ndarray = values self._dtype = PandasDtype(values.dtype) + @classmethod + def _from_scalars(cls, data, dtype): + # doesn't work for object dtype + # if not all(isinstance(v, dtype.type) or isna(v) for v in data): + # raise TypeError("Requires dtype scalars") + return cls._from_sequence(data, dtype=dtype) + @classmethod def _from_sequence( cls, scalars, *, dtype: Optional[Dtype] = None, copy: bool = False diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 0be3970159fbd..00af59ff311a6 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -345,6 +345,12 @@ def maybe_cast_result( """ dtype = obj.dtype dtype = maybe_cast_result_dtype(dtype, how) + # result_dtype = maybe_cast_result_dtype(dtype, how) + # if result_dtype is not None: + # # we know what the result dtypes needs to be -> be more permissive in casting + # # (eg ints with nans became floats) + # cls = result_dtype.construct_array_type() + # return cls._from_sequence(obj, dtype=result_dtype) assert not is_scalar(result) @@ -395,19 +401,20 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: ): return Float64Dtype() return dtype + # return None def maybe_cast_to_extension_array( cls: Type[ExtensionArray], obj: ArrayLike, dtype: Optional[ExtensionDtype] = None ) -> ArrayLike: """ - Call to `_from_sequence` that returns the object unchanged on Exception. + Call to `_from_scalars` that returns the object unchanged on Exception. Parameters ---------- cls : class, subclass of ExtensionArray obj : arraylike - Values to pass to cls._from_sequence + Values to pass to cls._from_scalars dtype : ExtensionDtype, optional Returns @@ -429,7 +436,7 @@ def maybe_cast_to_extension_array( return obj try: - result = cls._from_sequence(obj, dtype=dtype) + result = cls._from_scalars(obj, dtype=dtype) except Exception: # We can't predict what downstream EA constructors may raise result = obj diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 82e984d36b6a1..e2d42820c9b68 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2951,7 +2951,7 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: arr_type = dtype.construct_array_type() values = self.values - new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values] + new_values = [arr_type._from_scalars(row, dtype=dtype) for row in values] result = self._constructor( dict(zip(self.index, new_values)), index=self.columns ) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f1cf1aa9a72cb..801be8aaddfe4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -957,7 +957,7 @@ def fast_xs(self, loc: int) -> ArrayLike: result[rl] = blk.iget((i, loc)) if isinstance(dtype, ExtensionDtype): - result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) + result = dtype.construct_array_type()._from_scalars(result, dtype=dtype) return result diff --git a/pandas/core/series.py b/pandas/core/series.py index f75292f32dbca..ee36436303028 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2913,7 +2913,9 @@ def combine(self, other, func, fill_value=None) -> Series: # TODO: can we do this for only SparseDtype? # The function can return something of any type, so check # if the type is compatible with the calling EA. - new_values = maybe_cast_to_extension_array(type(self._values), new_values) + new_values = maybe_cast_to_extension_array( + type(self._values), new_values, self.dtype + ) return self._constructor(new_values, index=new_index, name=new_name) def combine_first(self, other) -> Series: diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 4122fcaae496b..21ab387d858c8 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -72,6 +72,13 @@ def __init__(self, values, dtype=None, copy=False, context=None): def dtype(self): return self._dtype + @classmethod + def _from_scalars(cls, data, dtype): + # TODO not needed if we keep the base class method + if not all(isinstance(v, dtype.type) or pd.isna(v) for v in data): + raise TypeError("Requires dtype scalars") + return cls._from_sequence(data, dtype=dtype) + @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): return cls(scalars) diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index c269d6ff11896..c9193910bb063 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -35,8 +35,15 @@ def test_astype_object(self, index): def test_astype_category(self, index): result = index.astype("category") - expected = CategoricalIndex(index.values) + # TODO astype doesn't preserve the exact interval dtype (eg uint64) + # while the CategoricalIndex constructor does -> temporarily also + # here convert to object dtype numpy array. + # Once this is fixed, the commented code can be uncommented + # -> https://github.com/pandas-dev/pandas/issues/38316 + # expected = CategoricalIndex(index.values) + expected = CategoricalIndex(np.asarray(index.values)) tm.assert_index_equal(result, expected) + # assert result.dtype.categories.dtype == index.dtype result = index.astype(CategoricalDtype()) tm.assert_index_equal(result, expected)