diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index 103df0fd94847..2d12566088e60 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -5,12 +5,12 @@ class BooleanArray: def setup(self): - self.values_bool = np.array([True, False, True, False]) - self.values_float = np.array([1.0, 0.0, 1.0, 0.0]) - self.values_integer = np.array([1, 0, 1, 0]) - self.values_integer_like = [1, 0, 1, 0] - self.data = np.array([True, False, True, False]) - self.mask = np.array([False, False, True, False]) + self.values_bool = np.array([True, False, True, False] * 1000) + self.values_float = np.array([1.0, 0.0, 1.0, 0.0] * 1000) + self.values_integer = np.array([1, 0, 1, 0] * 1000) + self.values_integer_like = [1, 0, 1, 0] * 1000 + self.data = np.array([True, False, True, False] * 1000) + self.mask = np.array([False, False, True, False] * 1000) def time_constructor(self): pd.arrays.BooleanArray(self.data, self.mask) @@ -30,12 +30,20 @@ def time_from_float_array(self): class IntegerArray: def setup(self): - self.values_integer = np.array([1, 0, 1, 0]) - self.data = np.array([1, 2, 3, 4], dtype="int64") - self.mask = np.array([False, False, True, False]) + self.values_integer = np.array([1, 0, 1, 0] * 1000) + self.data = np.array([1, 2, 3, 4] * 1000, dtype="int64") + self.mask = np.array([False, False, True, False] * 1000) def time_constructor(self): pd.arrays.IntegerArray(self.data, self.mask) def time_from_integer_array(self): pd.array(self.values_integer, dtype="Int64") + + +class NullableArrayMemory: + params = [["boolean", "Int8", "Float32"]] + param_names = ["dtype"] + + def track_array_size(self, dtype): + return pd.array(np.ones(1000), dtype=dtype).nbytes diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 7592ce54e3712..fdb64d278c318 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -194,6 +194,22 @@ def time_any(self, N, case, 
dtype): self.s.any() +class Isna: + params = ["float", "Float64", "Int64", "boolean"] + param_names = ["dtype"] + + def setup(self, dtype): + self.ser = Series(np.ones(10000), dtype=dtype) + self.ser_nulls = self.ser.copy() + self.ser_nulls[::2] = np.nan + + def time_isna_no_nans(self, dtype): + self.ser.isna() + + def time_isna_nans(self, dtype): + self.ser_nulls.isna() + + class NanOps: params = [ @@ -217,9 +233,6 @@ class NanOps: param_names = ["func", "N", "dtype"] def setup(self, func, N, dtype): - if func == "argmax" and dtype in {"Int64", "boolean"}: - # Skip argmax for nullable int since this doesn't work yet (GH-24382) - raise NotImplementedError self.s = Series([1] * N, dtype=dtype) self.func = getattr(self.s, func) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 166ea2f0d4164..031d9dfa9b25d 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -105,7 +105,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- +- Performance improvement and memory savings for operations with nullable data types when no missing values are present (:issue:`30435`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 01bb3d50c0da7..502d895ca8394 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -2,6 +2,7 @@ masked_reductions.py is for reduction algorithms using a mask-based approach for missing values. """ +from __future__ import annotations from typing import Callable @@ -15,7 +16,7 @@ def _sumprod( func: Callable, values: np.ndarray, - mask: np.ndarray, + mask: np.ndarray | None, *, skipna: bool = True, min_count: int = 0, @@ -30,7 +31,8 @@ def _sumprod( Numpy array with the values (can be of any dtype that support the operation). 
mask : np.ndarray - Boolean numpy array (True values indicate missing values). + Boolean numpy array (True values indicate missing values). None is equivalent + to all False. skipna : bool, default True Whether to skip NA. min_count : int, default 0 @@ -38,18 +40,25 @@ def _sumprod( ``min_count`` non-NA values are present the result will be NA. """ if not skipna: - if mask.any() or check_below_min_count(values.shape, None, min_count): + if mask is not None or check_below_min_count(values.shape, None, min_count): return libmissing.NA else: return func(values) else: if check_below_min_count(values.shape, mask, min_count): return libmissing.NA - return func(values, where=~mask) + if mask is not None: + return func(values, where=~mask) + else: + return func(values) def sum( - values: np.ndarray, mask: np.ndarray, *, skipna: bool = True, min_count: int = 0 + values: np.ndarray, + mask: np.ndarray | None, + *, + skipna: bool = True, + min_count: int = 0, ): return _sumprod( np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count @@ -57,7 +66,11 @@ def sum( def prod( - values: np.ndarray, mask: np.ndarray, *, skipna: bool = True, min_count: int = 0 + values: np.ndarray, + mask: np.ndarray | None, + *, + skipna: bool = True, + min_count: int = 0, ): return _sumprod( np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count @@ -65,7 +78,7 @@ def prod( def _minmax( - func: Callable, values: np.ndarray, mask: np.ndarray, *, skipna: bool = True + func: Callable, values: np.ndarray, mask: np.ndarray | None, *, skipna: bool = True ): """ Reduction for 1D masked array. @@ -76,19 +89,23 @@ def _minmax( values : np.ndarray Numpy array with the values (can be of any dtype that support the operation). - mask : np.ndarray - Boolean numpy array (True values indicate missing values). + mask : np.ndarray or None + Boolean numpy array (True values indicate missing values). None is equivalent + to all False. skipna : bool, default True Whether to skip NA.
""" if not skipna: - if mask.any() or not values.size: + if mask is not None or not values.size: # min/max with empty array raise in numpy, pandas returns NA return libmissing.NA else: return func(values) else: - subset = values[~mask] + if mask is not None: + subset = values[~mask] + else: + subset = values if subset.size: return func(subset) else: @@ -96,18 +113,21 @@ def _minmax( return libmissing.NA -def min(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): +def min(values: np.ndarray, mask: np.ndarray | None, *, skipna: bool = True): return _minmax(np.min, values=values, mask=mask, skipna=skipna) -def max(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): +def max(values: np.ndarray, mask: np.ndarray | None, *, skipna: bool = True): return _minmax(np.max, values=values, mask=mask, skipna=skipna) -def mean(values: np.ndarray, mask: np.ndarray, skipna: bool = True): - if not values.size or mask.all(): +def mean(values: np.ndarray, mask: np.ndarray | None, skipna: bool = True): + if not values.size or (mask is not None and mask.all()): return libmissing.NA _sum = _sumprod(np.sum, values=values, mask=mask, skipna=skipna) - count = np.count_nonzero(~mask) + if mask is not None: + count = np.count_nonzero(~mask) + else: + count = len(values) mean_value = _sum / count return mean_value diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 14d059c04b7c0..72ecd5a9bfb04 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -141,16 +141,14 @@ def __from_arrow__( results.append(bool_arr) if not results: - return BooleanArray( - np.array([], dtype=np.bool_), np.array([], dtype=np.bool_) - ) + return BooleanArray(np.array([], dtype=np.bool_)) else: return BooleanArray._concat_same_type(results) def coerce_to_array( values, mask=None, copy: bool = False -) -> tuple[np.ndarray, np.ndarray]: +) -> tuple[np.ndarray, np.ndarray | None]: """ Coerce the input values array to numpy arrays with a mask. 
@@ -171,7 +169,8 @@ def coerce_to_array( values, mask = values._data, values._mask if copy: values = values.copy() - mask = mask.copy() + if mask is not None: + mask = mask.copy() return values, mask mask_values = None @@ -212,8 +211,8 @@ def coerce_to_array( raise TypeError("Need to pass bool-like values") if mask is None and mask_values is None: - mask = np.zeros(len(values), dtype=bool) - elif mask is None: + mask = None + elif mask is None and mask_values is not None: mask = mask_values else: if isinstance(mask, np.ndarray) and mask.dtype == np.bool_: @@ -229,7 +228,7 @@ def coerce_to_array( if values.ndim != 1: raise ValueError("values must be a 1D list-like") - if mask.ndim != 1: + if mask is not None and mask.ndim != 1: raise ValueError("mask must be a 1D list-like") return values, mask @@ -294,7 +293,9 @@ class BooleanArray(BaseMaskedArray): _TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"} _FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"} - def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): + def __init__( + self, values: np.ndarray, mask: np.ndarray | None = None, copy: bool = False + ): if not (isinstance(values, np.ndarray) and values.dtype == np.bool_): raise TypeError( "values should be boolean numpy array. Use " @@ -367,7 +368,8 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): inputs2 = [] for x in inputs: if isinstance(x, BooleanArray): - mask |= x._mask + if x._mask is not None: + mask |= x._mask inputs2.append(x._data) else: inputs2.append(x) @@ -389,7 +391,7 @@ def reconstruct(x): else: return reconstruct(result) - def _coerce_to_array(self, value) -> tuple[np.ndarray, np.ndarray]: + def _coerce_to_array(self, value) -> tuple[np.ndarray, np.ndarray | None]: return coerce_to_array(value) def astype(self, dtype, copy: bool = True) -> ArrayLike: @@ -455,7 +457,8 @@ def _values_for_argsort(self) -> np.ndarray: ExtensionArray.argsort : Return the indices that would sort this array. 
""" data = self._data.copy() - data[self._mask] = -1 + if self._mask is not None: + data[self._mask] = -1 return data def any(self, *, skipna: bool = True, **kwargs): @@ -516,12 +519,13 @@ def any(self, *, skipna: bool = True, **kwargs): nv.validate_any((), kwargs) values = self._data.copy() - np.putmask(values, self._mask, False) + if self._mask is not None: + np.putmask(values, self._mask, False) result = values.any() if skipna: return result else: - if result or len(self) == 0 or not self._mask.any(): + if result or len(self) == 0 or not self._hasna: return result else: return self.dtype.na_value @@ -582,13 +586,14 @@ def all(self, *, skipna: bool = True, **kwargs): nv.validate_all((), kwargs) values = self._data.copy() - np.putmask(values, self._mask, True) + if self._mask is not None: + np.putmask(values, self._mask, True) result = values.all() if skipna: return result else: - if not result or len(self) == 0 or not self._mask.any(): + if not result or len(self) == 0 or not self._hasna: return result else: return self.dtype.na_value @@ -619,6 +624,10 @@ def _logical_method(self, other, op): if not other_is_scalar and len(self) != len(other): raise ValueError("Lengths must match to compare") + if self._mask is None and mask is None: + result = op(self._data, other) + return BooleanArray(result) + if op.__name__ in {"or_", "ror_"}: result, mask = ops.kleene_or(self._data, other, self._mask, mask) elif op.__name__ in {"and_", "rand_"}: @@ -626,9 +635,7 @@ def _logical_method(self, other, op): elif op.__name__ in {"xor", "rxor"}: result, mask = ops.kleene_xor(self._data, other, self._mask, mask) - # error: Argument 2 to "BooleanArray" has incompatible type "Optional[Any]"; - # expected "ndarray" - return BooleanArray(result, mask) # type: ignore[arg-type] + return BooleanArray(result, mask) def _cmp_method(self, other, op): from pandas.arrays import ( @@ -666,9 +673,10 @@ def _cmp_method(self, other, op): # nans propagate if mask is None: - mask = self._mask.copy() 
+ mask = self._copy_mask() else: - mask = self._mask | mask + if self._mask is not None: + mask = self._mask | mask return BooleanArray(result, mask, copy=False) @@ -688,11 +696,13 @@ def _arith_method(self, other, op): # nans propagate if mask is None: - mask = self._mask if other is libmissing.NA: - mask |= True + mask = np.ones_like(self._data, dtype=np.bool_) + else: + mask = self._mask else: - mask = self._mask | mask + if self._mask is not None: + mask = self._mask | mask if other is libmissing.NA: # if other is NA, the result will be all NA and we can't run the @@ -733,7 +743,7 @@ def _maybe_mask_result(self, result, mask, other, op_name: str): Parameters ---------- result : array-like - mask : array-like bool + mask : array-like bool or None other : scalar or array-like op_name : str """ @@ -755,5 +765,6 @@ def _maybe_mask_result(self, result, mask, other, op_name: str): return IntegerArray(result, mask, copy=False) else: - result[mask] = np.nan + if mask is not None: + result[mask] = np.nan return result diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 1acbcf17dfffd..83826f2d5b13d 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -124,7 +124,8 @@ def coerce_to_array( if copy: values = values.copy() - mask = mask.copy() + if mask is not None: + mask = mask.copy() return values, mask values = np.array(values, copy=copy) @@ -246,7 +247,9 @@ class FloatingArray(NumericArray): def dtype(self) -> FloatingDtype: return FLOAT_STR_TO_DTYPE[str(self._data.dtype)] - def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): + def __init__( + self, values: np.ndarray, mask: np.ndarray | None = None, copy: bool = False + ): if not (isinstance(values, np.ndarray) and values.dtype.kind == "f"): raise TypeError( "values should be floating numpy array. 
Use " @@ -268,7 +271,7 @@ def _from_sequence_of_strings( scalars = to_numeric(strings, errors="raise") return cls._from_sequence(scalars, dtype=dtype, copy=copy) - def _coerce_to_array(self, value) -> tuple[np.ndarray, np.ndarray]: + def _coerce_to_array(self, value) -> tuple[np.ndarray, np.ndarray | None]: return coerce_to_array(value, dtype=self.dtype) def astype(self, dtype, copy: bool = True) -> ArrayLike: @@ -360,9 +363,10 @@ def _cmp_method(self, other, op): # nans propagate if mask is None: - mask = self._mask.copy() + mask = self._copy_mask() else: - mask = self._mask | mask + if self._mask is not None: + mask = self._mask | mask return BooleanArray(result, mask) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index c9ba762a271bd..da730fcf990f6 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -137,7 +137,7 @@ def safe_cast(values, dtype, copy: bool): def coerce_to_array( values, dtype, mask=None, copy: bool = False -) -> tuple[np.ndarray, np.ndarray]: +) -> tuple[np.ndarray, np.ndarray | None]: """ Coerce the input values array to numpy arrays with a mask @@ -179,7 +179,8 @@ def coerce_to_array( if copy: values = values.copy() - mask = mask.copy() + if mask is not None: + mask = mask.copy() return values, mask values = np.array(values, copy=copy) @@ -308,7 +309,9 @@ class IntegerArray(NumericArray): def dtype(self) -> _IntegerDtype: return INT_STR_TO_DTYPE[str(self._data.dtype)] - def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): + def __init__( + self, values: np.ndarray, mask: np.ndarray | None = None, copy: bool = False + ): if not (isinstance(values, np.ndarray) and values.dtype.kind in ["i", "u"]): raise TypeError( "values should be integer numpy array. 
Use " @@ -330,7 +333,7 @@ def _from_sequence_of_strings( scalars = to_numeric(strings, errors="raise") return cls._from_sequence(scalars, dtype=dtype, copy=copy) - def _coerce_to_array(self, value) -> tuple[np.ndarray, np.ndarray]: + def _coerce_to_array(self, value) -> tuple[np.ndarray, np.ndarray | None]: return coerce_to_array(value, dtype=self.dtype) def astype(self, dtype, copy: bool = True) -> ArrayLike: @@ -390,7 +393,7 @@ def _values_for_argsort(self) -> np.ndarray: ExtensionArray.argsort : Return the indices that would sort this array. """ data = self._data.copy() - if self._mask.any(): + if self._mask is not None: data[self._mask] = data.min() - 1 return data @@ -433,9 +436,10 @@ def _cmp_method(self, other, op): # nans propagate if mask is None: - mask = self._mask.copy() + mask = self._copy_mask() else: - mask = self._mask | mask + if self._mask is not None: + mask = self._mask | mask return BooleanArray(result, mask) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index d274501143916..555138e6475a5 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -112,21 +112,29 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): # The value used to fill '_data' to avoid upcasting _internal_fill_value: Scalar - def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): + def __init__( + self, values: np.ndarray, mask: np.ndarray | None = None, copy: bool = False + ): # values is supposed to already be validated in the subclass - if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): - raise TypeError( - "mask should be boolean numpy array. Use " - "the 'pd.array' function instead" - ) + if mask is not None: + if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): + raise TypeError( + "mask should be boolean numpy array. 
Use " + "the 'pd.array' function instead" + ) + if mask.ndim != 1: + raise ValueError("mask must be a 1D array") + + if not mask.any(): + mask = None + if values.ndim != 1: raise ValueError("values must be a 1D array") - if mask.ndim != 1: - raise ValueError("mask must be a 1D array") if copy: values = values.copy() - mask = mask.copy() + if mask is not None: + mask = mask.copy() self._data = values self._mask = mask @@ -137,13 +145,15 @@ def dtype(self) -> BaseMaskedDtype: def __getitem__(self, item: PositionalIndexer) -> BaseMaskedArray | Any: if is_integer(item): - if self._mask[item]: + if self._mask is not None and self._mask[item]: return self.dtype.na_value return self._data[item] item = check_array_indexer(self, item) - - return type(self)(self._data[item], self._mask[item]) + if self._mask is not None: + return type(self)(self._data[item], self._mask[item]) + else: + return type(self)(self._data[item]) @doc(ExtensionArray.fillna) def fillna( @@ -159,9 +169,10 @@ def fillna( f"Length of 'value' does not match. 
Got ({len(value)}) " f" expected {len(self)}" ) - value = value[mask] + if mask is not None: + value = value[mask] - if mask.any(): + if mask is not None: if method is not None: func = missing.get_fill_func(method) new_values, new_mask = func( @@ -178,7 +189,7 @@ def fillna( new_values = self.copy() return new_values - def _coerce_to_array(self, values) -> tuple[np.ndarray, np.ndarray]: + def _coerce_to_array(self, values) -> tuple[np.ndarray, np.ndarray | None]: raise AbstractMethodError(self) def __setitem__(self, key, value) -> None: @@ -189,15 +200,37 @@ def __setitem__(self, key, value) -> None: if _is_scalar: value = value[0] - mask = mask[0] + if mask is not None: + mask = mask[0] key = check_array_indexer(self, key) self._data[key] = value - self._mask[key] = mask + + updated_mask = False + if mask is not None: + if self._mask is None: + self._mask = np.zeros_like(self._data, dtype=np.bool_) + self._mask[key] = mask + updated_mask = True + + # Mask is all False + else: + # So only need to update the mask if it exists (since otherwise already all + # False) + if self._mask is not None: + self._mask[key] = False + updated_mask = True + + # Maintain the invariant that self._mask is None whenever it + # contains no True values (a non-None mask always flags an NA) + if updated_mask: + assert self._mask is not None + if not self._mask.any(): + self._mask = None def __iter__(self): for i in range(len(self)): - if self._mask[i]: + if self._mask is not None and self._mask[i]: yield self.dtype.na_value else: yield self._data[i] @@ -206,7 +239,10 @@ def __len__(self) -> int: return len(self._data) def __invert__(self: BaseMaskedArrayT) -> BaseMaskedArrayT: - return type(self)(~self._data, self._mask.copy()) + if self._mask is not None: + return type(self)(~self._data, self._mask.copy()) + else: + return type(self)(~self._data) # error: Argument 1 of "to_numpy" is incompatible with supertype "ExtensionArray"; # supertype defines the argument type as "Union[ExtensionDtype, str, dtype[Any], @@ -283,6 +319,7
@@ def to_numpy( # type: ignore[override] # error: Incompatible types in assignment (expression has type # "Type[object]", variable has type "Union[str, dtype[Any], None]") dtype = object # type: ignore[assignment] + if self._hasna: if ( not is_object_dtype(dtype) @@ -315,7 +352,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: data = self._data.astype(dtype.numpy_dtype, copy=copy) # mask is copied depending on whether the data was copied, and # not directly depending on the `copy` keyword - mask = self._mask if data is self._data else self._mask.copy() + mask = self._mask if data is self._data else self._copy_mask() cls = dtype.construct_array_type() return cls(data, mask, copy=False) @@ -344,30 +381,41 @@ def __arrow_array__(self, type=None): @property def _hasna(self) -> bool: - # Note: this is expensive right now! The hope is that we can - # make this faster by having an optional mask, but not have to change - # source code using it.. - - # error: Incompatible return value type (got "bool_", expected "bool") - return self._mask.any() # type: ignore[return-value] + return self._mask is not None def isna(self) -> np.ndarray: - return self._mask.copy() + if self._mask is not None: + return self._mask.copy() + else: + return np.zeros_like(self._data, dtype=np.bool_) @property def _na_value(self): return self.dtype.na_value + @property + def _mask_as_ndarray(self) -> np.ndarray: + """Convert the mask to an ndarray[bool]""" + if self._mask is not None: + return self._mask + else: + return np.zeros_like(self._data, dtype=np.bool_) + + def _copy_mask(self) -> np.ndarray | None: + """Copy the mask if it exists, otherwise return None""" + return self._mask.copy() if self._mask is not None else None + @property def nbytes(self) -> int: - return self._data.nbytes + self._mask.nbytes + mask_size = self._mask.nbytes if self._mask is not None else None.__sizeof__() + return self._data.nbytes + mask_size @classmethod def _concat_same_type( cls: 
type[BaseMaskedArrayT], to_concat: Sequence[BaseMaskedArrayT] ) -> BaseMaskedArrayT: data = np.concatenate([x._data for x in to_concat]) - mask = np.concatenate([x._mask for x in to_concat]) + mask = np.concatenate([x._mask_as_ndarray for x in to_concat]) return cls(data, mask) def take( @@ -384,7 +432,9 @@ def take( self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill ) - mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill) + mask = take( + self._mask_as_ndarray, indexer, fill_value=True, allow_fill=allow_fill + ) # if we are filling # we only fill where the indexer is null @@ -404,20 +454,17 @@ def isin(self, values) -> BooleanArray: # type: ignore[override] from pandas.core.arrays import BooleanArray result = isin(self._data, values) - if self._hasna: + if self._mask is not None: if libmissing.NA in values: result += self._mask else: result *= np.invert(self._mask) - # error: No overload variant of "zeros_like" matches argument types - # "BaseMaskedArray", "Type[bool]" - mask = np.zeros_like(self, dtype=bool) # type: ignore[call-overload] - return BooleanArray(result, mask, copy=False) + + return BooleanArray(result, copy=False) def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT: - data, mask = self._data, self._mask - data = data.copy() - mask = mask.copy() + data = self._data.copy() + mask = self._copy_mask() return type(self)(data, mask, copy=False) @doc(ExtensionArray.factorize) @@ -429,14 +476,9 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: # the hashtables don't handle all different types of bits uniques = uniques.astype(self.dtype.numpy_dtype, copy=False) - # error: Incompatible types in assignment (expression has type - # "BaseMaskedArray", variable has type "ndarray") - uniques = type(self)( # type: ignore[assignment] - uniques, np.zeros(len(uniques), dtype=bool) - ) - # error: Incompatible return value type (got "Tuple[ndarray, ndarray]", - # expected "Tuple[ndarray, 
ExtensionArray]") - return codes, uniques # type: ignore[return-value] + uniques_masked_arr = type(self)(uniques) + + return codes, uniques_masked_arr def value_counts(self, dropna: bool = True) -> Series: """ @@ -462,7 +504,7 @@ def value_counts(self, dropna: bool = True) -> Series: from pandas.arrays import IntegerArray # compute counts on the data with no nans - data = self._data[~self._mask] + data = self._data[~self._mask] if self._mask is not None else self._data value_counts = Index(data).value_counts() # TODO(extension) @@ -476,15 +518,14 @@ def value_counts(self, dropna: bool = True) -> Series: else: counts = np.empty(len(value_counts) + 1, dtype="int64") counts[:-1] = value_counts - counts[-1] = self._mask.sum() + counts[-1] = self._mask.sum() if self._mask is not None else 0 index = Index( np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]), dtype=object, ) - mask = np.zeros(len(counts), dtype="bool") - counts = IntegerArray(counts, mask) + counts = IntegerArray(counts) return Series(counts, index=index) diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index bc467e93c2c2c..db9e46f1e8224 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -112,15 +112,25 @@ def _arith_method(self, other, op): raise TypeError("can only perform ops with numeric values") if omask is None: - mask = self._mask.copy() + if self._mask is not None: + mask = self._mask.copy() + else: + mask = np.zeros_like(self._data, dtype=np.bool_) if other is libmissing.NA: mask |= True else: - mask = self._mask | omask + if self._mask is None: + # zeros | omask = omask + mask = omask + else: + mask = self._mask | omask if op_name == "pow": # 1 ** x is 1. - mask = np.where((self._data == 1) & ~self._mask, False, mask) + if self._mask is not None: + mask = np.where((self._data == 1) & ~self._mask, False, mask) + else: + mask = np.where(self._data == 1, False, mask) # x ** 0 is 1. 
if omask is not None: mask = np.where((other == 0) & ~omask, False, mask) @@ -134,7 +144,10 @@ def _arith_method(self, other, op): elif other is not libmissing.NA: mask = np.where(other == 1, False, mask) # x ** 0 is 1. - mask = np.where((self._data == 0) & ~self._mask, False, mask) + if self._mask is not None: + mask = np.where((self._data == 0) & ~self._mask, False, mask) + else: + mask = np.where(self._data == 0, False, mask) if other is libmissing.NA: result = np.ones_like(self._data) @@ -177,7 +190,8 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): inputs2: list[Any] = [] for x in inputs: if isinstance(x, NumericArray): - mask |= x._mask + if x._mask is not None: + mask |= x._mask inputs2.append(x._data) else: inputs2.append(x) @@ -207,13 +221,13 @@ def reconstruct(x): return reconstruct(result) def __neg__(self): - return type(self)(-self._data, self._mask.copy()) + return type(self)(-self._data, self._copy_mask()) def __pos__(self): return self def __abs__(self): - return type(self)(abs(self._data), self._mask.copy()) + return type(self)(abs(self._data), self._copy_mask()) def round(self: T, decimals: int = 0, *args, **kwargs) -> T: """ @@ -241,4 +255,4 @@ def round(self: T, decimals: int = 0, *args, **kwargs) -> T: """ nv.validate_round(args, kwargs) values = np.round(self._data, decimals=decimals, **kwargs) - return type(self)(values, self._mask.copy()) + return type(self)(values, self._copy_mask()) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 8d150c8f6ad3d..2e9afa1fab555 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -340,7 +340,8 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): na_values = scalars._mask result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - result[na_values] = StringDtype.na_value + if na_values is not None: + result[na_values] = StringDtype.na_value else: # 
convert non-na-likes to str, and nan-likes to StringDtype.na_value diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 433d45d94167d..d0e34ea417d26 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -2173,7 +2173,7 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool: elif not isinstance(tipo, np.dtype): # i.e. nullable IntegerDtype; we can put this into an ndarray # losslessly iff it has no NAs - return not element._mask.any() + return not element._hasna return True # We have not inferred an integer from the dtype @@ -2189,7 +2189,7 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool: elif not isinstance(tipo, np.dtype): # i.e. nullable IntegerDtype or FloatingDtype; # we can put this into an ndarray losslessly iff it has no NAs - return not element._mask.any() + return not element._hasna return True return lib.is_integer(element) or lib.is_float(element) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b65f26c7174fc..3bb1390c69a18 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -443,10 +443,13 @@ def _masked_ea_wrap_cython_operation( and cython algorithms which accept a mask. 
""" orig_values = values + arr = values._data # Copy to ensure input and result masks don't end up shared - mask = values._mask.copy() - arr = values._data + if values._mask is not None: + mask = values._mask.copy() + else: + mask = np.zeros_like(arr, dtype=np.bool_) res_values = self._cython_op_ndim_compat( arr, diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 6dfd67f5dc5ec..27b944210b417 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -166,10 +166,14 @@ def to_numeric(arg, errors="raise", downcast=None): # GH33013: for IntegerArray & FloatingArray extract non-null values for casting # save mask to reconstruct the full array after casting + is_numeric_array = isinstance(values, NumericArray) mask: np.ndarray | None = None - if isinstance(values, NumericArray): + if is_numeric_array: mask = values._mask - values = values._data[~mask] + if mask is not None: + values = values._data[~mask] + else: + values = values._data values_dtype = getattr(values, "dtype", None) if is_numeric_dtype(values_dtype): @@ -218,17 +222,20 @@ def to_numeric(arg, errors="raise", downcast=None): break # GH33013: for IntegerArray & FloatingArray need to reconstruct masked array - if mask is not None: - data = np.zeros(mask.shape, dtype=values.dtype) - data[~mask] = values - + if is_numeric_array: from pandas.core.arrays import ( FloatingArray, IntegerArray, ) - klass = IntegerArray if is_integer_dtype(data.dtype) else FloatingArray - values = klass(data, mask.copy()) + klass = IntegerArray if is_integer_dtype(values.dtype) else FloatingArray + + if mask is None: + values = klass(values) + else: + data = np.zeros(mask.shape, dtype=values.dtype) + data[~mask] = values + values = klass(data, mask.copy()) if is_series: return arg._constructor(values, index=arg.index, name=arg.name) diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py index c9e96c437964f..ea4f2976cf3f4 100644 
--- a/pandas/tests/arrays/boolean/test_construction.py +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -24,9 +24,6 @@ def test_boolean_array_constructor(): with pytest.raises(TypeError, match="values should be boolean numpy array"): BooleanArray(values.astype(int), mask) - with pytest.raises(TypeError, match="mask should be boolean numpy array"): - BooleanArray(values, None) - with pytest.raises(ValueError, match="values must be a 1D array"): BooleanArray(values.reshape(1, -1), mask) @@ -34,6 +31,12 @@ def test_boolean_array_constructor(): BooleanArray(values, mask.reshape(1, -1)) +def test_boolean_array_constructor_no_mask(): + result = BooleanArray(np.array([True, False, True, False], dtype="bool")) + expected = pd.array([True, False, True, False], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + def test_boolean_array_constructor_copy(): values = np.array([True, False, True, False], dtype="bool") mask = np.array([False, False, False, True], dtype="bool") diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index 4ce3dd35b538b..85706a7d7e242 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ b/pandas/tests/arrays/floating/test_construction.py @@ -35,9 +35,11 @@ def test_floating_array_constructor(): with pytest.raises(TypeError, match=msg): FloatingArray(values.astype(int), mask) - msg = r"__init__\(\) missing 1 required positional argument: 'mask'" - with pytest.raises(TypeError, match=msg): - FloatingArray(values) + +def test_floating_array_constructor_no_mask(): + result = FloatingArray(np.array([1, 2, 3, 4], dtype="float64")) + expected = pd.array([1, 2, 3, 4], dtype="Float64") + tm.assert_extension_array_equal(result, expected) def test_floating_array_constructor_copy(): diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py index b48567d37ecaf..ba02d5e3fe54e 100644 --- 
a/pandas/tests/arrays/integer/test_construction.py +++ b/pandas/tests/arrays/integer/test_construction.py @@ -86,9 +86,13 @@ def test_integer_array_constructor(): with pytest.raises(TypeError, match=msg): IntegerArray(values.astype(float), mask) - msg = r"__init__\(\) missing 1 required positional argument: 'mask'" - with pytest.raises(TypeError, match=msg): - IntegerArray(values) + + +def test_integer_array_constructor_no_mask(): + # Omitting the mask must give the same array as pd.array with no missing values + result = IntegerArray(np.array([1, 2, 0, 0], dtype="int64")) + expected = pd.array([1, 2, 0, 0], dtype="Int64") + tm.assert_extension_array_equal(result, expected) def test_integer_array_constructor_copy():