diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 647698f472978..90126d3c4df37 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -41,7 +41,7 @@ from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_inclusive -from pandas.core.dtypes.cast import astype_dt64_to_dt64tz +from pandas.core.dtypes.astype import astype_dt64_to_dt64tz from pandas.core.dtypes.common import ( DT64NS_DTYPE, INT64_DTYPE, diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index e1f80c5894bb1..25ddf04dc1c99 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -16,7 +16,7 @@ ) from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.astype import astype_nansafe from pandas.core.dtypes.common import ( is_bool_dtype, is_datetime64_dtype, diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index c861abfc7920d..8f3d9f94769bc 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -44,8 +44,8 @@ from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_insert_loc +from pandas.core.dtypes.astype import astype_nansafe from pandas.core.dtypes.cast import ( - astype_nansafe, construct_1d_arraylike_from_scalar, find_common_type, maybe_box_datetimelike, diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index f1da2421c4106..89c6f4da98f30 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -18,11 +18,11 @@ from pandas.errors import PerformanceWarning from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.astype import astype_nansafe from pandas.core.dtypes.base import ( ExtensionDtype, register_extension_dtype, ) -from pandas.core.dtypes.cast import astype_nansafe from 
pandas.core.dtypes.common import ( is_bool_dtype, is_object_dtype, diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 79387cc2584da..93255f6677c55 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -38,7 +38,7 @@ from pandas.compat.numpy import function as nv from pandas.util._validators import validate_endpoints -from pandas.core.dtypes.cast import astype_td64_unit_conversion +from pandas.core.dtypes.astype import astype_td64_unit_conversion from pandas.core.dtypes.common import ( DT64NS_DTYPE, TD64NS_DTYPE, diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py new file mode 100644 index 0000000000000..f096afc1d0f4b --- /dev/null +++ b/pandas/core/dtypes/astype.py @@ -0,0 +1,418 @@ +""" +Functions for implementing 'astype' methods according to pandas conventions, +particularly ones that differ from numpy. +""" +from __future__ import annotations + +import inspect +from typing import ( + TYPE_CHECKING, + cast, + overload, +) +import warnings + +import numpy as np + +from pandas._libs import lib +from pandas._typing import ( + ArrayLike, + DtypeObj, +) +from pandas.errors import IntCastingNaNError +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import ( + is_datetime64_dtype, + is_datetime64tz_dtype, + is_dtype_equal, + is_object_dtype, + is_timedelta64_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + ExtensionDtype, + PandasDtype, +) +from pandas.core.dtypes.missing import isna + +if TYPE_CHECKING: + from pandas.core.arrays import ( + DatetimeArray, + ExtensionArray, + ) + + +_dtype_obj = np.dtype(object) + + +@overload +def astype_nansafe( + arr: np.ndarray, dtype: np.dtype, copy: bool = ..., skipna: bool = ... +) -> np.ndarray: + ... + + +@overload +def astype_nansafe( + arr: np.ndarray, dtype: ExtensionDtype, copy: bool = ..., skipna: bool = ... +) -> ExtensionArray: + ... 
+ + +def astype_nansafe( + arr: np.ndarray, dtype: DtypeObj, copy: bool = True, skipna: bool = False +) -> ArrayLike: + """ + Cast the elements of an array to a given dtype in a nan-safe manner. + + Parameters + ---------- + arr : ndarray + dtype : np.dtype or ExtensionDtype + copy : bool, default True + If False, a view will be attempted but may fail, if + e.g. the item sizes don't align. + skipna: bool, default False + Whether or not we should skip NaN when casting as a string-type. + + Raises + ------ + ValueError + The dtype was a datetime64/timedelta64 dtype, but it had no unit. + """ + if arr.ndim > 1: + flat = arr.ravel() + result = astype_nansafe(flat, dtype, copy=copy, skipna=skipna) + # error: Item "ExtensionArray" of "Union[ExtensionArray, ndarray]" has no + # attribute "reshape" + return result.reshape(arr.shape) # type: ignore[union-attr] + + # We get here with 0-dim from sparse + arr = np.atleast_1d(arr) + + # dispatch on extension dtype if needed + if isinstance(dtype, ExtensionDtype): + return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy) + + elif not isinstance(dtype, np.dtype): # pragma: no cover + raise ValueError("dtype must be np.dtype or ExtensionDtype") + + if arr.dtype.kind in ["m", "M"] and ( + issubclass(dtype.type, str) or dtype == _dtype_obj + ): + from pandas.core.construction import ensure_wrapped_if_datetimelike + + arr = ensure_wrapped_if_datetimelike(arr) + return arr.astype(dtype, copy=copy) + + if issubclass(dtype.type, str): + return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False) + + elif is_datetime64_dtype(arr.dtype): + # Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[signedinteger[Any]]") + if dtype == np.int64: # type: ignore[comparison-overlap] + warnings.warn( + f"casting {arr.dtype} values to int64 with .astype(...) " + "is deprecated and will raise in a future version. " + "Use .view(...) 
instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if isna(arr).any(): + raise ValueError("Cannot convert NaT values to integer") + return arr.view(dtype) + + # allow frequency conversions + if dtype.kind == "M": + return arr.astype(dtype) + + raise TypeError(f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]") + + elif is_timedelta64_dtype(arr.dtype): + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[signedinteger[Any]]") + if dtype == np.int64: # type: ignore[comparison-overlap] + warnings.warn( + f"casting {arr.dtype} values to int64 with .astype(...) " + "is deprecated and will raise in a future version. " + "Use .view(...) instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if isna(arr).any(): + raise ValueError("Cannot convert NaT values to integer") + return arr.view(dtype) + + elif dtype.kind == "m": + return astype_td64_unit_conversion(arr, dtype, copy=copy) + + raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]") + + elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer): + return _astype_float_to_int_nansafe(arr, dtype, copy) + + elif is_object_dtype(arr.dtype): + + # work around NumPy brokenness, #1987 + if np.issubdtype(dtype.type, np.integer): + return lib.astype_intsafe(arr, dtype) + + # if we have a datetime/timedelta array of objects + # then coerce to a proper dtype and recall astype_nansafe + + elif is_datetime64_dtype(dtype): + from pandas import to_datetime + + return astype_nansafe( + to_datetime(arr).values, + dtype, + copy=copy, + ) + elif is_timedelta64_dtype(dtype): + from pandas import to_timedelta + + return astype_nansafe(to_timedelta(arr)._values, dtype, copy=copy) + + if dtype.name in ("datetime64", "timedelta64"): + msg = ( + f"The '{dtype.name}' dtype has no unit. Please pass in " + f"'{dtype.name}[ns]' instead." 
+ ) + raise ValueError(msg) + + if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype): + # Explicit copy, or required since NumPy can't view from / to object. + return arr.astype(dtype, copy=True) + + return arr.astype(dtype, copy=copy) + + +def _astype_float_to_int_nansafe( + values: np.ndarray, dtype: np.dtype, copy: bool +) -> np.ndarray: + """ + astype with a check preventing converting NaN to a meaningless integer value. + """ + if not np.isfinite(values).all(): + raise IntCastingNaNError( + "Cannot convert non-finite values (NA or inf) to integer" + ) + return values.astype(dtype, copy=copy) + + +def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> ArrayLike: + """ + Cast array (ndarray or ExtensionArray) to the new dtype. + + Parameters + ---------- + values : ndarray or ExtensionArray + dtype : dtype object + copy : bool, default False + copy if indicated + + Returns + ------- + ndarray or ExtensionArray + """ + if ( + values.dtype.kind in ["m", "M"] + and dtype.kind in ["i", "u"] + and isinstance(dtype, np.dtype) + and dtype.itemsize != 8 + ): + # TODO(2.0) remove special case once deprecation on DTA/TDA is enforced + msg = rf"cannot astype a datetimelike from [{values.dtype}] to [{dtype}]" + raise TypeError(msg) + + if is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype): + return astype_dt64_to_dt64tz(values, dtype, copy, via_utc=True) + + if is_dtype_equal(values.dtype, dtype): + if copy: + return values.copy() + return values + + if not isinstance(values, np.ndarray): + # i.e. 
ExtensionArray + values = values.astype(dtype, copy=copy) + + else: + values = astype_nansafe(values, dtype, copy=copy) + + # in pandas we don't store numpy str dtypes, so convert to object + if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str): + values = np.array(values, dtype=object) + + return values + + +def astype_array_safe( + values: ArrayLike, dtype, copy: bool = False, errors: str = "raise" +) -> ArrayLike: + """ + Cast array (ndarray or ExtensionArray) to the new dtype. + + This basically is the implementation for DataFrame/Series.astype and + includes all custom logic for pandas (NaN-safety, converting str to object, + rejecting datetimelike casts to integer dtypes that are not 8 bytes wide) + + Parameters + ---------- + values : ndarray or ExtensionArray + dtype : str, dtype convertible + copy : bool, default False + copy if indicated + errors : str, {'raise', 'ignore'}, default 'raise' + - ``raise`` : allow exceptions to be raised + - ``ignore`` : suppress exceptions. On error return original object + + Returns + ------- + ndarray or ExtensionArray + """ + errors_legal_values = ("raise", "ignore") + + if errors not in errors_legal_values: + invalid_arg = ( + "Expected value of kwarg 'errors' to be one of " + f"{list(errors_legal_values)}. Supplied value is '{errors}'" + ) + raise ValueError(invalid_arg) + + if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype): + msg = ( + f"Expected an instance of {dtype.__name__}, " + "but got the class instead. Try instantiating 'dtype'." + ) + raise TypeError(msg) + + dtype = pandas_dtype(dtype) + if isinstance(dtype, PandasDtype): + # Ensure we don't end up with a PandasArray + dtype = dtype.numpy_dtype + + try: + new_values = astype_array(values, dtype, copy=copy) + except (ValueError, TypeError): + # e.g. 
astype_nansafe can fail on object-dtype of strings + # trying to convert to float + if errors == "ignore": + new_values = values + else: + raise + + return new_values + + +def astype_td64_unit_conversion( + values: np.ndarray, dtype: np.dtype, copy: bool +) -> np.ndarray: + """ + By pandas convention, converting to non-nano timedelta64 + returns a float64-dtyped array with values representing multiples + of the desired timedelta unit (NaT becomes NaN). This is essentially division. + + Parameters + ---------- + values : np.ndarray[timedelta64[ns]] + dtype : np.dtype + timedelta64 with unit not-necessarily nano + copy : bool + + Returns + ------- + np.ndarray + """ + if is_dtype_equal(values.dtype, dtype): + if copy: + return values.copy() + return values + + # otherwise we are converting to non-nano + result = values.astype(dtype, copy=False) # avoid double-copying + result = result.astype(np.float64) + + mask = isna(values) + np.putmask(result, mask, np.nan) + return result + + +def astype_dt64_to_dt64tz( + values: ArrayLike, dtype: DtypeObj, copy: bool, via_utc: bool = False +) -> DatetimeArray: + # GH#33401 we have inconsistent behaviors between + # Datetimeindex[naive].astype(tzaware) + # Series[dt64].astype(tzaware) + # This collects them in one place to prevent further fragmentation. + + from pandas.core.construction import ensure_wrapped_if_datetimelike + + values = ensure_wrapped_if_datetimelike(values) + values = cast("DatetimeArray", values) + aware = isinstance(dtype, DatetimeTZDtype) + + if via_utc: + # Series.astype behavior + + # caller is responsible for checking this + assert values.tz is None and aware + dtype = cast(DatetimeTZDtype, dtype) + + if copy: + # this should be the only copy + values = values.copy() + + warnings.warn( + "Using .astype to convert from timezone-naive dtype to " + "timezone-aware dtype is deprecated and will raise in a " + "future version. 
Use ser.dt.tz_localize instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + # GH#33401 this doesn't match DatetimeArray.astype, which + # goes through the `not via_utc` path + return values.tz_localize("UTC").tz_convert(dtype.tz) + + else: + # DatetimeArray/DatetimeIndex.astype behavior + if values.tz is None and aware: + dtype = cast(DatetimeTZDtype, dtype) + warnings.warn( + "Using .astype to convert from timezone-naive dtype to " + "timezone-aware dtype is deprecated and will raise in a " + "future version. Use obj.tz_localize instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + return values.tz_localize(dtype.tz) + + elif aware: + # GH#18951: datetime64_tz dtype but not equal means different tz + dtype = cast(DatetimeTZDtype, dtype) + result = values.tz_convert(dtype.tz) + if copy: + result = result.copy() + return result + + elif values.tz is not None: + warnings.warn( + "Using .astype to convert from timezone-aware dtype to " + "timezone-naive dtype is deprecated and will raise in a " + "future version. 
Use obj.tz_localize(None) or " + "obj.tz_convert('UTC').tz_localize(None) instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + + result = values.tz_convert("UTC").tz_localize(None) + if copy: + result = result.copy() + return result + + raise NotImplementedError("dtype_equal case should be handled elsewhere") diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 14cd725c8f066..c98108775eb2c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -10,7 +10,6 @@ timedelta, ) import functools -import inspect from typing import ( TYPE_CHECKING, Any, @@ -43,6 +42,7 @@ from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg +from pandas.core.dtypes.astype import astype_nansafe from pandas.core.dtypes.common import ( DT64NS_DTYPE, TD64NS_DTYPE, @@ -77,7 +77,6 @@ DatetimeTZDtype, ExtensionDtype, IntervalDtype, - PandasDtype, PeriodDtype, ) from pandas.core.dtypes.generic import ( @@ -102,6 +101,7 @@ TimedeltaArray, ) + _int8_max = np.iinfo(np.int8).max _int16_max = np.iinfo(np.int16).max _int32_max = np.iinfo(np.int32).max @@ -948,377 +948,6 @@ def coerce_indexer_dtype(indexer, categories): return ensure_int64(indexer) -def astype_dt64_to_dt64tz( - values: ArrayLike, dtype: DtypeObj, copy: bool, via_utc: bool = False -) -> DatetimeArray: - # GH#33401 we have inconsistent behaviors between - # Datetimeindex[naive].astype(tzaware) - # Series[dt64].astype(tzaware) - # This collects them in one place to prevent further fragmentation. 
- - from pandas.core.construction import ensure_wrapped_if_datetimelike - - values = ensure_wrapped_if_datetimelike(values) - values = cast("DatetimeArray", values) - aware = isinstance(dtype, DatetimeTZDtype) - - if via_utc: - # Series.astype behavior - - # caller is responsible for checking this - assert values.tz is None and aware - dtype = cast(DatetimeTZDtype, dtype) - - if copy: - # this should be the only copy - values = values.copy() - - warnings.warn( - "Using .astype to convert from timezone-naive dtype to " - "timezone-aware dtype is deprecated and will raise in a " - "future version. Use ser.dt.tz_localize instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - # GH#33401 this doesn't match DatetimeArray.astype, which - # goes through the `not via_utc` path - return values.tz_localize("UTC").tz_convert(dtype.tz) - - else: - # DatetimeArray/DatetimeIndex.astype behavior - if values.tz is None and aware: - dtype = cast(DatetimeTZDtype, dtype) - warnings.warn( - "Using .astype to convert from timezone-naive dtype to " - "timezone-aware dtype is deprecated and will raise in a " - "future version. Use obj.tz_localize instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - return values.tz_localize(dtype.tz) - - elif aware: - # GH#18951: datetime64_tz dtype but not equal means different tz - dtype = cast(DatetimeTZDtype, dtype) - result = values.tz_convert(dtype.tz) - if copy: - result = result.copy() - return result - - elif values.tz is not None: - warnings.warn( - "Using .astype to convert from timezone-aware dtype to " - "timezone-naive dtype is deprecated and will raise in a " - "future version. 
Use obj.tz_localize(None) or " - "obj.tz_convert('UTC').tz_localize(None) instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - - result = values.tz_convert("UTC").tz_localize(None) - if copy: - result = result.copy() - return result - - raise NotImplementedError("dtype_equal case should be handled elsewhere") - - -def astype_td64_unit_conversion( - values: np.ndarray, dtype: np.dtype, copy: bool -) -> np.ndarray: - """ - By pandas convention, converting to non-nano timedelta64 - returns an int64-dtyped array with ints representing multiples - of the desired timedelta unit. This is essentially division. - - Parameters - ---------- - values : np.ndarray[timedelta64[ns]] - dtype : np.dtype - timedelta64 with unit not-necessarily nano - copy : bool - - Returns - ------- - np.ndarray - """ - if is_dtype_equal(values.dtype, dtype): - if copy: - return values.copy() - return values - - # otherwise we are converting to non-nano - result = values.astype(dtype, copy=False) # avoid double-copying - result = result.astype(np.float64) - - mask = isna(values) - np.putmask(result, mask, np.nan) - return result - - -@overload -def astype_nansafe( - arr: np.ndarray, dtype: np.dtype, copy: bool = ..., skipna: bool = ... -) -> np.ndarray: - ... - - -@overload -def astype_nansafe( - arr: np.ndarray, dtype: ExtensionDtype, copy: bool = ..., skipna: bool = ... -) -> ExtensionArray: - ... - - -def astype_nansafe( - arr: np.ndarray, dtype: DtypeObj, copy: bool = True, skipna: bool = False -) -> ArrayLike: - """ - Cast the elements of an array to a given dtype a nan-safe manner. - - Parameters - ---------- - arr : ndarray - dtype : np.dtype or ExtensionDtype - copy : bool, default True - If False, a view will be attempted but may fail, if - e.g. the item sizes don't align. - skipna: bool, default False - Whether or not we should skip NaN when casting as a string-type. - - Raises - ------ - ValueError - The dtype was a datetime64/timedelta64 dtype, but it had no unit. 
- """ - if arr.ndim > 1: - flat = arr.ravel() - result = astype_nansafe(flat, dtype, copy=copy, skipna=skipna) - # error: Item "ExtensionArray" of "Union[ExtensionArray, ndarray]" has no - # attribute "reshape" - return result.reshape(arr.shape) # type: ignore[union-attr] - - # We get here with 0-dim from sparse - arr = np.atleast_1d(arr) - - # dispatch on extension dtype if needed - if isinstance(dtype, ExtensionDtype): - return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy) - - elif not isinstance(dtype, np.dtype): # pragma: no cover - raise ValueError("dtype must be np.dtype or ExtensionDtype") - - if arr.dtype.kind in ["m", "M"] and ( - issubclass(dtype.type, str) or dtype == _dtype_obj - ): - from pandas.core.construction import ensure_wrapped_if_datetimelike - - arr = ensure_wrapped_if_datetimelike(arr) - return arr.astype(dtype, copy=copy) - - if issubclass(dtype.type, str): - return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False) - - elif is_datetime64_dtype(arr.dtype): - # Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[signedinteger[Any]]") - if dtype == np.int64: # type: ignore[comparison-overlap] - warnings.warn( - f"casting {arr.dtype} values to int64 with .astype(...) " - "is deprecated and will raise in a future version. " - "Use .view(...) 
instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if isna(arr).any(): - raise ValueError("Cannot convert NaT values to integer") - return arr.view(dtype) - - # allow frequency conversions - if dtype.kind == "M": - return arr.astype(dtype) - - raise TypeError(f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]") - - elif is_timedelta64_dtype(arr.dtype): - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[signedinteger[Any]]") - if dtype == np.int64: # type: ignore[comparison-overlap] - warnings.warn( - f"casting {arr.dtype} values to int64 with .astype(...) " - "is deprecated and will raise in a future version. " - "Use .view(...) instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if isna(arr).any(): - raise ValueError("Cannot convert NaT values to integer") - return arr.view(dtype) - - elif dtype.kind == "m": - return astype_td64_unit_conversion(arr, dtype, copy=copy) - - raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]") - - elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer): - return astype_float_to_int_nansafe(arr, dtype, copy) - - elif is_object_dtype(arr.dtype): - - # work around NumPy brokenness, #1987 - if np.issubdtype(dtype.type, np.integer): - return lib.astype_intsafe(arr, dtype) - - # if we have a datetime/timedelta array of objects - # then coerce to a proper dtype and recall astype_nansafe - - elif is_datetime64_dtype(dtype): - from pandas import to_datetime - - return astype_nansafe( - to_datetime(arr).values, - dtype, - copy=copy, - ) - elif is_timedelta64_dtype(dtype): - from pandas import to_timedelta - - return astype_nansafe(to_timedelta(arr)._values, dtype, copy=copy) - - if dtype.name in ("datetime64", "timedelta64"): - msg = ( - f"The '{dtype.name}' dtype has no unit. Please pass in " - f"'{dtype.name}[ns]' instead." 
- ) - raise ValueError(msg) - - if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype): - # Explicit copy, or required since NumPy can't view from / to object. - return arr.astype(dtype, copy=True) - - return arr.astype(dtype, copy=copy) - - -def astype_float_to_int_nansafe( - values: np.ndarray, dtype: np.dtype, copy: bool -) -> np.ndarray: - """ - astype with a check preventing converting NaN to an meaningless integer value. - """ - if not np.isfinite(values).all(): - raise IntCastingNaNError( - "Cannot convert non-finite values (NA or inf) to integer" - ) - return values.astype(dtype, copy=copy) - - -def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> ArrayLike: - """ - Cast array (ndarray or ExtensionArray) to the new dtype. - - Parameters - ---------- - values : ndarray or ExtensionArray - dtype : dtype object - copy : bool, default False - copy if indicated - - Returns - ------- - ndarray or ExtensionArray - """ - if ( - values.dtype.kind in ["m", "M"] - and dtype.kind in ["i", "u"] - and isinstance(dtype, np.dtype) - and dtype.itemsize != 8 - ): - # TODO(2.0) remove special case once deprecation on DTA/TDA is enforced - msg = rf"cannot astype a datetimelike from [{values.dtype}] to [{dtype}]" - raise TypeError(msg) - - if is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype): - return astype_dt64_to_dt64tz(values, dtype, copy, via_utc=True) - - if is_dtype_equal(values.dtype, dtype): - if copy: - return values.copy() - return values - - if not isinstance(values, np.ndarray): - # i.e. 
ExtensionArray - values = values.astype(dtype, copy=copy) - - else: - values = astype_nansafe(values, dtype, copy=copy) - - # in pandas we don't store numpy str dtypes, so convert to object - if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str): - values = np.array(values, dtype=object) - - return values - - -def astype_array_safe( - values: ArrayLike, dtype, copy: bool = False, errors: str = "raise" -) -> ArrayLike: - """ - Cast array (ndarray or ExtensionArray) to the new dtype. - - This basically is the implementation for DataFrame/Series.astype and - includes all custom logic for pandas (NaN-safety, converting str to object, - not allowing ) - - Parameters - ---------- - values : ndarray or ExtensionArray - dtype : str, dtype convertible - copy : bool, default False - copy if indicated - errors : str, {'raise', 'ignore'}, default 'raise' - - ``raise`` : allow exceptions to be raised - - ``ignore`` : suppress exceptions. On error return original object - - Returns - ------- - ndarray or ExtensionArray - """ - errors_legal_values = ("raise", "ignore") - - if errors not in errors_legal_values: - invalid_arg = ( - "Expected value of kwarg 'errors' to be one of " - f"{list(errors_legal_values)}. Supplied value is '{errors}'" - ) - raise ValueError(invalid_arg) - - if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype): - msg = ( - f"Expected an instance of {dtype.__name__}, " - "but got the class instead. Try instantiating 'dtype'." - ) - raise TypeError(msg) - - dtype = pandas_dtype(dtype) - if isinstance(dtype, PandasDtype): - # Ensure we don't end up with a PandasArray - dtype = dtype.numpy_dtype - - try: - new_values = astype_array(values, dtype, copy=copy) - except (ValueError, TypeError): - # e.g. 
astype_nansafe can fail on object-dtype of strings - # trying to convert to float - if errors == "ignore": - new_values = values - else: - raise - - return new_values - - def soft_convert_objects( values: np.ndarray, datetime: bool = True, diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 2dc4241c6a303..83c2668242129 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -15,10 +15,8 @@ ) from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.cast import ( - astype_array, - find_common_type, -) +from pandas.core.dtypes.astype import astype_array +from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( is_categorical_dtype, is_dtype_equal, diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 5cd4cc9dfaec2..4a93b46e02b0c 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -23,7 +23,7 @@ ) from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.astype import astype_nansafe from pandas.core.dtypes.common import ( is_dtype_equal, is_extension_array_dtype, diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index ec3a9e8b493e3..54fe10a799531 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -23,8 +23,8 @@ ) from pandas.util._validators import validate_bool_kwarg +from pandas.core.dtypes.astype import astype_array_safe from pandas.core.dtypes.cast import ( - astype_array_safe, ensure_dtype_can_hold_na, infer_dtype_from_scalar, soft_convert_objects, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 9f242226739ec..c11e80a3aa4b1 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -35,8 +35,8 @@ from pandas.util._exceptions import find_stack_level from 
pandas.util._validators import validate_bool_kwarg +from pandas.core.dtypes.astype import astype_array_safe from pandas.core.dtypes.cast import ( - astype_array_safe, can_hold_element, find_common_type, infer_dtype_from, diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 318dd659d46bf..950ecca41e9c8 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -41,7 +41,7 @@ ) from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.astype import astype_nansafe from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index e2c5f893b6a2c..b58d10978d408 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -7,7 +7,7 @@ import pandas.util._test_decorators as td -from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.astype import astype_nansafe import pandas.core.dtypes.common as com from pandas.core.dtypes.dtypes import ( CategoricalDtype,