Skip to content

BUG: reindex with fill_value that should give EA dtype #52586

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Apr 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,8 @@ Other
- Bug in :meth:`Series.memory_usage` when ``deep=True`` throwing an error with a Series of objects and returning an incorrect value, as it did not take into account GC corrections (:issue:`51858`)
- Bug in :func:`assert_frame_equal` checks category dtypes even when asked not to check index type (:issue:`52126`)
- Bug in :meth:`Series.map` when giving a callable to an empty series, the returned series had ``object`` dtype. It now keeps the original dtype (:issue:`52384`)
- Bug in :meth:`DataFrame.reindex` with a ``fill_value`` that should be inferred as having an :class:`ExtensionDtype` incorrectly inferring ``object`` dtype (:issue:`52586`)
-

.. ***DO NOT USE THIS SECTION***

Expand Down
49 changes: 7 additions & 42 deletions pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from typing import (
TYPE_CHECKING,
Sequence,
cast,
)

import numpy as np
Expand All @@ -29,20 +28,14 @@
needs_i8_conversion,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
ExtensionDtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.missing import (
is_valid_na_for_dtype,
isna,
isna_all,
)

from pandas.core.arrays import (
DatetimeArray,
ExtensionArray,
)
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.sparse import SparseDtype
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.internals.array_manager import (
Expand All @@ -53,7 +46,10 @@
ensure_block_shape,
new_block_2d,
)
from pandas.core.internals.managers import BlockManager
from pandas.core.internals.managers import (
BlockManager,
make_na_array,
)

if TYPE_CHECKING:
from pandas._typing import (
Expand Down Expand Up @@ -474,38 +470,7 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
if len(values) and values[0] is None:
fill_value = None

if isinstance(empty_dtype, DatetimeTZDtype):
# NB: exclude e.g. pyarrow[dt64tz] dtypes
i8values = np.full(self.block.shape, fill_value._value)
return DatetimeArray(i8values, dtype=empty_dtype)

elif is_1d_only_ea_dtype(empty_dtype):
empty_dtype = cast(ExtensionDtype, empty_dtype)
cls = empty_dtype.construct_array_type()

missing_arr = cls._from_sequence([], dtype=empty_dtype)
ncols, nrows = self.block.shape
assert ncols == 1, ncols
empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
return missing_arr.take(
empty_arr, allow_fill=True, fill_value=fill_value
)
elif isinstance(empty_dtype, ExtensionDtype):
# TODO: no tests get here, a handful would if we disabled
# the dt64tz special-case above (which is faster)
cls = empty_dtype.construct_array_type()
missing_arr = cls._empty(shape=self.block.shape, dtype=empty_dtype)
missing_arr[:] = fill_value
return missing_arr
else:
# NB: we should never get here with empty_dtype integer or bool;
# if we did, the missing_arr.fill would cast to gibberish
missing_arr = np.empty(self.block.shape, dtype=empty_dtype)
missing_arr.fill(fill_value)

if empty_dtype.kind in "mM":
missing_arr = ensure_wrapped_if_datetimelike(missing_arr)
return missing_arr
return make_na_array(empty_dtype, self.block.shape, fill_value)

if not self.block._can_consolidate:
# preserve these for validation in concat_compat
Expand Down
54 changes: 43 additions & 11 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@
is_dtype_equal,
is_list_like,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
ExtensionDtype,
)
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCSeries,
Expand All @@ -49,6 +52,7 @@
)

import pandas.core.algorithms as algos
from pandas.core.arrays import DatetimeArray
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.arrays.sparse import SparseDtype
import pandas.core.common as com
Expand Down Expand Up @@ -915,16 +919,11 @@ def _make_na_block(

if fill_value is None:
fill_value = np.nan
block_shape = list(self.shape)
block_shape[0] = len(placement)

dtype, fill_value = infer_dtype_from_scalar(fill_value)
# error: Argument "dtype" to "empty" has incompatible type "Union[dtype,
# ExtensionDtype]"; expected "Union[dtype, None, type, _SupportsDtype, str,
# Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any], _DtypeDict,
# Tuple[Any, Any]]"
block_values = np.empty(block_shape, dtype=dtype) # type: ignore[arg-type]
block_values.fill(fill_value)

shape = (len(placement), self.shape[1])

dtype, fill_value = infer_dtype_from_scalar(fill_value, pandas_dtype=True)
block_values = make_na_array(dtype, shape, fill_value)
return new_block_2d(block_values, placement=placement)

def take(
Expand Down Expand Up @@ -2359,3 +2358,36 @@ def _preprocess_slice_or_indexer(
if not allow_fill:
indexer = maybe_convert_indices(indexer, length)
return "fancy", indexer, len(indexer)


def make_na_array(dtype: DtypeObj, shape: Shape, fill_value) -> ArrayLike:
    """
    Build an array of the requested dtype and shape filled with ``fill_value``.

    Parameters
    ----------
    dtype : np.dtype or ExtensionDtype
        Dtype of the array to create.
    shape : tuple of int
        Shape of the array to create. For 1D-only extension dtypes this is
        unpacked as ``(ncols, nrows)`` and ``ncols`` must equal 1.
    fill_value : scalar
        Value placed in every position. For a ``DatetimeTZDtype`` the
        scalar's ``_value`` attribute is used as the underlying integer.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    if isinstance(dtype, DatetimeTZDtype):
        # NB: exclude e.g. pyarrow[dt64tz] dtypes
        filled_i8 = np.full(shape, fill_value._value)
        return DatetimeArray(filled_i8, dtype=dtype)

    if is_1d_only_ea_dtype(dtype):
        dtype = cast(ExtensionDtype, dtype)
        array_type = dtype.construct_array_type()

        # Start from a zero-length array and let ``take`` with an all ``-1``
        # indexer produce ``nrows`` entries equal to ``fill_value``.
        seed = array_type._from_sequence([], dtype=dtype)
        ncols, nrows = shape
        assert ncols == 1, ncols
        all_missing = np.full((nrows,), -1, dtype=np.intp)
        return seed.take(all_missing, allow_fill=True, fill_value=fill_value)

    if isinstance(dtype, ExtensionDtype):
        # TODO: no tests get here, a handful would if we disabled
        #  the dt64tz special-case above (which is faster)
        array_type = dtype.construct_array_type()
        result = array_type._empty(shape=shape, dtype=dtype)
        result[:] = fill_value
        return result

    # NB: we should never get here with dtype integer or bool;
    #  if we did, the ndarray.fill below would cast to gibberish
    result = np.empty(shape, dtype=dtype)
    result.fill(fill_value)

    if dtype.kind in "mM":
        # hand back datetime64/timedelta64 data wrapped rather than as a raw ndarray
        result = ensure_wrapped_if_datetimelike(result)
    return result
23 changes: 23 additions & 0 deletions pandas/tests/frame/methods/test_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,29 @@ class TestDataFrameSelectReindex:
# These are specific reindex-based tests; other indexing tests should go in
# test_indexing

@td.skip_array_manager_not_yet_implemented
def test_reindex_tzaware_fill_value(self):
    # GH#52586
    # Reindexing columns with a tz-aware Timestamp, Period or Interval
    # fill_value should give the new column the matching dtype.
    frame = DataFrame([[1]])

    stamp = pd.Timestamp("2023-04-10 17:32", tz="US/Pacific")
    period = stamp.tz_localize(None).to_period("s")
    interval = pd.Interval(stamp, stamp + pd.Timedelta(seconds=1))

    # (fill_value, factory for the dtype expected in the new column)
    cases = [
        (stamp, lambda: pd.DatetimeTZDtype(tz="US/Pacific")),
        (period, lambda: pd.PeriodDtype("s")),
        (
            interval,
            lambda: pd.IntervalDtype("datetime64[ns, US/Pacific]", "right"),
        ),
    ]

    for fill, make_dtype in cases:
        result = frame.reindex([0, 1], axis=1, fill_value=fill)
        assert result.dtypes[1] == make_dtype()
        expected = DataFrame({0: [1], 1: [fill]})
        tm.assert_frame_equal(result, expected)

def test_reindex_copies(self):
# based on asv time_reindex_axis1
N = 10
Expand Down