Skip to content

PERF: avoid unnecessary check in concat #52535

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 8, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 17 additions & 29 deletions pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from pandas._libs import (
NaT,
internals as libinternals,
lib,
)
from pandas._libs.missing import NA
from pandas.util._decorators import cache_readonly
Expand Down Expand Up @@ -403,56 +404,41 @@ def __init__(self, block: Block, shape: Shape, indexers=None) -> None:
# Note: block is None implies indexers is None, but not vice-versa
if indexers is None:
indexers = {}
# Otherwise we may have only {0: np.array(...)} and only non-negative
# entries.
self.block = block
self.indexers = indexers
self.shape = shape

def __repr__(self) -> str:
    """Return ``ClassName(<repr of block>, <indexers>)`` for debugging."""
    cls_name = type(self).__name__
    return f"{cls_name}({self.block!r}, {self.indexers})"

@cache_readonly
def needs_filling(self) -> bool:
    """
    Report whether any indexer stored in ``self.indexers`` contains ``-1``.

    Returns
    -------
    bool
        True as soon as one indexer has at least one entry equal to -1,
        False otherwise (including when ``self.indexers`` is empty).
    """
    # NOTE(review): -1 presumably marks a position with no source value
    # that has to be filled -- confirm against the code building indexers.
    # FIXME: cache results of indexer == -1 checks.
    return any((idx == -1).any() for idx in self.indexers.values())

@cache_readonly
def dtype(self) -> DtypeObj:
    """
    Return the dtype this unit contributes.

    Returns
    -------
    DtypeObj
        ``self.block.dtype`` when ``self.needs_filling`` is false; otherwise
        that dtype passed through ``ensure_dtype_can_hold_na``.

    Raises
    ------
    AssertionError
        If the block's values have a void ("V") dtype kind.
    """
    block = self.block
    # NOTE(review): judging by the error message, a "V"-kind block stands in
    # for a missing block -- confirm against the code constructing the unit.
    if block.values.dtype.kind == "V":
        raise AssertionError("Block is None, no dtype")

    if self.needs_filling:
        return ensure_dtype_can_hold_na(block.dtype)
    return block.dtype

def _is_valid_na_for(self, dtype: DtypeObj) -> bool:
"""
Check that we are all-NA of a type/dtype that is compatible with this dtype.
Augments `self.is_na` with an additional check of the type of NA values.
"""
if not self.is_na:
return False
if self.block.dtype.kind == "V":

blk = self.block
if blk.dtype.kind == "V":
return True

if self.dtype == object:
values = self.block.values
if blk.dtype == object:
values = blk.values
return all(is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K"))

na_value = self.block.fill_value
if na_value is NaT and not is_dtype_equal(self.dtype, dtype):
na_value = blk.fill_value
if na_value is NaT and not is_dtype_equal(blk.dtype, dtype):
# e.g. we are dt64 and other is td64
# fill_values match but we should not cast self.block.values to dtype
# fill_values match but we should not cast blk.values to dtype
# TODO: this will need updating if we ever have non-nano dt64/td64
return False

if na_value is NA and needs_i8_conversion(dtype):
# FIXME: kludge; test_append_empty_frame_with_timedelta64ns_nat
# e.g. self.dtype == "Int64" and dtype is td64, we dont want
# e.g. blk.dtype == "Int64" and dtype is td64, we dont want
# to consider these as matching
return False

Expand Down Expand Up @@ -663,9 +649,11 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:

has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)

dtypes = [unit.dtype for unit in join_units if not unit.is_na]
dtypes = [unit.block.dtype for unit in join_units if not unit.is_na]
if not len(dtypes):
dtypes = [unit.dtype for unit in join_units if unit.block.dtype.kind != "V"]
dtypes = [
unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V"
]

dtype = find_common_type(dtypes)
if has_none_blocks:
Expand Down Expand Up @@ -712,7 +700,7 @@ def _is_uniform_reindex(join_units) -> bool:
return (
# TODO: should this be ju.block._can_hold_na?
all(ju.block.is_extension for ju in join_units)
and len({ju.block.dtype.name for ju in join_units}) == 1
and lib.dtypes_all_equal([ju.block.dtype for ju in join_units])
)


Expand Down