From cb30e97324f298e67f3ace432c94d43d4645ecd7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 6 Aug 2019 18:33:36 -0700 Subject: [PATCH 01/10] passing --- pandas/_libs/index.pyx | 20 ++++++++++---------- pandas/core/internals/blocks.py | 25 +++++++++++++++++++------ 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 7424c4ddc3d92..97fe928093833 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -528,30 +528,30 @@ cpdef convert_scalar(ndarray arr, object value): if util.is_array(value): pass elif isinstance(value, (datetime, np.datetime64, date)): - return Timestamp(value).value + return Timestamp(value).to_datetime64() elif util.is_timedelta64_object(value): # exclude np.timedelta64("NaT") from value != value below pass elif value is None or value != value: - return NPY_NAT - elif isinstance(value, str): - return Timestamp(value).value - raise ValueError("cannot set a Timestamp with a non-timestamp") + return np.datetime64("NaT", "ns") + raise ValueError("cannot set a Timestamp with a non-timestamp {typ}" + .format(typ=type(value).__name__)) elif arr.descr.type_num == NPY_TIMEDELTA: if util.is_array(value): pass elif isinstance(value, timedelta) or util.is_timedelta64_object(value): - return Timedelta(value).value + if value != value: + return np.timedelta64("NaT", "ns") + return Timedelta(value).to_timedelta64() elif util.is_datetime64_object(value): # exclude np.datetime64("NaT") which would otherwise be picked up # by the `value != value check below pass elif value is None or value != value: - return NPY_NAT - elif isinstance(value, str): - return Timedelta(value).value - raise ValueError("cannot set a Timedelta with a non-timedelta") + return np.timedelta64("NaT", "ns") + raise ValueError("cannot set a Timedelta with a non-timedelta {typ}" + .format(typ=type(value).__name__)) if (issubclass(arr.dtype.type, (np.integer, np.floating, np.complex)) and not issubclass(arr.dtype.type, np.bool_)): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 8c3cf7cc51495..de4b9f44d6b2e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -8,6 +8,7 @@ import numpy as np from pandas._libs import NaT, Timestamp, lib, tslib, tslibs +from pandas._libs.index import convert_scalar import pandas._libs.internals as libinternals from pandas._libs.tslibs import Timedelta, conversion from pandas._libs.tslibs.timezones import tz_compare @@ -415,7 +416,7 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): return self.copy() if self._can_hold_element(value): - # equivalent: self._try_coerce_args(value) would not raise + # equivalent: _try_coerce_args(value) would not raise blocks = self.putmask(mask, value, inplace=inplace) return self._maybe_downcast(blocks, downcast) @@ -765,7 +766,11 @@ def replace( ) values = self.values - to_replace = self._try_coerce_args(to_replace) + if lib.is_scalar(to_replace) and isinstance(values, np.ndarray): + # The only non-DatetimeLike class that also has a non-trivial + # try_coerce_args is ObjectBlock, but that overrides replace, + # so does not get here. + to_replace = convert_scalar(values, to_replace) mask = missing.mask_missing(values, to_replace) if filter is not None: @@ -827,7 +832,10 @@ def setitem(self, indexer, value): # coerce if block dtype can store value values = self.values if self._can_hold_element(value): - value = self._try_coerce_args(value) + # We only get here for non-Extension Blocks, so _try_coerce_args + # is only relevant for DatetimeBlock and TimedeltaBlock + if lib.is_scalar(value): + value = convert_scalar(values, value) # can keep its own dtype if hasattr(value, "dtype") and is_dtype_equal(values.dtype, value.dtype): @@ -935,7 +943,10 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False) new = self.fill_value if self._can_hold_element(new): - new = self._try_coerce_args(new) + # We only get here for non-Extension Blocks, so _try_coerce_args + # is only relevant for DatetimeBlock and TimedeltaBlock + if lib.is_scalar(new): + new = convert_scalar(new_values, new) if transpose: new_values = new_values.T @@ -1173,7 +1184,10 @@ def _interpolate_with_fill( return [self.copy()] values = self.values if inplace else self.values.copy() - fill_value = self._try_coerce_args(fill_value) + + # We only get here for non-ExtensionBlock + fill_value = convert_scalar(self.values, fill_value) + values = missing.interpolate_2d( values, method=method, @@ -1650,7 +1664,6 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False) # use block's copy logic. # .values may be an Index which does shallow copy by default new_values = self.values if inplace else self.copy().values - new = self._try_coerce_args(new) if isinstance(new, np.ndarray) and len(new) == len(mask): new = new[mask] From ee82bc9508e4209c049c8c2b53cf1450d4957bd2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 6 Aug 2019 19:39:45 -0700 Subject: [PATCH 02/10] remove try_coerce_arg --- pandas/core/internals/blocks.py | 137 +---------------------- pandas/tests/internals/test_internals.py | 9 +- 2 files changed, 9 insertions(+), 137 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index de4b9f44d6b2e..b41e566e2fb00 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1,4 +1,4 @@ -from datetime import date, datetime, timedelta +from datetime import datetime, timedelta import functools import inspect import re @@ -7,7 +7,7 @@ import numpy as np -from pandas._libs import NaT, Timestamp, lib, tslib, tslibs +from pandas._libs import NaT, lib, tslib from pandas._libs.index import convert_scalar import pandas._libs.internals as libinternals from pandas._libs.tslibs import Timedelta, conversion @@ -55,7 +55,6 @@ from pandas.core.dtypes.dtypes import CategoricalDtype, ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, - ABCDatetimeIndex, ABCExtensionArray, ABCPandasArray, ABCSeries, @@ -65,7 +64,6 @@ array_equivalent, is_valid_nat_for_dtype, isna, - notna, ) import pandas.core.algorithms as algos @@ -686,28 +684,6 @@ def _can_hold_element(self, element): return issubclass(tipo.type, dtype) return isinstance(element, dtype) - def _try_coerce_args(self, other): - """ provide coercion to our input arguments """ - - if np.any(notna(other)) and not self._can_hold_element(other): - # coercion issues - # let higher levels handle - raise TypeError( - "cannot convert {} to an {}".format( - type(other).__name__, - type(self).__name__.lower().replace("Block", ""), - ) - ) - if np.any(isna(other)) and not self._can_hold_na: - raise TypeError( - "cannot convert {} to an {}".format( - type(other).__name__, - type(self).__name__.lower().replace("Block", ""), - ) - ) - - return other - def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ @@ -1386,7 +1362,10 @@ def func(cond, values, other): and np.isnan(other) ): # np.where will cast integer array to floats in this case - other = self._try_coerce_args(other) + if not self._can_hold_element(other): + raise TypeError + if lib.is_scalar(other) and isinstance(values, np.ndarray): + other = convert_scalar(values, other) try: fastres = expressions.where(cond, values, other) @@ -2213,38 +2192,6 @@ def _can_hold_element(self, element): return is_valid_nat_for_dtype(element, self.dtype) - def _try_coerce_args(self, other): - """ - Coerce other to dtype 'i8'. NaN and NaT convert to - the smallest i8, and will correctly round-trip to NaT if converted - back in _try_coerce_result. values is always ndarray-like, other - may not be - - Parameters - ---------- - other : ndarray-like or scalar - - Returns - ------- - base-type other - """ - if is_valid_nat_for_dtype(other, self.dtype): - other = np.datetime64("NaT", "ns") - elif isinstance(other, (datetime, np.datetime64, date)): - other = Timestamp(other) - if other.tz is not None: - raise TypeError("cannot coerce a Timestamp with a tz on a naive Block") - other = other.asm8 - elif hasattr(other, "dtype") and is_datetime64_dtype(other): - # TODO: can we get here with non-nano? - pass - else: - # coercion issues - # let higher levels handle - raise TypeError(other) - - return other - def to_native_types( self, slicer=None, na_rep=None, date_format=None, quoting=None, **kwargs ): @@ -2382,35 +2329,6 @@ def _slice(self, slicer): return self.values[loc] return self.values[slicer] - def _try_coerce_args(self, other): - """ - localize and return i8 for the values - - Parameters - ---------- - other : ndarray-like or scalar - - Returns - ------- - base-type other - """ - if is_valid_nat_for_dtype(other, self.dtype): - other = np.datetime64("NaT", "ns") - elif isinstance(other, self._holder): - if not tz_compare(other.tz, self.values.tz): - raise ValueError("incompatible or non tz-aware value") - - elif isinstance(other, (np.datetime64, datetime, date)): - other = tslibs.Timestamp(other) - - # test we can have an equal time zone - if not tz_compare(other.tz, self.values.tz): - raise ValueError("incompatible or non tz-aware value") - else: - raise TypeError(other) - - return other - def diff(self, n, axis=0): """1st discrete difference @@ -2545,34 +2463,6 @@ def fillna(self, value, **kwargs): value = Timedelta(value, unit="s") return super().fillna(value, **kwargs) - def _try_coerce_args(self, other): - """ - Coerce values and other to datetime64[ns], with null values - converted to datetime64("NaT", "ns"). - - Parameters - ---------- - other : ndarray-like or scalar - - Returns - ------- - base-type other - """ - - if is_valid_nat_for_dtype(other, self.dtype): - other = np.timedelta64("NaT", "ns") - elif isinstance(other, (timedelta, np.timedelta64)): - other = Timedelta(other).to_timedelta64() - elif hasattr(other, "dtype") and is_timedelta64_dtype(other): - # TODO: can we get here with non-nano dtype? - pass - else: - # coercion issues - # let higher levels handle - raise TypeError(other) - - return other - def should_store(self, value): return issubclass( value.dtype.type, np.timedelta64 @@ -2708,21 +2598,6 @@ def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"] def _can_hold_element(self, element): return True - def _try_coerce_args(self, other): - """ provide coercion to our input arguments """ - - if isinstance(other, ABCDatetimeIndex): - # May get a DatetimeIndex here. Unbox it. - other = other.array - - if isinstance(other, DatetimeArray): - # hit in pandas/tests/indexing/test_coercion.py - # ::TestWhereCoercion::test_where_series_datetime64[datetime64tz] - # when falling back to ObjectBlock.where - other = other.astype(object) - - return other - def should_store(self, value): return not ( issubclass( diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index f87d6dba72e68..f0a0959c16f60 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -327,19 +327,16 @@ def test_make_block_same_class(self): class TestDatetimeBlock: - def test_try_coerce_arg(self): + def test_can_hold_element(self): block = create_block("datetime", [0]) # coerce None - none_coerced = block._try_coerce_args(None) - assert pd.Timestamp(none_coerced) is pd.NaT + assert block._can_hold_element(None) # coerce different types of date bojects vals = (np.datetime64("2010-10-10"), datetime(2010, 10, 10), date(2010, 10, 10)) for val in vals: - coerced = block._try_coerce_args(val) - assert np.datetime64 == type(coerced) - assert pd.Timestamp("2010-10-10") == pd.Timestamp(coerced) + assert block._can_hold_element(val) class TestBlockManager: From 16774509881b272803e5ec260ea207efe74f8c4f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 6 Aug 2019 20:47:03 -0700 Subject: [PATCH 03/10] comment --- pandas/core/internals/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b41e566e2fb00..6bda6d747b149 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1046,7 +1046,7 @@ def coerce_to_target_dtype(self, other): mytz = getattr(self.dtype, "tz", None) othertz = getattr(dtype, "tz", None) - if str(mytz) != str(othertz): + if str(mytz) != str(othertz): # TODO: use tz_compare return self.astype(object) raise AssertionError( From 32050739d2ffe35a0e888a1bfe748c0d9593e6bd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 17 Oct 2019 20:33:49 -0700 Subject: [PATCH 04/10] update test --- pandas/tests/internals/test_internals.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 38b75a13ea63a..0d661bbd52806 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -333,11 +333,14 @@ def test_can_hold_element(self): # coerce None assert block._can_hold_element(None) - # coerce different types of date bojects - vals = (np.datetime64("2010-10-10"), datetime(2010, 10, 10), date(2010, 10, 10)) + # coerce different types of datetime objects + vals = [np.datetime64("2010-10-10"), datetime(2010, 10, 10)] for val in vals: assert block._can_hold_element(val) + val = date(2010, 10, 10) + assert not block._can_hold_element(val) + class TestBlockManager: def test_constructor_corner(self): From 750ad232fc93c06f0e45d306308e5f6d962fa870 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Oct 2019 19:18:56 -0700 Subject: [PATCH 05/10] TST: _can_hold_element is equivalent to __setitem__ being allowed --- pandas/tests/internals/test_internals.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 0d661bbd52806..ed1a321a3d7e6 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -330,16 +330,24 @@ class TestDatetimeBlock: def test_can_hold_element(self): block = create_block("datetime", [0]) + # We will check that block._can_hold_element iff arr.__setitem__ works + arr = pd.array(block.values.ravel()) + # coerce None assert block._can_hold_element(None) + arr[0] = None + assert arr[0] is pd.NaT # coerce different types of datetime objects vals = [np.datetime64("2010-10-10"), datetime(2010, 10, 10)] for val in vals: assert block._can_hold_element(val) + arr[0] = val val = date(2010, 10, 10) assert not block._can_hold_element(val) + with pytest.raises(TypeError): + arr[0] = val class TestBlockManager: From 86597ee2c80ee0ea74327c1e4f860219bb29776c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 21 Oct 2019 15:33:17 -0700 Subject: [PATCH 06/10] fix for some plats where apparently nat!=nat doesnt hold --- pandas/_libs/index.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 8c3a0ba03cdef..28e5bbe09957c 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -560,9 +560,10 @@ cpdef convert_scalar(ndarray arr, object value): if util.is_array(value): pass elif isinstance(value, timedelta) or util.is_timedelta64_object(value): - if value != value: + values = Timedelta(value) + if value is NaT: return np.timedelta64("NaT", "ns") - return Timedelta(value).to_timedelta64() + return value.to_timedelta64() elif util.is_datetime64_object(value): # exclude np.datetime64("NaT") which would otherwise be picked up # by the `value != value check below From a321f4be2781bdb1b230fbf165df85a17c315b03 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 21 Oct 2019 15:50:55 -0700 Subject: [PATCH 07/10] fixup missing import --- pandas/_libs/index.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 28e5bbe09957c..6109e868d597f 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -18,6 +18,7 @@ cnp.import_array() cimport pandas._libs.util as util from pandas._libs.tslibs.conversion cimport maybe_datetimelike_to_i8 +from pandas._libs.tslibs.nattype cimport c_NaT as NaT from pandas._libs.hashtable cimport HashTable From 92ce0fb3291f02025013c90eb2599a0e00f730b2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 21 Oct 2019 17:05:31 -0700 Subject: [PATCH 08/10] typo fixup --- pandas/_libs/index.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 6109e868d597f..255fd85531d14 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -561,7 +561,7 @@ cpdef convert_scalar(ndarray arr, object value): if util.is_array(value): pass elif isinstance(value, timedelta) or util.is_timedelta64_object(value): - values = Timedelta(value) + value = Timedelta(value) if value is NaT: return np.timedelta64("NaT", "ns") return value.to_timedelta64() From cee23e8d9b2f03c11906f512eae5977e3e4e1ea9 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 23 Oct 2019 11:52:49 -0700 Subject: [PATCH 09/10] Avert cast to ndarray --- pandas/core/internals/blocks.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 449cfdf0eafd3..6f1aa4570e224 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -794,7 +794,8 @@ def _replace_single(self, *args, **kwargs): return self if kwargs["inplace"] else self.copy() def setitem(self, indexer, value): - """Set the value inplace, returning a a maybe different typed block. + """ + Set the value inplace, returning a a maybe different typed block. Parameters ---------- @@ -846,7 +847,10 @@ def setitem(self, indexer, value): return b.setitem(indexer, value) # value must be storeable at this moment - arr_value = np.array(value) + if not is_extension_array_dtype(value): + arr_value = np.array(value) + else: + arr_value = value # cast the values to a type that can hold nan (if necessary) if not self._can_hold_element(value): From bc5c6b6ce190fe6b550885e21b8520b60b343553 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 23 Oct 2019 13:05:14 -0700 Subject: [PATCH 10/10] tighten check --- pandas/core/internals/blocks.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6f1aa4570e224..51108d9a5a573 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -847,10 +847,12 @@ def setitem(self, indexer, value): return b.setitem(indexer, value) # value must be storeable at this moment - if not is_extension_array_dtype(value): - arr_value = np.array(value) - else: + if is_extension_array_dtype(getattr(value, "dtype", None)): + # We need to be careful not to allow through strings that + # can be parsed to EADtypes arr_value = value + else: + arr_value = np.array(value) # cast the values to a type that can hold nan (if necessary) if not self._can_hold_element(value):