From a7cab81d6de23cf0bfd2176dd063df0b510b5220 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 6 Mar 2013 19:56:13 -0500 Subject: [PATCH] BUG: Bug in idxmin/idxmax of ``datetime64[ns]`` Series with ``NaT`` (GH2982_) refactor in core/nanops.py to generically handle ops with datetime64[ns]/timedelta64[ns] Series.argsort to always return int64 dtype (and ignore numpy dtype on ints) minor doc update in io.rst (pytables) --- RELEASE.rst | 2 + doc/source/io.rst | 2 +- pandas/core/common.py | 43 +++++++++---- pandas/core/nanops.py | 118 +++++++++++++++++++----------------- pandas/core/series.py | 4 +- pandas/tests/test_series.py | 30 +++++++-- 6 files changed, 124 insertions(+), 75 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 8724dd4f7daa6..cf3fd598a8186 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -126,6 +126,7 @@ pandas 0.11.0 - Bug on in-place putmasking on an ``integer`` series that needs to be converted to ``float`` (GH2746_) - Bug in argsort of ``datetime64[ns]`` Series with ``NaT`` (GH2967_) + - Bug in idxmin/idxmax of ``datetime64[ns]`` Series with ``NaT`` (GH2982__) .. _GH622: https://github.com/pydata/pandas/issues/622 .. _GH797: https://github.com/pydata/pandas/issues/797 @@ -147,6 +148,7 @@ pandas 0.11.0 .. _GH2931: https://github.com/pydata/pandas/issues/2931 .. _GH2973: https://github.com/pydata/pandas/issues/2973 .. _GH2967: https://github.com/pydata/pandas/issues/2967 +.. _GH2982: https://github.com/pydata/pandas/issues/2982 pandas 0.10.1 diff --git a/doc/source/io.rst b/doc/source/io.rst index ebd6f5c77c658..86d590965f141 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1186,7 +1186,7 @@ A query is specified using the ``Term`` class under the hood. Valid terms can be created from ``dict, list, tuple, or string``. Objects can be embeded as values. Allowed operations are: ``<, -<=, >, >=, =``. ``=`` will be inferred as an implicit set operation +<=, >, >=, =, !=``. ``=`` will be inferred as an implicit set operation (e.g. if 2 or more values are provided). The following are all valid terms. diff --git a/pandas/core/common.py b/pandas/core/common.py index 03d43250f9265..97ab861d6a3a7 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -682,19 +682,30 @@ def _infer_dtype_from_scalar(val): def _maybe_promote(dtype, fill_value=np.nan): + + # if we passed an array here, determine the fill value by dtype + if isinstance(fill_value,np.ndarray): + if issubclass(fill_value.dtype.type, (np.datetime64,np.timedelta64)): + fill_value = tslib.iNaT + else: + fill_value = np.nan + # returns tuple of (dtype, fill_value) - if issubclass(dtype.type, np.datetime64): + if issubclass(dtype.type, (np.datetime64,np.timedelta64)): # for now: refuse to upcast datetime64 # (this is because datetime64 will not implicitly upconvert # to object correctly as of numpy 1.6.1) if isnull(fill_value): fill_value = tslib.iNaT else: - try: - fill_value = lib.Timestamp(fill_value).value - except: - # the proper thing to do here would probably be to upcast to - # object (but numpy 1.6.1 doesn't do this properly) + if issubclass(dtype.type, np.datetime64): + try: + fill_value = lib.Timestamp(fill_value).value + except: + # the proper thing to do here would probably be to upcast to + # object (but numpy 1.6.1 doesn't do this properly) + fill_value = tslib.iNaT + else: fill_value = tslib.iNaT elif is_float(fill_value): if issubclass(dtype.type, np.bool_): @@ -722,7 +733,7 @@ def _maybe_promote(dtype, fill_value=np.nan): return dtype, fill_value -def _maybe_upcast_putmask(result, mask, other): +def _maybe_upcast_putmask(result, mask, other, dtype=None): """ a safe version of put mask that (potentially upcasts the result return the result and a changed flag """ try: @@ -730,16 +741,25 @@ def _maybe_upcast_putmask(result, mask, other): except: # our type is wrong here, need to upcast if (-mask).any(): - result, fill_value = _maybe_upcast(result, copy=True) + result, fill_value = _maybe_upcast(result, fill_value=other, dtype=dtype, copy=True) np.putmask(result, mask, other) return result, True return result, False -def _maybe_upcast(values, fill_value=np.nan, copy=False): +def _maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False): """ provide explicty type promotion and coercion - if copy == True, then a copy is created even if no upcast is required """ - new_dtype, fill_value = _maybe_promote(values.dtype, fill_value) + + Parameters + ---------- + values : the ndarray that we want to maybe upcast + fill_value : what we want to fill with + dtype : if None, then use the dtype of the values, else coerce to this type + copy : if True always make a copy even if no upcast is required """ + + if dtype is None: + dtype = values.dtype + new_dtype, fill_value = _maybe_promote(dtype, fill_value) if new_dtype != values.dtype: values = values.astype(new_dtype) elif copy: @@ -915,7 +935,6 @@ def _possibly_convert_platform(values): return values - def _possibly_cast_to_timedelta(value, coerce=True): """ try to cast to timedelta64 w/o coercion """ diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 881cef2311b27..93f06aae2b1b7 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -8,6 +8,7 @@ import pandas.lib as lib import pandas.algos as algos import pandas.hashtable as _hash +import pandas.tslib as tslib try: import bottleneck as bn @@ -69,8 +70,61 @@ def _has_infs(result): else: return np.isinf(result) or np.isneginf(result) +def _get_fill_value(dtype, fill_value=None, fill_value_typ=None): + """ return the correct fill value for the dtype of the values """ + if fill_value is not None: + return fill_value + if _na_ok_dtype(dtype): + if fill_value_typ is None: + return np.nan + else: + if fill_value_typ == '+inf': + return np.inf + else: + return -np.inf + else: + if fill_value_typ is None: + return tslib.iNaT + else: + if fill_value_typ == '+inf': + # need the max int here + return np.iinfo(np.int64).max + else: + return tslib.iNaT + +def _get_values(values, skipna, fill_value=None, fill_value_typ=None, isfinite=False, copy=True): + """ utility to get the values view, mask, dtype + if necessary copy and mask using the specified fill_value + copy = True will force the copy """ + if isfinite: + mask = _isfinite(values) + else: + mask = isnull(values) + + dtype = values.dtype + dtype_ok = _na_ok_dtype(dtype) + + # get our fill value (in case we need to provide an alternative dtype for it) + fill_value = _get_fill_value(dtype, fill_value=fill_value, fill_value_typ=fill_value_typ) + + if skipna: + if copy: + values = values.copy() + if dtype_ok: + np.putmask(values, mask, fill_value) + + # promote if needed + else: + values, changed = com._maybe_upcast_putmask(values, mask, fill_value) + + elif copy: + values = values.copy() + + values = _view_if_needed(values) + return values, mask, dtype + def _isfinite(values): - if issubclass(values.dtype.type, np.timedelta64): + if issubclass(values.dtype.type, (np.timedelta64,np.datetime64)): return isnull(values) return -np.isfinite(values) @@ -99,43 +153,21 @@ def _wrap_results(result,dtype): return result def nanany(values, axis=None, skipna=True): - mask = isnull(values) - - if skipna: - values = values.copy() - np.putmask(values, mask, False) + values, mask, dtype = _get_values(values, skipna, False, copy=skipna) return values.any(axis) - def nanall(values, axis=None, skipna=True): - mask = isnull(values) - - if skipna: - values = values.copy() - np.putmask(values, mask, True) + values, mask, dtype = _get_values(values, skipna, True, copy=skipna) return values.all(axis) - def _nansum(values, axis=None, skipna=True): - mask = isnull(values) - - if skipna and not issubclass(values.dtype.type, np.integer): - values = values.copy() - np.putmask(values, mask, 0) - + values, mask, dtype = _get_values(values, skipna, 0) the_sum = values.sum(axis) the_sum = _maybe_null_out(the_sum, axis, mask) - return the_sum - def _nanmean(values, axis=None, skipna=True): - mask = isnull(values) - - if skipna and not issubclass(values.dtype.type, np.integer): - values = values.copy() - np.putmask(values, mask, 0) - + values, mask, dtype = _get_values(values, skipna, 0) the_sum = _ensure_numeric(values.sum(axis)) count = _get_counts(mask, axis) @@ -186,15 +218,7 @@ def _nanvar(values, axis=None, skipna=True, ddof=1): def _nanmin(values, axis=None, skipna=True): - mask = isnull(values) - - dtype = values.dtype - - if skipna and _na_ok_dtype(dtype): - values = values.copy() - np.putmask(values, mask, np.inf) - - values = _view_if_needed(values) + values, mask, dtype = _get_values(values, skipna, fill_value_typ = '+inf') # numpy 1.6.1 workaround in Python 3.x if (values.dtype == np.object_ @@ -218,15 +242,7 @@ def _nanmin(values, axis=None, skipna=True): def _nanmax(values, axis=None, skipna=True): - mask = isnull(values) - - dtype = values.dtype - - if skipna and _na_ok_dtype(dtype): - values = values.copy() - np.putmask(values, mask, -np.inf) - - values = _view_if_needed(values) + values, mask, dtype = _get_values(values, skipna, fill_value_typ ='-inf') # numpy 1.6.1 workaround in Python 3.x if (values.dtype == np.object_ @@ -254,11 +270,7 @@ def nanargmax(values, axis=None, skipna=True): """ Returns -1 in the NA case """ - mask = _isfinite(values) - values = _view_if_needed(values) - if not issubclass(values.dtype.type, np.integer): - values = values.copy() - np.putmask(values, mask, -np.inf) + values, mask, dtype = _get_values(values, skipna, fill_value_typ = '-inf', isfinite=True) result = values.argmax(axis) result = _maybe_arg_null_out(result, axis, mask, skipna) return result @@ -268,11 +280,7 @@ def nanargmin(values, axis=None, skipna=True): """ Returns -1 in the NA case """ - mask = _isfinite(values) - values = _view_if_needed(values) - if not issubclass(values.dtype.type, np.integer): - values = values.copy() - np.putmask(values, mask, np.inf) + values, mask, dtype = _get_values(values, skipna, fill_value_typ = '+inf', isfinite=True) result = values.argmin(axis) result = _maybe_arg_null_out(result, axis, mask, skipna) return result diff --git a/pandas/core/series.py b/pandas/core/series.py index 8eceb7798d31d..b349dd65ff82d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2112,13 +2112,13 @@ def argsort(self, axis=0, kind='quicksort', order=None): mask = isnull(values) if mask.any(): - result = Series(-1,index=self.index,name=self.name) + result = Series(-1,index=self.index,name=self.name,dtype='int64') notmask = -mask result.values[notmask] = np.argsort(self.values[notmask], kind=kind) return result else: return Series(np.argsort(values, kind=kind), index=self.index, - name=self.name) + name=self.name,dtype='int64') def rank(self, method='average', na_option='keep', ascending=True): """ diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 312789e10c252..ee288fda120d3 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -1816,14 +1816,15 @@ def test_timedelta64_functions(self): result = td.idxmax() self.assert_(result == 2) - # with NaT (broken) + # GH 2982 + # with NaT td[0] = np.nan - #result = td.idxmin() - #self.assert_(result == 1) + result = td.idxmin() + self.assert_(result == 1) - #result = td.idxmax() - #self.assert_(result == 2) + result = td.idxmax() + self.assert_(result == 2) # abs s1 = Series(date_range('20120101',periods=3)) @@ -2065,6 +2066,16 @@ def test_idxmin(self): allna = self.series * nan self.assert_(isnull(allna.idxmin())) + # datetime64[ns] + from pandas import date_range + s = Series(date_range('20130102',periods=6)) + result = s.idxmin() + self.assert_(result == 0) + + s[0] = np.nan + result = s.idxmin() + self.assert_(result == 1) + def test_idxmax(self): # test idxmax # _check_stat_op approach can not be used here because of isnull check. @@ -2086,6 +2097,15 @@ def test_idxmax(self): allna = self.series * nan self.assert_(isnull(allna.idxmax())) + from pandas import date_range + s = Series(date_range('20130102',periods=6)) + result = s.idxmax() + self.assert_(result == 5) + + s[5] = np.nan + result = s.idxmax() + self.assert_(result == 4) + def test_operators_corner(self): series = self.ts