From 75b93454efbe2c8678bdd1d5021c4b82b572b793 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 5 Jul 2014 19:15:22 +0900 Subject: [PATCH] PERF: improve resample perf --- pandas/core/base.py | 43 ++++++++++++++++++------------------- pandas/lib.pyx | 35 ++++++++++++++++++------------ pandas/src/generate_code.py | 2 +- pandas/tseries/offsets.py | 18 ++++++++++++++-- pandas/tseries/resample.py | 8 +++---- pandas/tslib.pyx | 23 +++++++++++++------- 6 files changed, 78 insertions(+), 51 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index b06b0856d5909..1ba5061cd7e9a 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -4,6 +4,9 @@ from pandas import compat import numpy as np from pandas.core import common as com +import pandas.core.nanops as nanops +import pandas.tslib as tslib + class StringMixin(object): @@ -236,13 +239,11 @@ def _wrap_access_object(self, obj): def max(self): """ The maximum value of the object """ - import pandas.core.nanops - return pandas.core.nanops.nanmax(self.values) + return nanops.nanmax(self.values) def min(self): """ The minimum value of the object """ - import pandas.core.nanops - return pandas.core.nanops.nanmin(self.values) + return nanops.nanmin(self.values) def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True): @@ -406,31 +407,29 @@ def min(self, axis=None): """ Overridden ndarray.min to return an object """ - import pandas.tslib as tslib - mask = self.asi8 == tslib.iNaT - masked = self[~mask] - if len(masked) == 0: - return self._na_value - elif self.is_monotonic: - return masked[0] - else: - min_stamp = masked.asi8.min() + try: + mask = self.asi8 == tslib.iNaT + if mask.any(): + min_stamp = self[~mask].asi8.min() + else: + min_stamp = self.asi8.min() return self._box_func(min_stamp) + except ValueError: + return self._na_value def max(self, axis=None): """ Overridden ndarray.max to return an object """ - import pandas.tslib as tslib - mask = self.asi8 == tslib.iNaT - masked = self[~mask] - if len(masked) == 0: - return self._na_value - elif self.is_monotonic: - return masked[-1] - else: - max_stamp = masked.asi8.max() + try: + mask = self.asi8 == tslib.iNaT + if mask.any(): + max_stamp = self[~mask].asi8.max() + else: + max_stamp = self.asi8.max() return self._box_func(max_stamp) + except ValueError: + return self._na_value @property def _formatter_func(self): diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 89e681e6f1c90..a064e714e7f89 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -965,12 +965,14 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, cdef: Py_ssize_t lenidx, lenbin, i, j, bc, vc ndarray[int64_t] bins - int64_t l_bin, r_bin + int64_t l_bin, r_bin, nat_count bint right_closed = closed == 'right' mask = values == iNaT - nat_count = values[mask].size - values = values[~mask] + nat_count = 0 + if mask.any(): + nat_count = np.sum(mask) + values = values[~mask] lenidx = len(values) lenbin = len(binner) @@ -991,17 +993,22 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, bc = 0 # bin count # linear scan - for i in range(0, lenbin - 1): - l_bin = binner[i] - r_bin = binner[i+1] - - # count values in current bin, advance to next bin - while j < lenidx and (values[j] < r_bin or - (right_closed and values[j] == r_bin)): - j += 1 - - bins[bc] = j - bc += 1 + if right_closed: + for i in range(0, lenbin - 1): + r_bin = binner[i+1] + # count values in current bin, advance to next bin + while j < lenidx and values[j] <= r_bin: + j += 1 + bins[bc] = j + bc += 1 + else: + for i in range(0, lenbin - 1): + r_bin = binner[i+1] + # count values in current bin, advance to next bin + while j < lenidx and values[j] < r_bin: + j += 1 + bins[bc] = j + bc += 1 if nat_count > 0: # shift bins by the number of NaT diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py index 4098ac06c2da2..842be5a1645bf 100644 --- a/pandas/src/generate_code.py +++ b/pandas/src/generate_code.py @@ -1584,7 +1584,7 @@ def group_mean_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, for i in range(ngroups): for j in range(K): count = nobs[i, j] - if nobs[i, j] == 0: + if count == 0: out[i, j] = nan else: out[i, j] = sumx[i, j] / count diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index bcb68ded6fda7..d1fe287bf33be 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -27,6 +27,8 @@ # convert to/from datetime/timestamp to allow invalid Timestamp ranges to pass thru def as_timestamp(obj): try: + if isinstance(obj, Timestamp): + return obj return Timestamp(obj) except (OutOfBoundsDatetime): pass @@ -2014,9 +2016,21 @@ def delta(self): def nanos(self): return _delta_to_nanoseconds(self.delta) - @apply_wraps def apply(self, other): - if isinstance(other, (datetime, timedelta)): + # Timestamp can handle tz and nano sec, thus no need to use apply_wraps + if type(other) == date: + other = datetime(other.year, other.month, other.day) + elif isinstance(other, (np.datetime64, datetime)): + other = as_timestamp(other) + + if isinstance(other, datetime): + result = other + self.delta + if self.normalize: + # normalize_date returns normal datetime + result = tslib.normalize_date(result) + return as_timestamp(result) + + elif isinstance(other, timedelta): return other + self.delta elif isinstance(other, type(self)): return type(self)(self.n + other.n) diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 059a6bfd06719..1ee7664f7bb9a 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -152,7 +152,8 @@ def _get_time_bins(self, ax): binner = labels = DatetimeIndex(data=[], freq=self.freq, name=ax.name) return binner, [], labels - first, last = _get_range_edges(ax, self.freq, closed=self.closed, + first, last = ax.min(), ax.max() + first, last = _get_range_edges(first, last, self.freq, closed=self.closed, base=self.base) tz = ax.tz binner = labels = DatetimeIndex(freq=self.freq, @@ -163,7 +164,7 @@ def _get_time_bins(self, ax): # a little hack trimmed = False - if (len(binner) > 2 and binner[-2] == ax.max() and + if (len(binner) > 2 and binner[-2] == last and self.closed == 'right'): binner = binner[:-1] @@ -353,11 +354,10 @@ def _take_new_index(obj, indexer, new_index, axis=0): raise NotImplementedError -def _get_range_edges(axis, offset, closed='left', base=0): +def _get_range_edges(first, last, offset, closed='left', base=0): if isinstance(offset, compat.string_types): offset = to_offset(offset) - first, last = axis.min(), axis.max() if isinstance(offset, Tick): day_nanos = _delta_to_nanoseconds(timedelta(1)) # #1165 diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 090b49bde68a6..70b6b308b6b37 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -3134,14 +3134,21 @@ def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): else: relation = START - for i in range(n): - if arr[i] == iNaT: - result[i] = iNaT - continue - val = func(arr[i], relation, &finfo) - if val == INT32_MIN: - raise ValueError("Unable to convert to desired frequency.") - result[i] = val + mask = arr == iNaT + if mask.any(): # NaT process + for i in range(n): + val = arr[i] + if val != iNaT: + val = func(val, relation, &finfo) + if val == INT32_MIN: + raise ValueError("Unable to convert to desired frequency.") + result[i] = val + else: + for i in range(n): + val = func(arr[i], relation, &finfo) + if val == INT32_MIN: + raise ValueError("Unable to convert to desired frequency.") + result[i] = val return result