From 75b93454efbe2c8678bdd1d5021c4b82b572b793 Mon Sep 17 00:00:00 2001
From: sinhrks <sinhrks@gmail.com>
Date: Sat, 5 Jul 2014 19:15:22 +0900
Subject: [PATCH] PERF: improve resample perf

---
 pandas/core/base.py         | 43 ++++++++++++++++++-------------------
 pandas/lib.pyx              | 35 ++++++++++++++++++------------
 pandas/src/generate_code.py |  2 +-
 pandas/tseries/offsets.py   | 18 ++++++++++++++--
 pandas/tseries/resample.py  |  8 +++----
 pandas/tslib.pyx            | 23 +++++++++++++-------
 6 files changed, 78 insertions(+), 51 deletions(-)

diff --git a/pandas/core/base.py b/pandas/core/base.py
index b06b0856d5909..1ba5061cd7e9a 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -4,6 +4,9 @@
 from pandas import compat
 import numpy as np
 from pandas.core import common as com
+import pandas.core.nanops as nanops
+import pandas.tslib as tslib
+
 
 class StringMixin(object):
 
@@ -236,13 +239,11 @@ def _wrap_access_object(self, obj):
 
     def max(self):
         """ The maximum value of the object """
-        import pandas.core.nanops
-        return pandas.core.nanops.nanmax(self.values)
+        return nanops.nanmax(self.values)
 
     def min(self):
         """ The minimum value of the object """
-        import pandas.core.nanops
-        return pandas.core.nanops.nanmin(self.values)
+        return nanops.nanmin(self.values)
 
     def value_counts(self, normalize=False, sort=True, ascending=False,
                      bins=None, dropna=True):
@@ -406,31 +407,29 @@ def min(self, axis=None):
         """
         Overridden ndarray.min to return an object
         """
-        import pandas.tslib as tslib
-        mask = self.asi8 == tslib.iNaT
-        masked = self[~mask]
-        if len(masked) == 0:
-            return self._na_value
-        elif self.is_monotonic:
-            return masked[0]
-        else:
-            min_stamp = masked.asi8.min()
+        try:
+            i8 = self.asi8  # integer (i8) values, fetched once
+            mask = i8 == tslib.iNaT
+            if mask.any():
+                i8 = i8[~mask]  # drop NaT on the raw array; no temporary index
+            min_stamp = i8.min()  # raises ValueError when nothing is left
             return self._box_func(min_stamp)
+        except ValueError:
+            return self._na_value
 
     def max(self, axis=None):
         """
         Overridden ndarray.max to return an object
         """
-        import pandas.tslib as tslib
-        mask = self.asi8 == tslib.iNaT
-        masked = self[~mask]
-        if len(masked) == 0:
-            return self._na_value
-        elif self.is_monotonic:
-            return masked[-1]
-        else:
-            max_stamp = masked.asi8.max()
+        try:
+            i8 = self.asi8  # integer (i8) values, fetched once
+            mask = i8 == tslib.iNaT
+            if mask.any():
+                i8 = i8[~mask]  # drop NaT on the raw array; no temporary index
+            max_stamp = i8.max()  # raises ValueError when nothing is left
             return self._box_func(max_stamp)
+        except ValueError:
+            return self._na_value
 
     @property
     def _formatter_func(self):
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
index 89e681e6f1c90..a064e714e7f89 100644
--- a/pandas/lib.pyx
+++ b/pandas/lib.pyx
@@ -965,12 +965,14 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
     cdef:
         Py_ssize_t lenidx, lenbin, i, j, bc, vc
         ndarray[int64_t] bins
-        int64_t l_bin, r_bin
+        int64_t l_bin, r_bin, nat_count  # nat_count: number of NaT entries in values
         bint right_closed = closed == 'right'
 
     mask = values == iNaT
-    nat_count = values[mask].size
-    values = values[~mask]
+    nat_count = 0
+    if mask.any():  # skip the filtering copy when there is no NaT
+        nat_count = np.sum(mask)  # mask is True exactly where values == iNaT
+        values = values[~mask]
 
     lenidx = len(values)
     lenbin = len(binner)
@@ -991,17 +993,22 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
     bc = 0 # bin count
 
     # linear scan
-    for i in range(0, lenbin - 1):
-        l_bin = binner[i]
-        r_bin = binner[i+1]
-
-        # count values in current bin, advance to next bin
-        while j < lenidx and (values[j] < r_bin or
-                              (right_closed and values[j] == r_bin)):
-            j += 1
-
-        bins[bc] = j
-        bc += 1
+    if right_closed:  # closed side tested once here instead of once per value
+        for i in range(0, lenbin - 1):
+            r_bin = binner[i+1]
+            # count values in current bin, advance to next bin
+            while j < lenidx and values[j] <= r_bin:  # right edge is inclusive
+                j += 1
+            bins[bc] = j  # position in values where this bin ends
+            bc += 1
+    else:
+        for i in range(0, lenbin - 1):
+            r_bin = binner[i+1]
+            # count values in current bin, advance to next bin
+            while j < lenidx and values[j] < r_bin:  # right edge is exclusive
+                j += 1
+            bins[bc] = j  # position in values where this bin ends
+            bc += 1
 
     if nat_count > 0:
         # shift bins by the number of NaT
diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py
index 4098ac06c2da2..842be5a1645bf 100644
--- a/pandas/src/generate_code.py
+++ b/pandas/src/generate_code.py
@@ -1584,7 +1584,7 @@ def group_mean_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
     for i in range(ngroups):
         for j in range(K):
             count = nobs[i, j]
-            if nobs[i, j] == 0:
+            if count == 0:
                 out[i, j] = nan
             else:
                 out[i, j] = sumx[i, j] / count
diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py
index bcb68ded6fda7..d1fe287bf33be 100644
--- a/pandas/tseries/offsets.py
+++ b/pandas/tseries/offsets.py
@@ -27,6 +27,8 @@
 # convert to/from datetime/timestamp to allow invalid Timestamp ranges to pass thru
 def as_timestamp(obj):
     try:
+        if isinstance(obj, Timestamp):
+            return obj
         return Timestamp(obj)
     except (OutOfBoundsDatetime):
         pass
@@ -2014,9 +2016,21 @@ def delta(self):
     def nanos(self):
         return _delta_to_nanoseconds(self.delta)
 
-    @apply_wraps
     def apply(self, other):
-        if isinstance(other, (datetime, timedelta)):
+        # Timestamp handles tz and nanoseconds itself, so apply_wraps is not needed
+        if type(other) is date:  # exact match on purpose: datetime subclasses date
+            other = datetime(other.year, other.month, other.day)
+        elif isinstance(other, (np.datetime64, datetime)):
+            other = as_timestamp(other)
+
+        if isinstance(other, datetime):
+            result = other + self.delta
+            if self.normalize:
+                # normalize_date returns a plain datetime; it is re-wrapped below
+                result = tslib.normalize_date(result)
+            return as_timestamp(result)
+
+        elif isinstance(other, timedelta):
             return other + self.delta
         elif isinstance(other, type(self)):
             return type(self)(self.n + other.n)
diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py
index 059a6bfd06719..1ee7664f7bb9a 100644
--- a/pandas/tseries/resample.py
+++ b/pandas/tseries/resample.py
@@ -152,7 +152,8 @@ def _get_time_bins(self, ax):
             binner = labels = DatetimeIndex(data=[], freq=self.freq, name=ax.name)
             return binner, [], labels
 
-        first, last = _get_range_edges(ax, self.freq, closed=self.closed,
+        ax_min, ax_max = ax.min(), ax.max()  # raw extremes, computed once
+        first, last = _get_range_edges(ax_min, ax_max, self.freq, closed=self.closed,
                                        base=self.base)
         tz = ax.tz
         binner = labels = DatetimeIndex(freq=self.freq,
@@ -163,7 +164,7 @@ def _get_time_bins(self, ax):
 
         # a little hack
         trimmed = False
-        if (len(binner) > 2 and binner[-2] == ax.max() and
+        if (len(binner) > 2 and binner[-2] == ax_max and  # raw max, not the adjusted edge
                 self.closed == 'right'):
 
             binner = binner[:-1]
@@ -353,11 +354,10 @@ def _take_new_index(obj, indexer, new_index, axis=0):
         raise NotImplementedError
 
 
-def _get_range_edges(axis, offset, closed='left', base=0):
+def _get_range_edges(first, last, offset, closed='left', base=0):
     if isinstance(offset, compat.string_types):
         offset = to_offset(offset)
 
-    first, last = axis.min(), axis.max()
     if isinstance(offset, Tick):
         day_nanos = _delta_to_nanoseconds(timedelta(1))
         # #1165
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
index 090b49bde68a6..70b6b308b6b37 100644
--- a/pandas/tslib.pyx
+++ b/pandas/tslib.pyx
@@ -3134,14 +3134,21 @@ def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end):
     else:
         relation = START
 
-    for i in range(n):
-        if arr[i] == iNaT:
-            result[i] = iNaT
-            continue
-        val = func(arr[i], relation, &finfo)
-        if val == INT32_MIN:
-            raise ValueError("Unable to convert to desired frequency.")
-        result[i] = val
+    mask = arr == iNaT
+    if mask.any():      # NaT present: test each element, copy NaT through unconverted
+        for i in range(n):
+            val = arr[i]
+            if val != iNaT:
+                val = func(val, relation, &finfo)
+                if val == INT32_MIN:  # INT32_MIN from func is treated as failure
+                    raise ValueError("Unable to convert to desired frequency.")
+            result[i] = val
+    else:  # no NaT: convert every element without the per-element NaT test
+        for i in range(n):
+            val = func(arr[i], relation, &finfo)
+            if val == INT32_MIN:  # INT32_MIN from func is treated as failure
+                raise ValueError("Unable to convert to desired frequency.")
+            result[i] = val
 
     return result