Skip to content

PERF: improve resample perf #7673

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 7, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 21 additions & 22 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
from pandas import compat
import numpy as np
from pandas.core import common as com
import pandas.core.nanops as nanops
import pandas.tslib as tslib


class StringMixin(object):

Expand Down Expand Up @@ -236,13 +239,11 @@ def _wrap_access_object(self, obj):

def max(self):
""" The maximum value of the object """
import pandas.core.nanops
return pandas.core.nanops.nanmax(self.values)
return nanops.nanmax(self.values)

def min(self):
""" The minimum value of the object """
import pandas.core.nanops
return pandas.core.nanops.nanmin(self.values)
return nanops.nanmin(self.values)

def value_counts(self, normalize=False, sort=True, ascending=False,
bins=None, dropna=True):
Expand Down Expand Up @@ -406,31 +407,29 @@ def min(self, axis=None):
"""
Overridden ndarray.min to return an object
"""
import pandas.tslib as tslib
mask = self.asi8 == tslib.iNaT
masked = self[~mask]
if len(masked) == 0:
return self._na_value
elif self.is_monotonic:
return masked[0]
else:
min_stamp = masked.asi8.min()
try:
mask = self.asi8 == tslib.iNaT
if mask.any():
min_stamp = self[~mask].asi8.min()
else:
min_stamp = self.asi8.min()
return self._box_func(min_stamp)
except ValueError:
return self._na_value

def max(self, axis=None):
"""
Overridden ndarray.max to return an object
"""
import pandas.tslib as tslib
mask = self.asi8 == tslib.iNaT
masked = self[~mask]
if len(masked) == 0:
return self._na_value
elif self.is_monotonic:
return masked[-1]
else:
max_stamp = masked.asi8.max()
try:
mask = self.asi8 == tslib.iNaT
if mask.any():
max_stamp = self[~mask].asi8.max()
else:
max_stamp = self.asi8.max()
return self._box_func(max_stamp)
except ValueError:
return self._na_value

@property
def _formatter_func(self):
Expand Down
35 changes: 21 additions & 14 deletions pandas/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -965,12 +965,14 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
cdef:
Py_ssize_t lenidx, lenbin, i, j, bc, vc
ndarray[int64_t] bins
int64_t l_bin, r_bin
int64_t l_bin, r_bin, nat_count
bint right_closed = closed == 'right'

mask = values == iNaT
nat_count = values[mask].size
values = values[~mask]
nat_count = 0
if mask.any():
nat_count = np.sum(mask)
values = values[~mask]

lenidx = len(values)
lenbin = len(binner)
Expand All @@ -991,17 +993,22 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
bc = 0 # bin count

# linear scan
for i in range(0, lenbin - 1):
l_bin = binner[i]
r_bin = binner[i+1]

# count values in current bin, advance to next bin
while j < lenidx and (values[j] < r_bin or
(right_closed and values[j] == r_bin)):
j += 1

bins[bc] = j
bc += 1
if right_closed:
for i in range(0, lenbin - 1):
r_bin = binner[i+1]
# count values in current bin, advance to next bin
while j < lenidx and values[j] <= r_bin:
j += 1
bins[bc] = j
bc += 1
else:
for i in range(0, lenbin - 1):
r_bin = binner[i+1]
# count values in current bin, advance to next bin
while j < lenidx and values[j] < r_bin:
j += 1
bins[bc] = j
bc += 1

if nat_count > 0:
# shift bins by the number of NaT
Expand Down
2 changes: 1 addition & 1 deletion pandas/src/generate_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -1584,7 +1584,7 @@ def group_mean_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
for i in range(ngroups):
for j in range(K):
count = nobs[i, j]
if nobs[i, j] == 0:
if count == 0:
out[i, j] = nan
else:
out[i, j] = sumx[i, j] / count
Expand Down
18 changes: 16 additions & 2 deletions pandas/tseries/offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
# convert to/from datetime/timestamp to allow invalid Timestamp ranges to pass through
def as_timestamp(obj):
try:
if isinstance(obj, Timestamp):
return obj
return Timestamp(obj)
except (OutOfBoundsDatetime):
pass
Expand Down Expand Up @@ -2014,9 +2016,21 @@ def delta(self):
def nanos(self):
return _delta_to_nanoseconds(self.delta)

@apply_wraps
def apply(self, other):
if isinstance(other, (datetime, timedelta)):
# Timestamp can handle tz and nanoseconds itself, so there is no need to use apply_wraps
if type(other) == date:
other = datetime(other.year, other.month, other.day)
elif isinstance(other, (np.datetime64, datetime)):
other = as_timestamp(other)

if isinstance(other, datetime):
result = other + self.delta
if self.normalize:
# normalize_date returns a plain datetime, so convert back to Timestamp below
result = tslib.normalize_date(result)
return as_timestamp(result)

elif isinstance(other, timedelta):
return other + self.delta
elif isinstance(other, type(self)):
return type(self)(self.n + other.n)
Expand Down
8 changes: 4 additions & 4 deletions pandas/tseries/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,8 @@ def _get_time_bins(self, ax):
binner = labels = DatetimeIndex(data=[], freq=self.freq, name=ax.name)
return binner, [], labels

first, last = _get_range_edges(ax, self.freq, closed=self.closed,
first, last = ax.min(), ax.max()
first, last = _get_range_edges(first, last, self.freq, closed=self.closed,
base=self.base)
tz = ax.tz
binner = labels = DatetimeIndex(freq=self.freq,
Expand All @@ -163,7 +164,7 @@ def _get_time_bins(self, ax):

# a little hack
trimmed = False
if (len(binner) > 2 and binner[-2] == ax.max() and
if (len(binner) > 2 and binner[-2] == last and
self.closed == 'right'):

binner = binner[:-1]
Expand Down Expand Up @@ -353,11 +354,10 @@ def _take_new_index(obj, indexer, new_index, axis=0):
raise NotImplementedError


def _get_range_edges(axis, offset, closed='left', base=0):
def _get_range_edges(first, last, offset, closed='left', base=0):
if isinstance(offset, compat.string_types):
offset = to_offset(offset)

first, last = axis.min(), axis.max()
if isinstance(offset, Tick):
day_nanos = _delta_to_nanoseconds(timedelta(1))
# #1165
Expand Down
23 changes: 15 additions & 8 deletions pandas/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3134,14 +3134,21 @@ def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end):
else:
relation = START

for i in range(n):
if arr[i] == iNaT:
result[i] = iNaT
continue
val = func(arr[i], relation, &finfo)
if val == INT32_MIN:
raise ValueError("Unable to convert to desired frequency.")
result[i] = val
mask = arr == iNaT
if mask.any(): # NaT process
for i in range(n):
val = arr[i]
if val != iNaT:
val = func(val, relation, &finfo)
if val == INT32_MIN:
raise ValueError("Unable to convert to desired frequency.")
result[i] = val
else:
for i in range(n):
val = func(arr[i], relation, &finfo)
if val == INT32_MIN:
raise ValueError("Unable to convert to desired frequency.")
result[i] = val

return result

Expand Down