-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
PERF: Improve performance in rolling.mean(engine="numba") #43612
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 20 commits
5e0b2cc
4f2d298
8ada0be
f201dbd
bf22d88
9ec1ef0
8132622
d9b39bd
9ddf423
68524fd
cc786dd
2d19aa0
705bb8c
3622700
0fa551a
675b5a1
a169423
2842199
46f1b6b
05341e3
f4f59c8
c4bd78c
8801cf9
ce6aafb
f0b38fd
ca0653b
09e1eb2
fd595c5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
from __future__ import annotations | ||
|
||
from typing import ( | ||
Any, | ||
Callable, | ||
) | ||
|
||
import numpy as np | ||
|
||
from pandas._typing import Scalar | ||
from pandas.compat._optional import import_optional_dependency | ||
|
||
from pandas.core.util.numba_ import ( | ||
NUMBA_FUNC_CACHE, | ||
get_jit_arguments, | ||
) | ||
|
||
|
||
def generate_shared_aggregator(
    func: Callable[..., Scalar],
    kwargs: dict[str, Any],
    engine_kwargs: dict[str, bool] | None,
    cache_key_str: str,
):
    """
    Build a Numba-jitted function that runs a 1D kernel over every column
    of a 2D array and stacks the per-column results.

    Parameters
    ----------
    func : function
        1D aggregation kernel, called as
        ``func(column, start, end, min_periods)`` for each column
    kwargs : dict
        **kwargs intended for the kernel; forwarded to ``get_jit_arguments``
        together with ``engine_kwargs``. Should be unused as not supported
        by Numba
    engine_kwargs : dict or None
        dictionary of arguments to be passed into numba.jit
    cache_key_str : str
        string identifying the compiled function, of the form
        <caller_type>_<aggregation_type> e.g. rolling_mean, groupby_mean

    Returns
    -------
    Numba function
        The cached function for ``(func, cache_key_str)`` when one exists in
        ``NUMBA_FUNC_CACHE``, otherwise a newly built column looper.
    """
    nopython, nogil, parallel = get_jit_arguments(engine_kwargs, kwargs)

    lookup_key = (func, cache_key_str)
    if lookup_key in NUMBA_FUNC_CACHE:
        return NUMBA_FUNC_CACHE[lookup_key]

    # numba is optional, so it is only imported once a function must be built
    numba = import_optional_dependency("numba")

    @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
    def column_looper(
        values: np.ndarray,
        start: np.ndarray,
        end: np.ndarray,
        min_periods: int,
    ):
        n_cols = values.shape[1]
        # one output row per window, one output column per input column
        result = np.empty((len(start), n_cols), dtype=np.float64)
        for col in numba.prange(n_cols):
            result[:, col] = func(values[:, col], start, end, min_periods)
        return result

    return column_looper
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from pandas.core._numba.kernels.mean_ import sliding_mean | ||
|
||
__all__ = ["sliding_mean"] |
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
@@ -0,0 +1,119 @@ | ||||
""" | ||||
Numba 1D aggregation kernels that can be shared by | ||||
* Dataframe / Series | ||||
* groupby | ||||
* rolling / expanding | ||||
|
||||
Mirrors pandas/_libs/window/aggregation.pyx | ||||
""" | ||||
from __future__ import annotations | ||||
|
||||
import numba | ||||
import numpy as np | ||||
|
||||
|
||||
@numba.jit(nopython=True, nogil=True, parallel=False)
def is_monotonic_increasing(bounds: np.ndarray) -> bool:
    """
    Check whether ``bounds`` is monotonically increasing (non-decreasing).

    Parameters
    ----------
    bounds : np.ndarray
        1D array of window boundaries. Callers in this module pass the
        integer ``start`` / ``end`` index arrays, so no NaN handling is done.

    Returns
    -------
    bool
        True when every element is >= its predecessor. Arrays with fewer
        than two elements are trivially monotonic.
    """
    n = len(bounds)
    if n < 2:
        return True
    prev = bounds[0]
    for i in range(1, n):
        cur = bounds[i]
        if cur < prev:
            return False
        # advance the reference; otherwise every element would only be
        # compared against bounds[0] and e.g. [0, 5, 3] would pass
        prev = cur
    return True
|
||||
|
||||
@numba.jit(nopython=True, nogil=True, parallel=False)
def add_mean(
    val: float, nobs: float, sum_x: float, neg_ct: int, compensation: float
) -> tuple[float, float, int, float]:
    """
    Fold one observation into the running mean state.

    NaN observations leave the state untouched. Otherwise the observation
    count is incremented, ``val`` is added to ``sum_x`` using compensated
    (Kahan) summation, and ``neg_ct`` is incremented when ``val`` is negative.

    Returns
    -------
    tuple
        Updated ``(nobs, sum_x, neg_ct, compensation)``.
    """
    if np.isnan(val):
        return nobs, sum_x, neg_ct, compensation

    # Kahan step: apply the previously lost low-order bits, then record
    # what is lost by this addition
    adjusted = val - compensation
    total = sum_x + adjusted
    compensation = total - sum_x - adjusted
    if val < 0:
        neg_ct += 1
    return nobs + 1, total, neg_ct, compensation
|
||||
|
||||
@numba.jit(nopython=True, nogil=True, parallel=False)
def remove_mean(
    val: float, nobs: float, sum_x: float, neg_ct: int, compensation: float
) -> tuple[float, float, int, float]:
    """
    Drop one observation from the running mean state.

    NaN observations leave the state untouched. Otherwise the observation
    count is decremented, ``val`` is subtracted from ``sum_x`` using
    compensated (Kahan) summation, and ``neg_ct`` is decremented when ``val``
    is negative.

    Returns
    -------
    tuple
        Updated ``(nobs, sum_x, neg_ct, compensation)``.
    """
    if np.isnan(val):
        return nobs, sum_x, neg_ct, compensation

    # Kahan step on the negated value: apply the previously lost low-order
    # bits, then record what is lost by this subtraction
    adjusted = -val - compensation
    total = sum_x + adjusted
    compensation = total - sum_x - adjusted
    if val < 0:
        neg_ct -= 1
    return nobs - 1, total, neg_ct, compensation
|
||||
|
||||
@numba.jit(nopython=True, nogil=True, parallel=False)
def sliding_mean(
    values: np.ndarray,
    start: np.ndarray,
    end: np.ndarray,
    min_periods: int,
) -> np.ndarray:
    """
    Compute the mean of ``values`` over a sequence of windows.

    Window ``i`` covers ``values[start[i]:end[i]]``. NaN observations are
    skipped. When both ``start`` and ``end`` are monotonically increasing the
    running state is updated incrementally (values leaving the window are
    removed, values entering it are added); otherwise each window is
    recomputed from scratch.

    Parameters
    ----------
    values : np.ndarray
        1D array of observations
    start : np.ndarray
        integer start index (inclusive) of each window
    end : np.ndarray
        integer end index (exclusive) of each window
    min_periods : int
        minimum number of non-NaN observations required for a result

    Returns
    -------
    np.ndarray
        float64 array of length ``len(start)``; NaN where a window holds
        fewer than ``min_periods`` (or zero) non-NaN observations.
    """
    N = len(start)
    # nobs is a count, so keep it an integer rather than a float
    nobs = 0
    sum_x = 0.0
    neg_ct = 0
    compensation_add = 0.0
    compensation_remove = 0.0

    is_monotonic_increasing_bounds = is_monotonic_increasing(
        start
    ) and is_monotonic_increasing(end)

    output = np.empty(N, dtype=np.float64)

    for i in range(N):
        s = start[i]
        e = end[i]
        if i == 0 or not is_monotonic_increasing_bounds:
            # first window, or non-monotonic bounds: accumulate the whole
            # window from an empty state
            for j in range(s, e):
                val = values[j]
                nobs, sum_x, neg_ct, compensation_add = add_mean(
                    val, nobs, sum_x, neg_ct, compensation_add
                )
        else:
            # drop the observations that fell out of the window
            for j in range(start[i - 1], s):
                val = values[j]
                nobs, sum_x, neg_ct, compensation_remove = remove_mean(
                    val, nobs, sum_x, neg_ct, compensation_remove
                )

            # take in the observations that entered the window
            for j in range(end[i - 1], e):
                val = values[j]
                nobs, sum_x, neg_ct, compensation_add = add_mean(
                    val, nobs, sum_x, neg_ct, compensation_add
                )

        if nobs >= min_periods and nobs > 0:
            result = sum_x / nobs
            # round-off must not flip the sign: with no negative inputs the
            # mean cannot be negative, and with only negative inputs it
            # cannot be positive
            if neg_ct == 0 and result < 0:
                result = 0.0
            elif neg_ct == nobs and result > 0:
                result = 0.0
        else:
            result = np.nan

        output[i] = result

        if not is_monotonic_increasing_bounds:
            # the next window starts from scratch, so clear all running
            # state, including both compensation terms; a stale
            # compensation_add would otherwise be applied to the new sum
            nobs = 0
            sum_x = 0.0
            neg_ct = 0
            compensation_add = 0.0
            compensation_remove = 0.0

    return output
Uh oh!
There was an error while loading. Please reload this page.