Skip to content

Commit a0100e6

Browse files
committed
WIP: Experimental changes in rolling_var related to pandas-dev#7900
Added logic to `rolling_var` to detect windows where all non-NaN values are identical. Need to assess both correctness and performance impact.
1 parent 989a51b commit a0100e6

File tree

1 file changed

+36
-43
lines changed

1 file changed

+36
-43
lines changed

pandas/algos.pyx

Lines changed: 36 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1160,75 +1160,68 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1):
11601160
"""
11611161
Numerically stable implementation using Welford's method.
11621162
"""
1163-
cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta
1164-
cdef Py_ssize_t i
1163+
cdef double val, prev, mean_x = 0, ssqdm_x = 0, delta, rep = NaN
1164+
cdef Py_ssize_t nobs = 0, nrep = 0, i
11651165
cdef Py_ssize_t N = len(input)
11661166

11671167
cdef ndarray[double_t] output = np.empty(N, dtype=float)
11681168

11691169
minp = _check_minp(win, minp, N)
11701170

1171-
# Check for windows larger than array, addresses #7297
1172-
win = min(win, N)
1173-
1174-
# Over the first window, observations can only be added, never removed
1175-
for i from 0 <= i < win:
1171+
for i from 0 <= i < N:
11761172
val = input[i]
1173+
prev = NaN if i < win else input[i - win]
1174+
1175+
# First, count the number of observations and consecutive repeats
1176+
if prev == prev:
1177+
# prev is not NaN, removing an observation...
1178+
if nobs == nrep:
1179+
# ...and removing a repeat
1180+
nrep -= 1
1181+
if nrep == 0:
1182+
rep = NaN
1183+
nobs -= 1
11771184

1178-
# Not NaN
11791185
if val == val:
1180-
nobs += 1
1181-
delta = (val - mean_x)
1182-
mean_x += delta / nobs
1183-
ssqdm_x += delta * (val - mean_x)
1184-
1185-
if nobs >= minp:
1186-
#pathological case
1187-
if nobs == 1:
1188-
val = 0
1186+
# next is not NaN, adding an observation...
1187+
if val == prev:
1188+
# ...and adding a repeat
1189+
nrep += 1
11891190
else:
1190-
val = ssqdm_x / (nobs - ddof)
1191-
if val < 0:
1192-
val = 0
1193-
else:
1194-
val = NaN
1195-
1196-
output[i] = val
1197-
1198-
# After the first window, observations can both be added and removed
1199-
for i from win <= i < N:
1200-
val = input[i]
1201-
prev = input[i - win]
1191+
# ...and resetting repeats
1192+
nrep = 1
1193+
rep = val
1194+
nobs += 1
12021195

1203-
if val == val:
1196+
# Then, compute the new mean and sum of squared differences
1197+
if nobs == nrep:
1198+
# All non-NaN values in window are identical...
1199+
ssqdm_x = 0
1200+
mean_x = rep if nobs > 0 else 0
1201+
elif val == val:
1202+
# Adding one observation...
12041203
if prev == prev:
1205-
# Adding one observation and removing another one
1204+
# ...and removing another
12061205
delta = val - prev
12071206
prev -= mean_x
12081207
mean_x += delta / nobs
12091208
val -= mean_x
12101209
ssqdm_x += (val + prev) * delta
12111210
else:
1212-
# Adding one observation and not removing any
1213-
nobs += 1
1211+
# ...and not removing any
12141212
delta = (val - mean_x)
12151213
mean_x += delta / nobs
12161214
ssqdm_x += delta * (val - mean_x)
12171215
elif prev == prev:
12181216
# Adding no new observation, but removing one
1219-
nobs -= 1
1220-
if nobs:
1221-
delta = (prev - mean_x)
1222-
mean_x -= delta / nobs
1223-
ssqdm_x -= delta * (prev - mean_x)
1224-
else:
1225-
mean_x = 0
1226-
ssqdm_x = 0
1217+
delta = (prev - mean_x)
1218+
mean_x -= delta / nobs
1219+
ssqdm_x -= delta * (prev - mean_x)
12271220
# Variance is unchanged if no observation is added or removed
12281221

1222+
# Finally, compute and write the rolling variance to the output array
12291223
if nobs >= minp:
1230-
#pathological case
1231-
if nobs == 1:
1224+
if nobs <= ddof:
12321225
val = 0
12331226
else:
12341227
val = ssqdm_x / (nobs - ddof)

0 commit comments

Comments
 (0)