diff --git a/doc/source/release.rst b/doc/source/release.rst index 7824a69c92561..c494db2ae91c4 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -276,6 +276,7 @@ Improvements to existing features - Add option to turn off escaping in ``DataFrame.to_latex`` (:issue:`6472`) - Added ``how`` option to rolling-moment functions to dictate how to handle resampling; :func:``rolling_max`` defaults to max, :func:``rolling_min`` defaults to min, and all others default to mean (:issue:`6297`) +- ``pd.stats.moments.rolling_var`` now uses Welford's method for increased numerical stability (:issue:`6817`) .. _release.bug_fixes-0.14.0: diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 27e25c3954dad..bba6b46c52e37 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -1122,7 +1122,10 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1): # Rolling variance def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1): - cdef double val, prev, sum_x = 0, sum_xx = 0, nobs = 0 + """ + Numerically stable implementation using Welford's method. + """ + cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta cdef Py_ssize_t i cdef Py_ssize_t N = len(input) @@ -1130,48 +1133,71 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1): minp = _check_minp(win, minp, N) - for i from 0 <= i < minp - 1: + for i from 0 <= i < win: val = input[i] # Not NaN if val == val: nobs += 1 - sum_x += val - sum_xx += val * val + delta = (val - mean_x) + mean_x += delta / nobs + ssqdm_x += delta * (val - mean_x) - output[i] = NaN + if nobs >= minp: + #pathological case + if nobs == 1: + val = 0 + else: + val = ssqdm_x / (nobs - ddof) + if val < 0: + val = 0 + else: + val = NaN - for i from minp - 1 <= i < N: + output[i] = val + + for i from win <= i < N: val = input[i] + prev = input[i - win] if val == val: - nobs += 1 - sum_x += val - sum_xx += val * val - - if i > win - 1: - prev = input[i - win] if prev == prev: - sum_x -= prev - sum_xx -= prev * prev - nobs -= 1 + delta = val - prev + prev -= mean_x + mean_x += delta / nobs + val -= mean_x + ssqdm_x += (val + prev) * delta + else: + nobs += 1 + delta = (val - mean_x) + mean_x += delta / nobs + ssqdm_x += delta * (val - mean_x) + elif prev == prev: + nobs -= 1 + if nobs: + delta = (prev - mean_x) + mean_x -= delta / nobs + ssqdm_x -= delta * (prev - mean_x) + else: + mean_x = 0 + ssqdm_x = 0 if nobs >= minp: - # pathological case + #pathological case if nobs == 1: - output[i] = 0 - continue - - val = (nobs * sum_xx - sum_x * sum_x) / (nobs * (nobs - ddof)) - if val < 0: val = 0 - - output[i] = val + else: + val = ssqdm_x / (nobs - ddof) + if val < 0: + val = 0 else: - output[i] = NaN + val = NaN + + output[i] = val return output + #------------------------------------------------------------------------------- # Rolling skewness diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index 246037c7d7009..42da19f1a241d 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -751,7 +751,7 @@ def rolling_window(arg, window=None, win_type=None, min_periods=None, * ``gaussian`` (needs std) * ``general_gaussian`` (needs power, width) * ``slepian`` (needs width). - + By default, the result is set to the right edge of the window. This can be changed to the center of the window by setting ``center=True``. @@ -978,7 +978,7 @@ def expanding_apply(arg, func, min_periods=1, freq=None, center=False, Returns ------- y : type of input argument - + Notes ----- The `freq` keyword is used to conform time series data to a specified diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py index 8c9eb080cfc61..06e7484bbd536 100644 --- a/pandas/stats/tests/test_moments.py +++ b/pandas/stats/tests/test_moments.py @@ -295,7 +295,8 @@ def test_rolling_std_neg_sqrt(self): def test_rolling_var(self): self._check_moment_func(mom.rolling_var, - lambda x: np.var(x, ddof=1)) + lambda x: np.var(x, ddof=1), + test_stable=True) self._check_moment_func(functools.partial(mom.rolling_var, ddof=0), lambda x: np.var(x, ddof=0)) @@ -349,13 +350,15 @@ def _check_moment_func(self, func, static_comp, window=50, has_center=True, has_time_rule=True, preserve_nan=True, - fill_value=None): + fill_value=None, + test_stable=False): self._check_ndarray(func, static_comp, window=window, has_min_periods=has_min_periods, preserve_nan=preserve_nan, has_center=has_center, - fill_value=fill_value) + fill_value=fill_value, + test_stable=test_stable) self._check_structures(func, static_comp, has_min_periods=has_min_periods, @@ -367,7 +370,8 @@ def _check_ndarray(self, func, static_comp, window=50, has_min_periods=True, preserve_nan=True, has_center=True, - fill_value=None): + fill_value=None, + test_stable=False): result = func(self.arr, window) assert_almost_equal(result[-1], @@ -425,6 +429,12 @@ def _check_ndarray(self, func, static_comp, window=50, self.assert_(np.isnan(expected[-5])) self.assert_(np.isnan(result[-14])) + if test_stable: + result = func(self.arr + 1e9, window) + assert_almost_equal(result[-1], + static_comp(self.arr[-50:] + 1e9)) + + def _check_structures(self, func, static_comp, has_min_periods=True, has_time_rule=True, has_center=True,