ENH: Use Welford's method in stats.moments.rolling_var

jaimefrio · jaimefrio · commit f154c6e2e54c · 2014-04-22T13:15:10.000-07:00
This PR implements a modified version of Welford's method to compute the rolling variance. Instead of keeping track of the sum and sum of the squares of the items in the window, it tracks the mean and the sum of squared differences from the mean. This turns out to be (much) more numerically stable. The formulas to update these two variables when adding or removing an item from the sequence are well known, see e.g. http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance The formulas used when both adding one and removing one item I have not seen explicitly worked out anywhere, but are not too hard to come up with if you put pen to (a lot of) paper.
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -276,6 +276,7 @@ Improvements to existing features
 - Add option to turn off escaping in ``DataFrame.to_latex`` (:issue:`6472`)
 - Added ``how`` option to rolling-moment functions to dictate how to handle resampling; :func:``rolling_max`` defaults to max,
   :func:``rolling_min`` defaults to min, and all others default to mean (:issue:`6297`)
+- ``pd.stats.moments.rolling_var`` now uses Welford's method for increased numerical stability (:issue:`6817`)
 
 .. _release.bug_fixes-0.14.0:
 
diff --git a/pandas/algos.pyx b/pandas/algos.pyx
@@ -1122,56 +1122,82 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1):
 # Rolling variance
 
 def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1):
-    cdef double val, prev, sum_x = 0, sum_xx = 0, nobs = 0
+    """
+    Numerically stable implementation using Welford's method.
+    """
+    cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta
     cdef Py_ssize_t i
     cdef Py_ssize_t N = len(input)
 
     cdef ndarray[double_t] output = np.empty(N, dtype=float)
 
     minp = _check_minp(win, minp, N)
 
-    for i from 0 <= i < minp - 1:
+    for i from 0 <= i < win:
         val = input[i]
 
         # Not NaN
         if val == val:
             nobs += 1
-            sum_x += val
-            sum_xx += val * val
+            delta = (val - mean_x)
+            mean_x += delta / nobs
+            ssqdm_x += delta * (val - mean_x)
 
-        output[i] = NaN
+        if nobs >= minp:
+            #pathological case
+            if nobs == 1:
+                val = 0
+            else:
+                val = ssqdm_x / (nobs - ddof)
+                if val < 0:
+                    val = 0
+        else:
+            val = NaN
 
-    for i from minp - 1 <= i < N:
+        output[i] = val
+
+    for i from win <= i < N:
         val = input[i]
+        prev = input[i - win]
 
         if val == val:
-            nobs += 1
-            sum_x += val
-            sum_xx += val * val
-
-        if i > win - 1:
-            prev = input[i - win]
             if prev == prev:
-                sum_x -= prev
-                sum_xx -= prev * prev
-                nobs -= 1
+                delta = val - prev
+                prev -= mean_x
+                mean_x += delta / nobs
+                val -= mean_x
+                ssqdm_x += (val + prev) * delta
+            else:
+                nobs += 1
+                delta = (val - mean_x)
+                mean_x += delta / nobs
+                ssqdm_x += delta * (val - mean_x)
+        elif prev == prev:
+            nobs -= 1
+            if nobs:
+                delta = (prev - mean_x)
+                mean_x -= delta  / nobs
+                ssqdm_x -= delta * (prev - mean_x)
+            else:
+                mean_x = 0
+                ssqdm_x = 0
 
         if nobs >= minp:
-            # pathological case
+            #pathological case
             if nobs == 1:
-                output[i] = 0
-                continue
-
-            val = (nobs * sum_xx - sum_x * sum_x) / (nobs * (nobs - ddof))
-            if val < 0:
                 val = 0
-
-            output[i] = val
+            else:
+                val = ssqdm_x / (nobs - ddof)
+                if val < 0:
+                    val = 0
         else:
-            output[i] = NaN
+            val = NaN
+
+        output[i] = val
 
     return output
 
+
 #-------------------------------------------------------------------------------
 # Rolling skewness
 
diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py
@@ -751,7 +751,7 @@ def rolling_window(arg, window=None, win_type=None, min_periods=None,
     * ``gaussian`` (needs std)
     * ``general_gaussian`` (needs power, width)
     * ``slepian`` (needs width).
-    
+
     By default, the result is set to the right edge of the window. This can be
     changed to the center of the window by setting ``center=True``.
 
@@ -978,7 +978,7 @@ def expanding_apply(arg, func, min_periods=1, freq=None, center=False,
     Returns
     -------
     y : type of input argument
-    
+
     Notes
     -----
     The `freq` keyword is used to conform time series data to a specified
diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py
@@ -295,7 +295,8 @@ def test_rolling_std_neg_sqrt(self):
 
     def test_rolling_var(self):
         self._check_moment_func(mom.rolling_var,
-                                lambda x: np.var(x, ddof=1))
+                                lambda x: np.var(x, ddof=1),
+                                test_stable=True)
         self._check_moment_func(functools.partial(mom.rolling_var, ddof=0),
                                 lambda x: np.var(x, ddof=0))
 
@@ -349,13 +350,15 @@ def _check_moment_func(self, func, static_comp, window=50,
                            has_center=True,
                            has_time_rule=True,
                            preserve_nan=True,
-                           fill_value=None):
+                           fill_value=None,
+                           test_stable=False):
 
         self._check_ndarray(func, static_comp, window=window,
                             has_min_periods=has_min_periods,
                             preserve_nan=preserve_nan,
                             has_center=has_center,
-                            fill_value=fill_value)
+                            fill_value=fill_value,
+                            test_stable=test_stable)
 
         self._check_structures(func, static_comp,
                                has_min_periods=has_min_periods,
@@ -367,7 +370,8 @@ def _check_ndarray(self, func, static_comp, window=50,
                        has_min_periods=True,
                        preserve_nan=True,
                        has_center=True,
-                       fill_value=None):
+                       fill_value=None,
+                       test_stable=False):
 
         result = func(self.arr, window)
         assert_almost_equal(result[-1],
@@ -425,6 +429,12 @@ def _check_ndarray(self, func, static_comp, window=50,
                 self.assert_(np.isnan(expected[-5]))
                 self.assert_(np.isnan(result[-14]))
 
+        if test_stable:
+            result = func(self.arr + 1e9, window)
+            assert_almost_equal(result[-1],
+                                static_comp(self.arr[-50:] + 1e9))
+
+
     def _check_structures(self, func, static_comp,
                           has_min_periods=True, has_time_rule=True,
                           has_center=True,