From 7c65e4e4aff6a659ea055426e6a0f59352afeb76 Mon Sep 17 00:00:00 2001 From: Marat Kopytjuk Date: Sat, 25 Mar 2023 14:06:56 +0100 Subject: [PATCH 1/9] docs: improve resample interpolate docs --- pandas/core/resample.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 3b31932952867..0ea9d8643545b 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -839,7 +839,24 @@ def interpolate( **kwargs, ): """ - Interpolate values according to different methods. + Interpolate values according to different methods. Note that the points used + as anchor points in the interpolation are taken from the original timeseries + and thus it can lead to information loss and wrong interpolation! + + ``` + # example of 2Hz timeseries resampled with rule="400ms" + 2023-03-01 07:00:00.000 1.0 <- anchor 0 + 2023-03-01 07:00:00.500 0.0 + 2023-03-01 07:00:01.000 -1.0 + 2023-03-01 07:00:01.500 0.5 + 2023-03-01 07:00:02.000 2.0 <- anchor 1 + 2023-03-01 07:00:02.500 1.5 + 2023-03-01 07:00:03.000 1.0 + 2023-03-01 07:00:03.500 2.0 + 2023-03-01 07:00:04.000 3.0 <- anchor 2 + ``` + + The new time-series based on the `rule` is based solely on the anchor points. """ result = self._upsample("asfreq") return result.interpolate( From 73a937df9db573585c186cdb2c4c3ccd87c51855 Mon Sep 17 00:00:00 2001 From: Marat Kopytjuk Date: Thu, 30 Mar 2023 20:31:41 +0200 Subject: [PATCH 2/9] Improve docs --- pandas/core/resample.py | 65 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 8 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index e6695a17d6953..85927bd184fb3 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -839,12 +839,60 @@ def interpolate( **kwargs, ): """ - Interpolate values according to different methods. 
Note that the points used - as anchor points in the interpolation are taken from the original timeseries - and thus it can lead to information loss and wrong interpolation! + Interpolate values according to different methods. - ``` - # example of 2Hz timeseries resampled with rule="400ms" + Notes + ----- + The original index is first reindexed to new time buckets (anchors), + then the interpolation happens. For non-equidistant time-series this + behaviour may lead to data loss as shown in the last example. + + Examples + -------- + + >>> import datetime as dt + >>> timesteps = [ + ... dt.datetime(2023, 3, 1, 7, 0, 0), + ... dt.datetime(2023, 3, 1, 7, 0, 1), + ... dt.datetime(2023, 3, 1, 7, 0, 2), + ... dt.datetime(2023, 3, 1, 7, 0, 3), + ... dt.datetime(2023, 3, 1, 7, 0, 4)] + >>> series = pd.Series(data=[1, -1, 2, 1, 3], index=timesteps) + >>> series + 2023-03-01 07:00:00 1 + 2023-03-01 07:00:01 -1 + 2023-03-01 07:00:02 2 + 2023-03-01 07:00:03 1 + 2023-03-01 07:00:04 3 + dtype: int64 + + Upsample the dataframe to 0.5Hz + + >>> series.resample("2s").interpolate("linear") + 2023-03-01 07:00:00 1 + 2023-03-01 07:00:02 2 + 2023-03-01 07:00:04 3 + Freq: 2S, dtype: int64 + + Downsample the dataframe to 2Hz + + >>> series.resample("500ms").interpolate("linear") + 2023-03-01 07:00:00.000 1.0 + 2023-03-01 07:00:00.500 0.0 + 2023-03-01 07:00:01.000 -1.0 + 2023-03-01 07:00:01.500 0.5 + 2023-03-01 07:00:02.000 2.0 + 2023-03-01 07:00:02.500 1.5 + 2023-03-01 07:00:03.000 1.0 + 2023-03-01 07:00:03.500 2.0 + 2023-03-01 07:00:04.000 3.0 + Freq: 500L, dtype: float64 + + Internal reindexing prior to interpolation leads to a timeseries + interpolated given the reindexed timestamps (anchros) which can lead + to misleading interpolation results: + + >>> series.resample("400ms").interpolate("linear") 2023-03-01 07:00:00.000 1.0 <- anchor 0 2023-03-01 07:00:00.500 0.0 2023-03-01 07:00:01.000 -1.0 @@ -854,10 +902,11 @@ def interpolate( 2023-03-01 07:00:03.000 1.0 2023-03-01 07:00:03.500 
2.0 2023-03-01 07:00:04.000 3.0 <- anchor 2 - ``` - - The new time-series based on the `rule` is based solely on the anchor points. + Freq: 1500L, dtype: float64 + + Note that the series erroneously increases between two anchors. """ + result = self._upsample("asfreq") return result.interpolate( method=method, From a2725a6c7b86ac2b82fec3c6bb2208170b7302a1 Mon Sep 17 00:00:00 2001 From: Marat Kopytjuk Date: Sun, 2 Apr 2023 09:56:28 +0200 Subject: [PATCH 3/9] Fix comments after pr review --- pandas/core/resample.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 85927bd184fb3..95b9bec2216f0 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -888,23 +888,25 @@ def interpolate( 2023-03-01 07:00:04.000 3.0 Freq: 500L, dtype: float64 - Internal reindexing prior to interpolation leads to a timeseries - interpolated given the reindexed timestamps (anchros) which can lead - to misleading interpolation results: + Internal reindexing with ``as_freq()`` prior to interpolation leads to + an interpolated timeseries on the basis the reindexed timestamps (anchors). + Since not all datapoints from original series become anchors, + it can lead to misleading interpolation results as in the following exapmple: >>> series.resample("400ms").interpolate("linear") - 2023-03-01 07:00:00.000 1.0 <- anchor 0 + 2023-03-01 07:00:00.000 1.0 2023-03-01 07:00:00.500 0.0 2023-03-01 07:00:01.000 -1.0 2023-03-01 07:00:01.500 0.5 - 2023-03-01 07:00:02.000 2.0 <- anchor 1 + 2023-03-01 07:00:02.000 2.0 2023-03-01 07:00:02.500 1.5 2023-03-01 07:00:03.000 1.0 2023-03-01 07:00:03.500 2.0 - 2023-03-01 07:00:04.000 3.0 <- anchor 2 + 2023-03-01 07:00:04.000 3.0 Freq: 1500L, dtype: float64 - Note that the series erroneously increases between two anchors. + Note that the series erroneously increases between two anchors + ``07:00:00`` and ``07:00:02``. 
""" result = self._upsample("asfreq") From ef710dbe03a12e3d7c17312765162ab3cb0290bf Mon Sep 17 00:00:00 2001 From: Marat Kopytjuk Date: Sun, 2 Apr 2023 10:18:56 +0200 Subject: [PATCH 4/9] fix black linter outputs --- pandas/core/resample.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 95b9bec2216f0..ee1e987adfb30 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -839,11 +839,12 @@ def interpolate( **kwargs, ): """ - Interpolate values according to different methods. - + Interpolate values according to different methods. + Notes ----- - The original index is first reindexed to new time buckets (anchors), + The original index is first reindexed + (see :meth:`pandas.Resampler.asfreq`) to new time buckets (anchors), then the interpolation happens. For non-equidistant time-series this behaviour may lead to data loss as shown in the last example. @@ -890,7 +891,7 @@ def interpolate( Internal reindexing with ``as_freq()`` prior to interpolation leads to an interpolated timeseries on the basis the reindexed timestamps (anchors). 
- Since not all datapoints from original series become anchors, + Since not all datapoints from original series become anchors, it can lead to misleading interpolation results as in the following exapmple: >>> series.resample("400ms").interpolate("linear") From 2bcf395e8313c7b7de3a896dbd7dc278c8077ff6 Mon Sep 17 00:00:00 2001 From: Marat Kopytjuk Date: Sun, 2 Apr 2023 10:28:53 +0200 Subject: [PATCH 5/9] Remove DataFrame.interpolate docs from resample docs --- pandas/core/resample.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index ee1e987adfb30..ce742051cc76e 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -825,7 +825,6 @@ def fillna(self, method, limit: int | None = None): """ return self._upsample(method, limit=limit) - @doc(NDFrame.interpolate, **_shared_docs_kwargs) def interpolate( self, method: QuantileInterpolation = "linear", @@ -844,10 +843,17 @@ def interpolate( Notes ----- The original index is first reindexed - (see :meth:`pandas.Resampler.asfreq`) to new time buckets (anchors), - then the interpolation happens. For non-equidistant time-series this + (see :meth:`pandas.core.resample.Resampler.asfreq`) to new time buckets, + then the interpolation`of NaNs via `DataFrame.interpolate` happens. + For non-equidistant time-series this behaviour may lead to data loss as shown in the last example. + See Also + -------- + pandas.core.resample.Resampler.asfreq: Return the values at the new freq, + essentially a reindex. + DataFrame.interpolate: Fill NaN values using an interpolation method. 
+ Examples -------- From 1285a814f3d9b7d1e9a9fa5661e1b65a9386ab18 Mon Sep 17 00:00:00 2001 From: Marat Kopytjuk Date: Sun, 2 Apr 2023 10:43:18 +0200 Subject: [PATCH 6/9] fix typo --- pandas/core/resample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index ce742051cc76e..1cbd52257c9c5 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -898,7 +898,7 @@ def interpolate( Internal reindexing with ``as_freq()`` prior to interpolation leads to an interpolated timeseries on the basis the reindexed timestamps (anchors). Since not all datapoints from original series become anchors, - it can lead to misleading interpolation results as in the following exapmple: + it can lead to misleading interpolation results as in the following example: >>> series.resample("400ms").interpolate("linear") 2023-03-01 07:00:00.000 1.0 From 0ba4073072159baa52fe0a656a9437ceaed3504f Mon Sep 17 00:00:00 2001 From: Marat Kopytjuk Date: Sun, 2 Apr 2023 10:56:57 +0200 Subject: [PATCH 7/9] Add returns section --- pandas/core/resample.py | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 1cbd52257c9c5..e00018a48cee4 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -840,20 +840,25 @@ def interpolate( """ Interpolate values according to different methods. + Returns + ------- + DataFrame or Series + Interpolated values at the specified freq. + + See Also + -------- + core.resample.Resampler.asfreq: Return the values at the new freq, + essentially a reindex. + DataFrame.interpolate: Fill NaN values using an interpolation method. + Notes ----- The original index is first reindexed - (see :meth:`pandas.core.resample.Resampler.asfreq`) to new time buckets, + (see :meth:`core.resample.Resampler.asfreq`) to new time buckets, then the interpolation`of NaNs via `DataFrame.interpolate` happens. 
For non-equidistant time-series this behaviour may lead to data loss as shown in the last example. - See Also - -------- - pandas.core.resample.Resampler.asfreq: Return the values at the new freq, - essentially a reindex. - DataFrame.interpolate: Fill NaN values using an interpolation method. - Examples -------- @@ -902,15 +907,17 @@ def interpolate( >>> series.resample("400ms").interpolate("linear") 2023-03-01 07:00:00.000 1.0 - 2023-03-01 07:00:00.500 0.0 - 2023-03-01 07:00:01.000 -1.0 - 2023-03-01 07:00:01.500 0.5 + 2023-03-01 07:00:00.400 1.2 + 2023-03-01 07:00:00.800 1.4 + 2023-03-01 07:00:01.200 1.6 + 2023-03-01 07:00:01.600 1.8 2023-03-01 07:00:02.000 2.0 - 2023-03-01 07:00:02.500 1.5 - 2023-03-01 07:00:03.000 1.0 - 2023-03-01 07:00:03.500 2.0 + 2023-03-01 07:00:02.400 2.2 + 2023-03-01 07:00:02.800 2.4 + 2023-03-01 07:00:03.200 2.6 + 2023-03-01 07:00:03.600 2.8 2023-03-01 07:00:04.000 3.0 - Freq: 1500L, dtype: float64 + Freq: 400L, dtype: float64 Note that the series erroneously increases between two anchors ``07:00:00`` and ``07:00:02``. From 71ec305299b4ba53570a9b34bb4981ba046079b7 Mon Sep 17 00:00:00 2001 From: Marat Kopytjuk Date: Thu, 6 Apr 2023 17:54:10 +0200 Subject: [PATCH 8/9] Add parameters to core.resample.Resampler.interpolate --- .gitignore | 1 + pandas/core/resample.py | 88 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 80 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 88ed58b70925d..d348a67ea7934 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,7 @@ .tags .cache/ .vscode/ +.venv/ # Compiled source # ################### diff --git a/pandas/core/resample.py b/pandas/core/resample.py index e00018a48cee4..8cc578b7fd0b6 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -838,7 +838,80 @@ def interpolate( **kwargs, ): """ - Interpolate values according to different methods. + Interpolate values between target timestamps according to different methods. 
+ + The original index is first reindexed to target timestamps + (see :meth:`core.resample.Resampler.asfreq`), + then the interpolation of ``NaN`` values via :meth:`DataFrame.interpolate` + happens. + + Parameters + ---------- + method : str, default 'linear' + Interpolation technique to use. One of: + + * 'linear': Ignore the index and treat the values as equally + spaced. This is the only method supported on MultiIndexes. + * 'time': Works on daily and higher resolution data to interpolate + given length of interval. + * 'index', 'values': use the actual numerical values of the index. + * 'pad': Fill in NaNs using existing values. + * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', + 'barycentric', 'polynomial': Passed to + `scipy.interpolate.interp1d`, whereas 'spline' is passed to + `scipy.interpolate.UnivariateSpline`. These methods use the numerical + values of the index. Both 'polynomial' and 'spline' require that + you also specify an `order` (int), e.g. + ``df.interpolate(method='polynomial', order=5)``. Note that, + `slinear` method in Pandas refers to the Scipy first order `spline` + instead of Pandas first order `spline`. + * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima', + 'cubicspline': Wrappers around the SciPy interpolation methods of + similar names. See `Notes`. + * 'from_derivatives': Refers to + `scipy.interpolate.BPoly.from_derivatives` which + replaces 'piecewise_polynomial' interpolation method in + scipy 0.18. + + axis : {0 or 'index', 1 or 'columns', None}, default None + Axis to interpolate along. For `Series` this parameter is unused + and defaults to 0. + limit : int, optional + Maximum number of consecutive NaNs to fill. Must be greater than + 0. + inplace : bool, default False + Update the data in place if possible. + limit_direction : {'forward', 'backward', 'both'}, optional + Consecutive NaNs will be filled in this direction. 
+ + If limit is specified: + * If 'method' is 'pad' or 'ffill', 'limit_direction' must be 'forward'. + * If 'method' is 'backfill' or 'bfill', 'limit_direction' must be + 'backward'. + + If 'limit' is not specified: + * If 'method' is 'backfill' or 'bfill', the default is 'backward' + * else the default is 'forward' + + .. versionchanged:: 1.1.0 + raises ValueError if `limit_direction` is 'forward' or 'both' and + method is 'backfill' or 'bfill'. + raises ValueError if `limit_direction` is 'backward' or 'both' and + method is 'pad' or 'ffill'. + + limit_area : {`None`, 'inside', 'outside'}, default None + If limit is specified, consecutive NaNs will be filled with this + restriction. + + * ``None``: No fill restriction. + * 'inside': Only fill NaNs surrounded by valid values + (interpolate). + * 'outside': Only fill NaNs outside valid values (extrapolate). + + downcast : optional, 'infer' or None, defaults to None + Downcast dtypes if possible. + ``**kwargs`` : optional + Keyword arguments to pass on to the interpolating function. Returns ------- @@ -853,11 +926,9 @@ def interpolate( Notes ----- - The original index is first reindexed - (see :meth:`core.resample.Resampler.asfreq`) to new time buckets, - then the interpolation`of NaNs via `DataFrame.interpolate` happens. - For non-equidistant time-series this - behaviour may lead to data loss as shown in the last example. + For high-frequency or non-equidistant time series, the reindexing + followed by interpolation may lead to information loss + as shown in the last example. Examples -------- @@ -878,7 +949,7 @@ def interpolate( 2023-03-01 07:00:04 3 dtype: int64 - Upsample the dataframe to 0.5Hz + Downsample the series to 0.5Hz by providing the period time of 2s. 
>>> series.resample("2s").interpolate("linear") 2023-03-01 07:00:00 1 @@ -886,7 +957,7 @@ def interpolate( 2023-03-01 07:00:04 3 Freq: 2S, dtype: int64 - Downsample the dataframe to 2Hz + Upsample the series to 2Hz by providing the period time of 500ms. >>> series.resample("500ms").interpolate("linear") 2023-03-01 07:00:00.000 1.0 @@ -922,7 +993,6 @@ def interpolate( Note that the series erroneously increases between two anchors ``07:00:00`` and ``07:00:02``. """ - result = self._upsample("asfreq") return result.interpolate( method=method, From cd3e67e21a7f4e7f14b47bf1378a8f864305beb1 Mon Sep 17 00:00:00 2001 From: Marat Kopytjuk Date: Thu, 6 Apr 2023 18:03:25 +0200 Subject: [PATCH 9/9] revert gitignore --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index d348a67ea7934..88ed58b70925d 100644 --- a/.gitignore +++ b/.gitignore @@ -23,7 +23,6 @@ .tags .cache/ .vscode/ -.venv/ # Compiled source # ###################