From cc36ec91d4a7650adc44e411adebd4cc4770d21b Mon Sep 17 00:00:00 2001
From: dezmond22
Date: Sat, 3 Oct 2020 13:31:19 +0300
Subject: [PATCH 1/2] DOC: update code style for remaining intro tutorial docs
 for #36777

computation.rst
---
 doc/source/user_guide/computation.rst | 166 +++++++++++++-------------
 1 file changed, 86 insertions(+), 80 deletions(-)

diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst
index e7edda90610b5..2836cc69c2916 100644
--- a/doc/source/user_guide/computation.rst
+++ b/doc/source/user_guide/computation.rst
@@ -64,7 +64,7 @@ series in the DataFrame, also excluding NA/null values.
 .. ipython:: python

    frame = pd.DataFrame(np.random.randn(1000, 5),
-                        columns=['a', 'b', 'c', 'd', 'e'])
+                        columns=["a", "b", "c", "d", "e"])
    frame.cov()

 ``DataFrame.cov`` also supports an optional ``min_periods`` keyword that
@@ -73,9 +73,9 @@ in order to have a valid result.

 .. ipython:: python

-   frame = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c'])
-   frame.loc[frame.index[:5], 'a'] = np.nan
-   frame.loc[frame.index[5:10], 'b'] = np.nan
+   frame = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"])
+   frame.loc[frame.index[:5], "a"] = np.nan
+   frame.loc[frame.index[5:10], "b"] = np.nan

    frame.cov()

@@ -117,12 +117,12 @@ Wikipedia has articles covering the above correlation coefficients:
 .. ipython:: python

    frame = pd.DataFrame(np.random.randn(1000, 5),
-                        columns=['a', 'b', 'c', 'd', 'e'])
+                        columns=["a", "b", "c", "d", "e"])
    frame.iloc[::2] = np.nan

    # Series with Series
-   frame['a'].corr(frame['b'])
-   frame['a'].corr(frame['b'], method='spearman')
+   frame["a"].corr(frame["b"])
+   frame["a"].corr(frame["b"], method="spearman")

    # Pairwise correlation of DataFrame columns
    frame.corr()
@@ -134,9 +134,9 @@ Like ``cov``, ``corr`` also supports the optional ``min_periods`` keyword:

 .. ipython:: python

-   frame = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c'])
-   frame.loc[frame.index[:5], 'a'] = np.nan
-   frame.loc[frame.index[5:10], 'b'] = np.nan
+   frame = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"])
+   frame.loc[frame.index[:5], "a"] = np.nan
+   frame.loc[frame.index[5:10], "b"] = np.nan

    frame.corr()

@@ -165,8 +165,8 @@ DataFrame objects.

 .. ipython:: python

-   index = ['a', 'b', 'c', 'd', 'e']
-   columns = ['one', 'two', 'three', 'four']
+   index = ["a", "b", "c", "d", "e"]
+   columns = ["one", "two", "three", "four"]
    df1 = pd.DataFrame(np.random.randn(5, 4), index=index, columns=columns)
    df2 = pd.DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns)
    df1.corrwith(df2)
@@ -182,8 +182,8 @@ assigned the mean of the ranks (by default) for the group:

 .. ipython:: python

-   s = pd.Series(np.random.randn(5), index=list('abcde'))
-   s['d'] = s['b']  # so there's a tie
+   s = pd.Series(np.random.randn(5), index=list("abcde"))
+   s["d"] = s["b"]  # so there's a tie
    s.rank()

 :meth:`~DataFrame.rank` is also a DataFrame method and can rank either the rows
@@ -244,7 +244,7 @@ objects, :class:`~pandas.core.window.Rolling`, :class:`~pandas.core.window.Expan
 .. ipython:: python

    s = pd.Series(np.random.randn(1000),
-                 index=pd.date_range('1/1/2000', periods=1000))
+                 index=pd.date_range("1/1/2000", periods=1000))
    s = s.cumsum()
    s
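The unchanged lines just above the next hunk define ``r = s.rolling(window=60)``, the object whose ``mean()`` the hunk plots. A minimal, self-contained sketch of that pattern for running outside a docs build (the imports and the ``tail()`` call are illustrative additions, not lines from the patched file):

.. ipython:: python

   import numpy as np
   import pandas as pd

   s = pd.Series(np.random.randn(1000),
                 index=pd.date_range("1/1/2000", periods=1000)).cumsum()
   r = s.rolling(window=60)  # a lazy Rolling object; nothing is computed yet
   r.mean().tail()           # like-indexed Series: one 60-observation mean per point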
@@ -279,15 +279,15 @@ We can then call methods on these ``rolling`` objects. These return like-indexed

 .. ipython:: python

-   s.plot(style='k--')
+   s.plot(style="k--")

    @savefig rolling_mean_ex.png
-   r.mean().plot(style='k')
+   r.mean().plot(style="k")

 .. ipython:: python
    :suppress:

-   plt.close('all')
+   plt.close("all")

 They can also be applied to DataFrame objects. This is really just syntactic
@@ -295,8 +295,8 @@ sugar for applying the moving window operator to all of the DataFrame's columns:
 .. ipython:: python

    df = pd.DataFrame(np.random.randn(1000, 4),
-                     index=pd.date_range('1/1/2000', periods=1000),
-                     columns=['A', 'B', 'C', 'D'])
+                     index=pd.date_range("1/1/2000", periods=1000),
+                     columns=["A", "B", "C", "D"])
    df = df.cumsum()

    @savefig rolling_mean_frame.png
@@ -368,7 +368,7 @@ compute the mean absolute deviation on a rolling basis:
        return np.fabs(x - x.mean()).mean()

    @savefig rolling_apply_ex.png
-   s.rolling(window=60).apply(mad, raw=True).plot(style='k')
+   s.rolling(window=60).apply(mad, raw=True).plot(style="k")

 Using the Numba engine
 ~~~~~~~~~~~~~~~~~~~~~~
@@ -377,7 +377,7 @@ Using the Numba engine

 .. versionadded:: 1.0

 Additionally, :meth:`~Rolling.apply` can leverage `Numba <https://numba.pydata.org/>`__
 if installed as an optional dependency. The apply aggregation can be executed using Numba by specifying
-``engine='numba'`` and ``engine_kwargs`` arguments (``raw`` must also be set to ``True``).
+``engine="numba"`` and ``engine_kwargs`` arguments (``raw`` must also be set to ``True``).
 Numba will be applied in potentially two routines:

 1. If ``func`` is a standard Python function, the engine will `JIT `__
@@ -407,13 +407,13 @@ and their default values are set to ``False``, ``True`` and ``False`` respective
    In [3]: def f(x):
       ...:     return np.sum(x) + 5

    # Run the first time, compilation time will affect performance
-   In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True)  # noqa: E225
+   In [4]: %timeit -r 1 -n 1 roll.apply(f, engine="numba", raw=True)  # noqa: E225
    1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

    # Function is cached and performance will improve
-   In [5]: %timeit roll.apply(f, engine='numba', raw=True)
+   In [5]: %timeit roll.apply(f, engine="numba", raw=True)
    188 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

-   In [6]: %timeit roll.apply(f, engine='cython', raw=True)
+   In [6]: %timeit roll.apply(f, engine="cython", raw=True)
    3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

 .. _stats.rolling_window:
@@ -454,22 +454,22 @@ The list of recognized types are the `scipy.signal window functions
 .. ipython:: python

    ser = pd.Series(np.random.randn(10),
-                   index=pd.date_range('1/1/2000', periods=10))
+                   index=pd.date_range("1/1/2000", periods=10))

-   ser.rolling(window=5, win_type='triang').mean()
+   ser.rolling(window=5, win_type="triang").mean()

 Note that the ``boxcar`` window is equivalent to :meth:`~Rolling.mean`.

 .. ipython:: python

-   ser.rolling(window=5, win_type='boxcar').mean()
+   ser.rolling(window=5, win_type="boxcar").mean()
    ser.rolling(window=5).mean()

 For some windowing functions, additional parameters must be specified:

 .. ipython:: python

-   ser.rolling(window=5, win_type='gaussian').mean(std=0.1)
+   ser.rolling(window=5, win_type="gaussian").mean(std=0.1)

 .. _stats.moments.normalization:
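The ``.. _stats.moments.normalization:`` label closing the hunk above anchors the note that weighted windows divide by the sum of the weights, i.e. the weighted ``mean()`` is sum(w * x) / sum(w). A sketch that cross-checks this against a plain ``apply``, assuming SciPy is installed and that pandas uses the symmetric SciPy window; ``std=1.0`` and the integer data are arbitrary illustrative choices:

.. ipython:: python

   import numpy as np
   import pandas as pd
   from scipy.signal.windows import gaussian

   ser = pd.Series(np.arange(10, dtype=float))
   lib = ser.rolling(window=5, win_type="gaussian").mean(std=1.0)

   w = gaussian(5, std=1.0)  # symmetric Gaussian weights, oldest to newest
   manual = ser.rolling(window=5).apply(lambda x: np.dot(w, x) / w.sum(), raw=True)
   np.allclose(lib.dropna(), manual.dropna())  # expected: True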
@@ -498,10 +498,10 @@ This can be particularly useful for a non-regular time frequency index.

 .. ipython:: python

-   dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]},
-                      index=pd.date_range('20130101 09:00:00',
-                                          periods=5,
-                                          freq='s'))
+   dft = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]},
+                      index=pd.date_range("20130101 09:00:00",
+                                          periods=5,
+                                          freq="s"))

    dft

 This is a regular frequency index. Using an integer window parameter works to roll along the window frequency.
@@ -515,20 +515,24 @@ Specifying an offset allows a more intuitive specification of the rolling freque

 .. ipython:: python

-   dft.rolling('2s').sum()
+   dft.rolling("2s").sum()

 Using a non-regular, but still monotonic index, rolling with an integer window does not impart any special calculation.

 .. ipython:: python

-   dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]},
-                      index=pd.Index([pd.Timestamp('20130101 09:00:00'),
-                                      pd.Timestamp('20130101 09:00:02'),
-                                      pd.Timestamp('20130101 09:00:03'),
-                                      pd.Timestamp('20130101 09:00:05'),
-                                      pd.Timestamp('20130101 09:00:06')],
-                      name='foo'))
+   dft = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]},
+       index=pd.Index(
+           [
+               pd.Timestamp("20130101 09:00:00"),
+               pd.Timestamp("20130101 09:00:02"),
+               pd.Timestamp("20130101 09:00:03"),
+               pd.Timestamp("20130101 09:00:05"),
+               pd.Timestamp("20130101 09:00:06")
+           ],
+           name="foo")
+       )

    dft
    dft.rolling(2).sum()
@@ -537,7 +541,7 @@ Using the time-specification generates variable windows for this sparse data.

 .. ipython:: python

-   dft.rolling('2s').sum()
+   dft.rolling("2s").sum()

 Furthermore, we now allow an optional ``on`` parameter to specify a column (rather than the
@@ -546,7 +550,7 @@ default of the index) in a DataFrame.

    dft = dft.reset_index()
    dft
-   dft.rolling('2s', on='foo').sum()
+   dft.rolling("2s", on="foo").sum()

 .. _stats.custom_rolling_window:
@@ -569,7 +573,7 @@ For example, if we have the following ``DataFrame``:

    use_expanding = [True, False, True, False, True]
    use_expanding
-   df = pd.DataFrame({'values': range(5)})
+   df = pd.DataFrame({"values": range(5)})
    df

 and we want to use an expanding window where ``use_expanding`` is ``True`` otherwise a window of size
@@ -615,7 +619,8 @@ rolling operations over a non-fixed offset like a ``BusinessDay``.

 .. ipython:: python

    from pandas.api.indexers import VariableOffsetWindowIndexer
+
    df = pd.DataFrame(range(10), index=pd.date_range('2020', periods=10))
    offset = pd.offsets.BDay(1)
    indexer = VariableOffsetWindowIndexer(index=df.index, offset=offset)
    df
    df.rolling(indexer).sum()
@@ -631,6 +636,7 @@ forward-looking rolling window, and we can use it as follows:

 .. ipython:: python

    from pandas.api.indexers import FixedForwardWindowIndexer
+
    indexer = FixedForwardWindowIndexer(window_size=2)
    df.rolling(indexer, min_periods=1).sum()
@@ -657,17 +663,17 @@ from present information back to past information. This allows the rolling windo

 .. ipython:: python

-   df = pd.DataFrame({'x': 1},
-                     index=[pd.Timestamp('20130101 09:00:01'),
-                            pd.Timestamp('20130101 09:00:02'),
-                            pd.Timestamp('20130101 09:00:03'),
-                            pd.Timestamp('20130101 09:00:04'),
-                            pd.Timestamp('20130101 09:00:06')])
+   df = pd.DataFrame({"x": 1},
+                     index=[pd.Timestamp("20130101 09:00:01"),
+                            pd.Timestamp("20130101 09:00:02"),
+                            pd.Timestamp("20130101 09:00:03"),
+                            pd.Timestamp("20130101 09:00:04"),
+                            pd.Timestamp("20130101 09:00:06")])

-   df["right"] = df.rolling('2s', closed='right').x.sum()  # default
-   df["both"] = df.rolling('2s', closed='both').x.sum()
-   df["left"] = df.rolling('2s', closed='left').x.sum()
-   df["neither"] = df.rolling('2s', closed='neither').x.sum()
+   df["right"] = df.rolling("2s", closed="right").x.sum()  # default
+   df["both"] = df.rolling("2s", closed="both").x.sum()
+   df["left"] = df.rolling("2s", closed="left").x.sum()
+   df["neither"] = df.rolling("2s", closed="neither").x.sum()

    df
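The custom-window hunk above passes ``use_expanding`` to a ``CustomIndexer`` whose class body lies outside the changed lines. For context while reviewing, a sketch of what such a ``BaseIndexer`` subclass can look like, assuming the four-argument ``get_window_bounds`` signature of pandas 1.1 (the loop body is a reconstruction, not copied from the file):

.. ipython:: python

   import numpy as np
   import pandas as pd
   from pandas.api.indexers import BaseIndexer

   class CustomIndexer(BaseIndexer):
       # returns [start, end) integer bounds for the window at each position
       def get_window_bounds(self, num_values, min_periods, center, closed):
           start = np.zeros(num_values, dtype=np.int64)   # expanding: from 0
           end = np.arange(1, num_values + 1, dtype=np.int64)
           for i in range(num_values):
               if not self.use_expanding[i]:
                   start[i] = i  # plain window of size 1
           return start, end

   indexer = CustomIndexer(use_expanding=[True, False, True, False, True])
   pd.DataFrame({"values": range(5)}).rolling(indexer).sum()

Extra keyword arguments such as ``use_expanding`` are stored as attributes by ``BaseIndexer.__init__``, which is why the subclass needs no ``__init__`` of its own.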
@@ -746,12 +752,12 @@ For example:

 .. ipython:: python

    df = pd.DataFrame(np.random.randn(1000, 4),
-                     index=pd.date_range('1/1/2000', periods=1000),
-                     columns=['A', 'B', 'C', 'D'])
+                     index=pd.date_range("1/1/2000", periods=1000),
+                     columns=["A", "B", "C", "D"])
    df = df.cumsum()

    df2 = df[:20]
-   df2.rolling(window=5).corr(df2['B'])
+   df2.rolling(window=5).corr(df2["B"])

 .. _stats.moments.corr_pairwise:
@@ -776,14 +782,14 @@ can even be omitted:

 .. ipython:: python

-   covs = (df[['B', 'C', 'D']].rolling(window=50)
-           .cov(df[['A', 'B', 'C']], pairwise=True))
-   covs.loc['2002-09-22':]
+   covs = (df[["B", "C", "D"]].rolling(window=50)
+           .cov(df[["A", "B", "C"]], pairwise=True))
+   covs.loc["2002-09-22":]

 .. ipython:: python

    correls = df.rolling(window=50).corr()
-   correls.loc['2002-09-22':]
+   correls.loc["2002-09-22":]

 You can efficiently retrieve the time series of correlations between two
@@ -791,12 +797,12 @@ columns by reshaping and indexing:

 .. ipython:: python
    :suppress:

-   plt.close('all')
+   plt.close("all")

 .. ipython:: python

    @savefig rolling_corr_pairwise_ex.png
-   correls.unstack(1)[('A', 'C')].plot()
+   correls.unstack(1)[("A", "C")].plot()

 .. _stats.aggregate:
@@ -811,8 +817,8 @@ perform multiple computations on the data. These operations are similar to the :
 .. ipython:: python

    dfa = pd.DataFrame(np.random.randn(1000, 3),
-                      index=pd.date_range('1/1/2000', periods=1000),
-                      columns=['A', 'B', 'C'])
+                      index=pd.date_range("1/1/2000", periods=1000),
+                      columns=["A", "B", "C"])
    r = dfa.rolling(window=60, min_periods=1)
    r
@@ -823,9 +829,9 @@ Series (or multiple Series) via standard ``__getitem__``.

    r.aggregate(np.sum)

-   r['A'].aggregate(np.sum)
+   r["A"].aggregate(np.sum)

-   r[['A', 'B']].aggregate(np.sum)
+   r[["A", "B"]].aggregate(np.sum)

 As you can see, the result of the aggregation will have the selected columns,
 or all columns if none are selected.
@@ -840,7 +846,7 @@ aggregation with, outputting a DataFrame:

 .. ipython:: python

-   r['A'].agg([np.sum, np.mean, np.std])
+   r["A"].agg([np.sum, np.mean, np.std])

 On a windowed DataFrame, you can pass a list of functions to apply to each
 column, which produces an aggregated result with a hierarchical index:
@@ -860,20 +866,20 @@ columns of a ``DataFrame``:

 .. ipython:: python

-   r.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)})
+   r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)})

 The function names can also be strings. In order for a string to be valid it
 must be implemented on the windowed object

 .. ipython:: python

-   r.agg({'A': 'sum', 'B': 'std'})
+   r.agg({"A": "sum", "B": "std"})

 Furthermore you can pass a nested dict to indicate different aggregations on
 different columns.

 .. ipython:: python

-   r.agg({'A': ['sum', 'std'], 'B': ['mean', 'std']})
+   r.agg({"A": ["sum", "std"], "B": ["mean", "std"]})

 .. _stats.moments.expanding:
@@ -967,7 +973,7 @@ all accept are:

    sn.expanding().sum()
    sn.cumsum()
-   sn.cumsum().fillna(method='ffill')
+   sn.cumsum().fillna(method="ffill")


 An expanding window statistic will be more stable (and less responsive) than
@@ -978,14 +984,14 @@ relative impact of an individual data point. As an example, here is the

 .. ipython:: python
    :suppress:

-   plt.close('all')
+   plt.close("all")

 .. ipython:: python

-   s.plot(style='k--')
+   s.plot(style="k--")

    @savefig expanding_mean_frame.png
-   s.expanding().mean().plot(style='k')
+   s.expanding().mean().plot(style="k")

 .. _stats.moments.exponentially_weighted:
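The expanding-window hunks above lean on the equivalence the surrounding section states: an expanding statistic is a rolling statistic whose window is the entire history seen so far. A quick check of that claim on hypothetical data (``np.allclose`` rather than exact equality, to stay robust to floating-point noise):

.. ipython:: python

   import numpy as np
   import pandas as pd

   x = pd.Series(np.random.randn(100)).cumsum()
   a = x.expanding(min_periods=1).mean()
   b = x.rolling(window=len(x), min_periods=1).mean()  # full-length window
   np.allclose(a, b)  # expected: True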
@@ -1115,10 +1121,10 @@ of ``times``.

 .. ipython:: python

-   df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]})
+   df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]})
    df
-   times = ['2020-01-01', '2020-01-03', '2020-01-10', '2020-01-15', '2020-01-17']
-   df.ewm(halflife='4 days', times=pd.DatetimeIndex(times)).mean()
+   times = ["2020-01-01", "2020-01-03", "2020-01-10", "2020-01-15", "2020-01-17"]
+   df.ewm(halflife="4 days", times=pd.DatetimeIndex(times)).mean()

 The following formula is used to compute exponentially weighted mean with an
 input vector of times:
@@ -1130,10 +1136,10 @@ Here is an example for a univariate time series:

 .. ipython:: python

-   s.plot(style='k--')
+   s.plot(style="k--")

    @savefig ewma_ex.png
-   s.ewm(span=20).mean().plot(style='k')
+   s.ewm(span=20).mean().plot(style="k")

 ExponentialMovingWindow has a ``min_periods`` argument, which has the same
 meaning it does for all the ``.expanding`` and ``.rolling`` methods:

From aa2a00baa3017468bbc822a4aba71ea55b9d459e Mon Sep 17 00:00:00 2001
From: dezmond22
Date: Sat, 3 Oct 2020 14:53:31 +0300
Subject: [PATCH 2/2] DOC: update code style for remaining intro tutorial docs
 for #36777

computation.rst
---
 doc/source/user_guide/computation.rst | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst
index 2836cc69c2916..0f323f7796433 100644
--- a/doc/source/user_guide/computation.rst
+++ b/doc/source/user_guide/computation.rst
@@ -523,16 +523,16 @@ Using a non-regular, but still monotonic index, rolling with an integer window d
 .. ipython:: python

    dft = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]},
-       index=pd.Index(
-           [
-               pd.Timestamp("20130101 09:00:00"),
-               pd.Timestamp("20130101 09:00:02"),
-               pd.Timestamp("20130101 09:00:03"),
-               pd.Timestamp("20130101 09:00:05"),
-               pd.Timestamp("20130101 09:00:06")
-           ],
-           name="foo")
-       )
+                      index=pd.Index(
+                          [
+                              pd.Timestamp("20130101 09:00:00"),
+                              pd.Timestamp("20130101 09:00:02"),
+                              pd.Timestamp("20130101 09:00:03"),
+                              pd.Timestamp("20130101 09:00:05"),
+                              pd.Timestamp("20130101 09:00:06")
+                          ],
+                          name="foo")
+                      )

    dft
    dft.rolling(2).sum()
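The times-based EWM example touched by the first patch relies on the formula referenced between its last two hunks: each observation is weighted by ``0.5 ** (elapsed / halflife)`` relative to the evaluation time. A sketch reproducing the library result by hand; the helper name is hypothetical, and the NaN row of the docs' frame is dropped to keep the loop simple:

.. ipython:: python

   import numpy as np
   import pandas as pd

   values = pd.Series([0.0, 1.0, 2.0, 4.0])  # the docs' "B" column without its NaN
   times = pd.DatetimeIndex(["2020-01-01", "2020-01-03", "2020-01-10", "2020-01-17"])
   halflife = pd.Timedelta("4 days")

   def manual_ewm_mean(x, t, hl):
       out = []
       for j in range(len(x)):
           # weight of observation i when evaluating at time j
           w = np.asarray(0.5 ** ((t[j] - t[: j + 1]) / hl), dtype=float)
           out.append(np.dot(w, x[: j + 1]) / w.sum())
       return pd.Series(out)

   lib = values.ewm(halflife=halflife, times=times).mean()
   np.allclose(manual_ewm_mean(values.to_numpy(), times, halflife), lib)  # expected: True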