
DOC: Improve reshape\concat #47061


Merged · 20 commits · May 27, 2022
Changes from 4 commits
82 changes: 26 additions & 56 deletions asv_bench/benchmarks/gil.py
@@ -1,6 +1,3 @@
from functools import wraps
import threading

import numpy as np

from pandas import (
@@ -33,57 +30,21 @@
from pandas._libs import algos
except ImportError:
from pandas import algos
try:
from pandas._testing import test_parallel # noqa: PDF014

have_real_test_parallel = True
except ImportError:
have_real_test_parallel = False

from .pandas_vb_common import BaseIO # isort:skip


def test_parallel(num_threads=2, kwargs_list=None):
"""
Decorator to run the same function multiple times in parallel.

Parameters
----------
num_threads : int, optional
The number of times the function is run in parallel.
kwargs_list : list of dicts, optional
The list of kwargs to update original
function kwargs on different threads.

Notes
-----
This decorator does not pass the return value of the decorated function.

Original from scikit-image:

https://github.com/scikit-image/scikit-image/pull/1519

"""
assert num_threads > 0
has_kwargs_list = kwargs_list is not None
if has_kwargs_list:
assert len(kwargs_list) == num_threads
def test_parallel(num_threads=1):
def wrapper(fname):
return fname

def wrapper(func):
@wraps(func)
def inner(*args, **kwargs):
if has_kwargs_list:
update_kwargs = lambda i: dict(kwargs, **kwargs_list[i])
else:
update_kwargs = lambda i: kwargs
threads = []
for i in range(num_threads):
updated_kwargs = update_kwargs(i)
thread = threading.Thread(target=func, args=args, kwargs=updated_kwargs)
threads.append(thread)
for thread in threads:
thread.start()
for thread in threads:
thread.join()
return wrapper

return inner

return wrapper
from .pandas_vb_common import BaseIO # isort:skip


class ParallelGroupbyMethods:
@@ -92,7 +53,8 @@ class ParallelGroupbyMethods:
param_names = ["threads", "method"]

def setup(self, threads, method):

if not have_real_test_parallel:
raise NotImplementedError
N = 10**6
ngroups = 10**3
df = DataFrame(
@@ -124,7 +86,8 @@ class ParallelGroups:
param_names = ["threads"]

def setup(self, threads):

if not have_real_test_parallel:
raise NotImplementedError
size = 2**22
ngroups = 10**3
data = Series(np.random.randint(0, ngroups, size=size))
@@ -145,7 +108,8 @@ class ParallelTake1D:
param_names = ["dtype"]

def setup(self, dtype):

if not have_real_test_parallel:
raise NotImplementedError
N = 10**6
df = DataFrame({"col": np.arange(N, dtype=dtype)})
indexer = np.arange(100, len(df) - 100)
@@ -167,7 +131,8 @@ class ParallelKth:
repeat = 5

def setup(self):

if not have_real_test_parallel:
raise NotImplementedError
N = 10**7
k = 5 * 10**5
kwargs_list = [{"arr": np.random.randn(N)}, {"arr": np.random.randn(N)}]
@@ -184,7 +149,8 @@ def time_kth_smallest(self):

class ParallelDatetimeFields:
def setup(self):

if not have_real_test_parallel:
raise NotImplementedError
N = 10**6
self.dti = date_range("1900-01-01", periods=N, freq="T")
self.period = self.dti.to_period("D")
@@ -238,7 +204,8 @@ class ParallelRolling:
param_names = ["method"]

def setup(self, method):

if not have_real_test_parallel:
raise NotImplementedError
win = 100
arr = np.random.rand(100000)
if hasattr(DataFrame, "rolling"):
@@ -281,7 +248,8 @@ class ParallelReadCSV(BaseIO):
param_names = ["dtype"]

def setup(self, dtype):

if not have_real_test_parallel:
raise NotImplementedError
rows = 10000
cols = 50
data = {
@@ -316,6 +284,8 @@ class ParallelFactorize:
param_names = ["threads"]

def setup(self, threads):
if not have_real_test_parallel:
raise NotImplementedError

strings = tm.makeStringIndex(100000)

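For context, the ``test_parallel`` helper moved around in this file is a decorator that runs the wrapped function in several threads at once and discards its return value; per the hunks above, each benchmark's ``setup`` raises ``NotImplementedError`` when ``have_real_test_parallel`` is false. A minimal runnable sketch of that pattern follows; the name ``run_parallel`` and the NumPy workload are illustrative, not taken from the PR:

    from functools import wraps
    import threading

    import numpy as np


    def run_parallel(num_threads=2, kwargs_list=None):
        # Run the decorated function num_threads times in parallel threads.
        # kwargs_list, if given, supplies per-thread keyword overrides.
        has_kwargs_list = kwargs_list is not None
        if has_kwargs_list:
            assert len(kwargs_list) == num_threads

        def wrapper(func):
            @wraps(func)
            def inner(*args, **kwargs):
                if has_kwargs_list:
                    update_kwargs = lambda i: dict(kwargs, **kwargs_list[i])
                else:
                    update_kwargs = lambda i: kwargs
                threads = [
                    threading.Thread(target=func, args=args, kwargs=update_kwargs(i))
                    for i in range(num_threads)
                ]
                for t in threads:
                    t.start()
                for t in threads:
                    t.join()

            return inner

        return wrapper


    @run_parallel(num_threads=2)
    def kth_smallest(arr):
        # NumPy releases the GIL in compiled code, so both threads can make progress.
        np.partition(arr, 5)


    kth_smallest(np.random.randn(10**6))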
@@ -154,11 +154,11 @@ The apply and combine steps are typically done together in pandas.

In the previous example, we explicitly selected the 2 columns first. If
not, the ``mean`` method is applied to each column containing numerical
columns by passing ``numeric_only=True``:
columns:

.. ipython:: python

titanic.groupby("Sex").mean(numeric_only=True)
titanic.groupby("Sex").mean()

It does not make much sense to get the average value of the ``Pclass``.
If we are only interested in the average age for each gender, the
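The getting-started hunk above toggles whether ``numeric_only=True`` is spelled out when averaging per group. A small sketch of the two unambiguous spellings, using an invented frame rather than the Titanic dataset from the docs:

    import pandas as pd

    df = pd.DataFrame(
        {
            "Sex": ["female", "male", "female", "male"],
            "Age": [29.0, 35.0, 22.0, 41.0],
            "Fare": [71.3, 8.05, 7.25, 13.0],
            "Name": ["a", "b", "c", "d"],  # non-numeric "nuisance" column
        }
    )

    # Restrict the aggregation to numeric columns explicitly ...
    print(df.groupby("Sex").mean(numeric_only=True))

    # ... or select the columns of interest before aggregating.
    print(df.groupby("Sex")[["Age", "Fare"]].mean())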
2 changes: 1 addition & 1 deletion doc/source/user_guide/10min.rst
@@ -532,7 +532,7 @@ groups:

.. ipython:: python

df.groupby("A")[["C", "D"]].sum()
df.groupby("A").sum()

Grouping by multiple columns forms a hierarchical index, and again we can
apply the :meth:`~pandas.core.groupby.GroupBy.sum` function:
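As a quick illustration of the surrounding 10min text, grouping by several columns produces a hierarchical (MultiIndex) result. A toy sketch with invented data:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar"],
            "B": ["one", "one", "two", "two"],
            "C": np.arange(4.0),
            "D": np.arange(4.0) * 10,
        }
    )

    result = df.groupby(["A", "B"]).sum()
    print(result.index)  # MultiIndex with levels "A" and "B"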
26 changes: 10 additions & 16 deletions doc/source/user_guide/groupby.rst
@@ -477,7 +477,7 @@ An obvious one is aggregation via the
.. ipython:: python

grouped = df.groupby("A")
grouped[["C", "D"]].aggregate(np.sum)
grouped.aggregate(np.sum)

grouped = df.groupby(["A", "B"])
grouped.aggregate(np.sum)
@@ -492,7 +492,7 @@ changed by using the ``as_index`` option:
grouped = df.groupby(["A", "B"], as_index=False)
grouped.aggregate(np.sum)

df.groupby("A", as_index=False)[["C", "D"]].sum()
df.groupby("A", as_index=False).sum()

Note that you could use the ``reset_index`` DataFrame function to achieve the
same result as the column names are stored in the resulting ``MultiIndex``:
@@ -730,7 +730,7 @@ optimized Cython implementations:

.. ipython:: python

df.groupby("A")[["C", "D"]].sum()
df.groupby("A").sum()
df.groupby(["A", "B"]).mean()

Of course ``sum`` and ``mean`` are implemented on pandas objects, so the above
@@ -1159,12 +1159,13 @@ Again consider the example DataFrame we've been looking at:

Suppose we wish to compute the standard deviation grouped by the ``A``
column. There is a slight problem, namely that we don't care about the data in
column ``B``. We refer to this as a "nuisance" column. You can avoid nuisance
columns by specifying ``numeric_only=True``:
column ``B``. We refer to this as a "nuisance" column. If the passed
aggregation function can't be applied to some columns, the troublesome columns
will be (silently) dropped. Thus, this does not pose any problems:

.. ipython:: python

df.groupby("A").std(numeric_only=True)
df.groupby("A").std()

Note that ``df.groupby('A').colname.std().`` is more efficient than
``df.groupby('A').std().colname``, so if the result of an aggregation function
@@ -1179,14 +1180,7 @@ is only interesting over one column (here ``colname``), it may be filtered
If you do wish to include decimal or object columns in an aggregation with
other non-nuisance data types, you must do so explicitly.

.. warning::
The automatic dropping of nuisance columns has been deprecated and will be removed
in a future version of pandas. If columns are included that cannot be operated
on, pandas will instead raise an error. In order to avoid this, either select
the columns you wish to operate on or specify ``numeric_only=True``.

.. ipython:: python
:okwarning:

from decimal import Decimal

@@ -1310,7 +1304,7 @@ Groupby a specific column with the desired frequency. This is like resampling.

.. ipython:: python

df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"])[["Quantity"]].sum()
df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum()

You have an ambiguous specification in that you have a named index and a column
that could be potential groupers.
@@ -1319,9 +1313,9 @@ that could be potential groupers.

df = df.set_index("Date")
df["Date"] = df.index + pd.offsets.MonthEnd(2)
df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"])[["Quantity"]].sum()
df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"]).sum()

df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"])[["Quantity"]].sum()
df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"]).sum()


Taking the first rows of each group
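The last groupby.rst hunks switch between summing every aggregable column and explicitly selecting ``Quantity`` when grouping with ``pd.Grouper``. A runnable sketch of the pattern; column names mirror the docs, the data is invented:

    import pandas as pd

    df = pd.DataFrame(
        {
            "Date": pd.date_range("2022-01-01", periods=6, freq="15D"),
            "Buyer": ["Carl", "Mark", "Carl", "Carl", "Mark", "Carl"],
            "Quantity": [18, 3, 5, 1, 9, 3],
        }
    )

    # Grouper buckets the datetime column by frequency while "Buyer" is grouped
    # as an ordinary key -- effectively a per-buyer resample.
    monthly = df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"])[["Quantity"]].sum()
    print(monthly)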
2 changes: 1 addition & 1 deletion doc/source/user_guide/indexing.rst
@@ -583,7 +583,7 @@ without using a temporary variable.
.. ipython:: python

bb = pd.read_csv('data/baseball.csv', index_col='id')
(bb.groupby(['year', 'team']).sum(numeric_only=True)
(bb.groupby(['year', 'team']).sum()
.loc[lambda df: df['r'] > 100])


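The indexing hunk above changes only whether ``numeric_only=True`` appears in the chain; the point of the docs example is filtering an aggregated result with a callable passed to ``.loc`` without a temporary variable. A self-contained sketch in which the baseball CSV is replaced by an invented frame:

    import pandas as pd

    bb = pd.DataFrame(
        {
            "year": [2006, 2006, 2007, 2007],
            "team": ["CHN", "CHN", "BOS", "BOS"],
            "r": [60, 70, 55, 30],
        }
    )

    out = (
        bb.groupby(["year", "team"]).sum(numeric_only=True)
        .loc[lambda df: df["r"] > 100]
    )
    print(out)  # keeps only (2006, CHN), whose summed "r" exceeds 100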
15 changes: 5 additions & 10 deletions doc/source/user_guide/reshaping.rst
@@ -414,11 +414,12 @@ We can produce pivot tables from this data very easily:

The result object is a :class:`DataFrame` having potentially hierarchical indexes on the
rows and columns. If the ``values`` column name is not given, the pivot table
will include all of the data in an additional level of hierarchy in the columns:
will include all of the data that can be aggregated in an additional level of
hierarchy in the columns:

.. ipython:: python

pd.pivot_table(df[["A", "B", "C", "D", "E"]], index=["A", "B"], columns=["C"])
pd.pivot_table(df, index=["A", "B"], columns=["C"])

Also, you can use :class:`Grouper` for ``index`` and ``columns`` keywords. For detail of :class:`Grouper`, see :ref:`Grouping with a Grouper specification <groupby.specify>`.

@@ -431,7 +432,7 @@ calling :meth:`~DataFrame.to_string` if you wish:

.. ipython:: python

table = pd.pivot_table(df, index=["A", "B"], columns=["C"], values=["D", "E"])
table = pd.pivot_table(df, index=["A", "B"], columns=["C"])
print(table.to_string(na_rep=""))

Note that :meth:`~DataFrame.pivot_table` is also available as an instance method on DataFrame,
@@ -448,13 +449,7 @@ rows and columns:

.. ipython:: python

table = df.pivot_table(
index=["A", "B"],
columns="C",
values=["D", "E"],
margins=True,
aggfunc=np.std
)
table = df.pivot_table(index=["A", "B"], columns="C", margins=True, aggfunc=np.std)
table

Additionally, you can call :meth:`DataFrame.stack` to display a pivoted DataFrame
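The reshaping.rst hunks add or drop the explicit ``values=["D", "E"]`` selection in the ``pivot_table`` calls. A sketch with invented data showing how ``values``, ``margins`` and ``aggfunc`` interact; column names mirror the docs:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "A": ["one", "one", "two", "two"] * 3,
            "B": ["x", "y", "z"] * 4,
            "C": ["foo", "bar"] * 6,
            "D": np.random.randn(12),
            "E": np.random.randn(12),
        }
    )

    # Only the listed `values` columns are aggregated; margins=True adds an
    # "All" row/column computed with the same aggfunc (here the standard deviation).
    table = df.pivot_table(
        index=["A", "B"], columns="C", values=["D", "E"], margins=True, aggfunc=np.std
    )
    print(table)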
4 changes: 2 additions & 2 deletions doc/source/user_guide/timeseries.rst
@@ -1821,15 +1821,15 @@ to resample based on datetimelike column in the frame, it can passed to the
),
)
df
df.resample("M", on="date")[["a"]].sum()
df.resample("M", on="date").sum()

Similarly, if you instead want to resample by a datetimelike
level of ``MultiIndex``, its name or location can be passed to the
``level`` keyword.

.. ipython:: python

df.resample("M", level="d")[["a"]].sum()
df.resample("M", level="d").sum()

.. _timeseries.iterating-label:

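The timeseries.rst hunk changes whether column ``a`` is selected explicitly before summing. A runnable sketch of resampling on a datetime-like column versus a datetime-like ``MultiIndex`` level; the data is invented but mirrors the shape of the docs example:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "date": pd.date_range("2015-01-01", freq="W", periods=5),
            "a": np.arange(5),
        },
        index=pd.MultiIndex.from_arrays(
            [np.arange(5), pd.date_range("2015-01-01", freq="W", periods=5)],
            names=["v", "d"],
        ),
    )

    # Resample on a datetime-like column ...
    print(df.resample("M", on="date")[["a"]].sum())

    # ... or on a datetime-like level of the MultiIndex.
    print(df.resample("M", level="d")[["a"]].sum())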
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.18.1.rst
@@ -166,7 +166,7 @@ without using temporary variable.
.. ipython:: python

bb = pd.read_csv("data/baseball.csv", index_col="id")
(bb.groupby(["year", "team"]).sum(numeric_only=True).loc[lambda df: df.r > 100])
(bb.groupby(["year", "team"]).sum().loc[lambda df: df.r > 100])

.. _whatsnew_0181.partial_string_indexing:

4 changes: 2 additions & 2 deletions doc/source/whatsnew/v0.19.0.rst
@@ -497,8 +497,8 @@ Other enhancements
),
)
df
df.resample("M", on="date")[["a"]].sum()
df.resample("M", level="d")[["a"]].sum()
df.resample("M", on="date").sum()
df.resample("M", level="d").sum()

- The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials <https://developers.google.com/identity/protocols/application-default-credentials>`__. See the docs for more details (:issue:`13577`).
- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behavior remains to raising a ``NonExistentTimeError`` (:issue:`13057`)
1 change: 0 additions & 1 deletion doc/source/whatsnew/v1.4.3.rst
@@ -16,7 +16,6 @@ Fixed regressions
~~~~~~~~~~~~~~~~~
- Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when ``np.nan`` in the sorting column (:issue:`46589`)
- Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`)
- Fixed regression in :meth:`.Groupby.transform` and :meth:`.Groupby.agg` failing with ``engine="numba"`` when the index was a :class:`MultiIndex` (:issue:`46867`)
- Fixed regression is :meth:`.Styler.to_latex` and :meth:`.Styler.to_html` where ``buf`` failed in combination with ``encoding`` (:issue:`47053`)

.. ---------------------------------------------------------------------------