Skip to content

Commit 824b9bd

Browse files
authored
Merge branch 'pandas-dev:main' into doc-concat
2 parents 982a839 + 1be9d38 commit 824b9bd

File tree

14 files changed

+156
-101
lines changed

14 files changed

+156
-101
lines changed

asv_bench/benchmarks/gil.py

Lines changed: 56 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
from functools import wraps
2+
import threading
3+
14
import numpy as np
25

36
from pandas import (
@@ -30,21 +33,57 @@
3033
from pandas._libs import algos
3134
except ImportError:
3235
from pandas import algos
33-
try:
34-
from pandas._testing import test_parallel # noqa: PDF014
3536

36-
have_real_test_parallel = True
37-
except ImportError:
38-
have_real_test_parallel = False
3937

40-
def test_parallel(num_threads=1):
41-
def wrapper(fname):
42-
return fname
38+
from .pandas_vb_common import BaseIO # isort:skip
4339

44-
return wrapper
4540

41+
def test_parallel(num_threads=2, kwargs_list=None):
42+
"""
43+
Decorator to run the same function multiple times in parallel.
4644
47-
from .pandas_vb_common import BaseIO # isort:skip
45+
Parameters
46+
----------
47+
num_threads : int, optional
48+
The number of times the function is run in parallel.
49+
kwargs_list : list of dicts, optional
50+
The list of kwargs to update original
51+
function kwargs on different threads.
52+
53+
Notes
54+
-----
55+
This decorator does not pass the return value of the decorated function.
56+
57+
Original from scikit-image:
58+
59+
https://github.com/scikit-image/scikit-image/pull/1519
60+
61+
"""
62+
assert num_threads > 0
63+
has_kwargs_list = kwargs_list is not None
64+
if has_kwargs_list:
65+
assert len(kwargs_list) == num_threads
66+
67+
def wrapper(func):
68+
@wraps(func)
69+
def inner(*args, **kwargs):
70+
if has_kwargs_list:
71+
update_kwargs = lambda i: dict(kwargs, **kwargs_list[i])
72+
else:
73+
update_kwargs = lambda i: kwargs
74+
threads = []
75+
for i in range(num_threads):
76+
updated_kwargs = update_kwargs(i)
77+
thread = threading.Thread(target=func, args=args, kwargs=updated_kwargs)
78+
threads.append(thread)
79+
for thread in threads:
80+
thread.start()
81+
for thread in threads:
82+
thread.join()
83+
84+
return inner
85+
86+
return wrapper
4887

4988

5089
class ParallelGroupbyMethods:
@@ -53,8 +92,7 @@ class ParallelGroupbyMethods:
5392
param_names = ["threads", "method"]
5493

5594
def setup(self, threads, method):
56-
if not have_real_test_parallel:
57-
raise NotImplementedError
95+
5896
N = 10**6
5997
ngroups = 10**3
6098
df = DataFrame(
@@ -86,8 +124,7 @@ class ParallelGroups:
86124
param_names = ["threads"]
87125

88126
def setup(self, threads):
89-
if not have_real_test_parallel:
90-
raise NotImplementedError
127+
91128
size = 2**22
92129
ngroups = 10**3
93130
data = Series(np.random.randint(0, ngroups, size=size))
@@ -108,8 +145,7 @@ class ParallelTake1D:
108145
param_names = ["dtype"]
109146

110147
def setup(self, dtype):
111-
if not have_real_test_parallel:
112-
raise NotImplementedError
148+
113149
N = 10**6
114150
df = DataFrame({"col": np.arange(N, dtype=dtype)})
115151
indexer = np.arange(100, len(df) - 100)
@@ -131,8 +167,7 @@ class ParallelKth:
131167
repeat = 5
132168

133169
def setup(self):
134-
if not have_real_test_parallel:
135-
raise NotImplementedError
170+
136171
N = 10**7
137172
k = 5 * 10**5
138173
kwargs_list = [{"arr": np.random.randn(N)}, {"arr": np.random.randn(N)}]
@@ -149,8 +184,7 @@ def time_kth_smallest(self):
149184

150185
class ParallelDatetimeFields:
151186
def setup(self):
152-
if not have_real_test_parallel:
153-
raise NotImplementedError
187+
154188
N = 10**6
155189
self.dti = date_range("1900-01-01", periods=N, freq="T")
156190
self.period = self.dti.to_period("D")
@@ -204,8 +238,7 @@ class ParallelRolling:
204238
param_names = ["method"]
205239

206240
def setup(self, method):
207-
if not have_real_test_parallel:
208-
raise NotImplementedError
241+
209242
win = 100
210243
arr = np.random.rand(100000)
211244
if hasattr(DataFrame, "rolling"):
@@ -248,8 +281,7 @@ class ParallelReadCSV(BaseIO):
248281
param_names = ["dtype"]
249282

250283
def setup(self, dtype):
251-
if not have_real_test_parallel:
252-
raise NotImplementedError
284+
253285
rows = 10000
254286
cols = 50
255287
data = {
@@ -284,8 +316,6 @@ class ParallelFactorize:
284316
param_names = ["threads"]
285317

286318
def setup(self, threads):
287-
if not have_real_test_parallel:
288-
raise NotImplementedError
289319

290320
strings = tm.makeStringIndex(100000)
291321

doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,11 +154,11 @@ The apply and combine steps are typically done together in pandas.
154154

155155
In the previous example, we explicitly selected the 2 columns first. If
156156
not, the ``mean`` method is applied to each column containing numerical
157-
columns:
157+
columns by passing ``numeric_only=True``:
158158

159159
.. ipython:: python
160160
161-
titanic.groupby("Sex").mean()
161+
titanic.groupby("Sex").mean(numeric_only=True)
162162
163163
It does not make much sense to get the average value of the ``Pclass``.
164164
If we are only interested in the average age for each gender, the

doc/source/user_guide/10min.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -532,7 +532,7 @@ groups:
532532

533533
.. ipython:: python
534534
535-
df.groupby("A").sum()
535+
df.groupby("A")[["C", "D"]].sum()
536536
537537
Grouping by multiple columns forms a hierarchical index, and again we can
538538
apply the :meth:`~pandas.core.groupby.GroupBy.sum` function:

doc/source/user_guide/groupby.rst

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -477,7 +477,7 @@ An obvious one is aggregation via the
477477
.. ipython:: python
478478
479479
grouped = df.groupby("A")
480-
grouped.aggregate(np.sum)
480+
grouped[["C", "D"]].aggregate(np.sum)
481481
482482
grouped = df.groupby(["A", "B"])
483483
grouped.aggregate(np.sum)
@@ -492,7 +492,7 @@ changed by using the ``as_index`` option:
492492
grouped = df.groupby(["A", "B"], as_index=False)
493493
grouped.aggregate(np.sum)
494494
495-
df.groupby("A", as_index=False).sum()
495+
df.groupby("A", as_index=False)[["C", "D"]].sum()
496496
497497
Note that you could use the ``reset_index`` DataFrame function to achieve the
498498
same result as the column names are stored in the resulting ``MultiIndex``:
@@ -730,7 +730,7 @@ optimized Cython implementations:
730730

731731
.. ipython:: python
732732
733-
df.groupby("A").sum()
733+
df.groupby("A")[["C", "D"]].sum()
734734
df.groupby(["A", "B"]).mean()
735735
736736
Of course ``sum`` and ``mean`` are implemented on pandas objects, so the above
@@ -1159,13 +1159,12 @@ Again consider the example DataFrame we've been looking at:
11591159
11601160
Suppose we wish to compute the standard deviation grouped by the ``A``
11611161
column. There is a slight problem, namely that we don't care about the data in
1162-
column ``B``. We refer to this as a "nuisance" column. If the passed
1163-
aggregation function can't be applied to some columns, the troublesome columns
1164-
will be (silently) dropped. Thus, this does not pose any problems:
1162+
column ``B``. We refer to this as a "nuisance" column. You can avoid nuisance
1163+
columns by specifying ``numeric_only=True``:
11651164

11661165
.. ipython:: python
11671166
1168-
df.groupby("A").std()
1167+
df.groupby("A").std(numeric_only=True)
11691168
11701169
Note that ``df.groupby('A').colname.std().`` is more efficient than
11711170
``df.groupby('A').std().colname``, so if the result of an aggregation function
@@ -1180,7 +1179,14 @@ is only interesting over one column (here ``colname``), it may be filtered
11801179
If you do wish to include decimal or object columns in an aggregation with
11811180
other non-nuisance data types, you must do so explicitly.
11821181

1182+
.. warning::
1183+
The automatic dropping of nuisance columns has been deprecated and will be removed
1184+
in a future version of pandas. If columns are included that cannot be operated
1185+
on, pandas will instead raise an error. In order to avoid this, either select
1186+
the columns you wish to operate on or specify ``numeric_only=True``.
1187+
11831188
.. ipython:: python
1189+
:okwarning:
11841190
11851191
from decimal import Decimal
11861192
@@ -1304,7 +1310,7 @@ Groupby a specific column with the desired frequency. This is like resampling.
13041310

13051311
.. ipython:: python
13061312
1307-
df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum()
1313+
df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"])[["Quantity"]].sum()
13081314
13091315
You have an ambiguous specification in that you have a named index and a column
13101316
that could be potential groupers.
@@ -1313,9 +1319,9 @@ that could be potential groupers.
13131319
13141320
df = df.set_index("Date")
13151321
df["Date"] = df.index + pd.offsets.MonthEnd(2)
1316-
df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"]).sum()
1322+
df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"])[["Quantity"]].sum()
13171323
1318-
df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"]).sum()
1324+
df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"])[["Quantity"]].sum()
13191325
13201326
13211327
Taking the first rows of each group

doc/source/user_guide/indexing.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -583,7 +583,7 @@ without using a temporary variable.
583583
.. ipython:: python
584584
585585
bb = pd.read_csv('data/baseball.csv', index_col='id')
586-
(bb.groupby(['year', 'team']).sum()
586+
(bb.groupby(['year', 'team']).sum(numeric_only=True)
587587
.loc[lambda df: df['r'] > 100])
588588
589589

doc/source/user_guide/reshaping.rst

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -414,12 +414,11 @@ We can produce pivot tables from this data very easily:
414414
415415
The result object is a :class:`DataFrame` having potentially hierarchical indexes on the
416416
rows and columns. If the ``values`` column name is not given, the pivot table
417-
will include all of the data that can be aggregated in an additional level of
418-
hierarchy in the columns:
417+
will include all of the data in an additional level of hierarchy in the columns:
419418

420419
.. ipython:: python
421420
422-
pd.pivot_table(df, index=["A", "B"], columns=["C"])
421+
pd.pivot_table(df[["A", "B", "C", "D", "E"]], index=["A", "B"], columns=["C"])
423422
424423
Also, you can use :class:`Grouper` for ``index`` and ``columns`` keywords. For detail of :class:`Grouper`, see :ref:`Grouping with a Grouper specification <groupby.specify>`.
425424

@@ -432,7 +431,7 @@ calling :meth:`~DataFrame.to_string` if you wish:
432431

433432
.. ipython:: python
434433
435-
table = pd.pivot_table(df, index=["A", "B"], columns=["C"])
434+
table = pd.pivot_table(df, index=["A", "B"], columns=["C"], values=["D", "E"])
436435
print(table.to_string(na_rep=""))
437436
438437
Note that :meth:`~DataFrame.pivot_table` is also available as an instance method on DataFrame,
@@ -449,7 +448,13 @@ rows and columns:
449448

450449
.. ipython:: python
451450
452-
table = df.pivot_table(index=["A", "B"], columns="C", margins=True, aggfunc=np.std)
451+
table = df.pivot_table(
452+
index=["A", "B"],
453+
columns="C",
454+
values=["D", "E"],
455+
margins=True,
456+
aggfunc=np.std
457+
)
453458
table
454459
455460
Additionally, you can call :meth:`DataFrame.stack` to display a pivoted DataFrame

doc/source/user_guide/timeseries.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1821,15 +1821,15 @@ to resample based on datetimelike column in the frame, it can passed to the
18211821
),
18221822
)
18231823
df
1824-
df.resample("M", on="date").sum()
1824+
df.resample("M", on="date")[["a"]].sum()
18251825
18261826
Similarly, if you instead want to resample by a datetimelike
18271827
level of ``MultiIndex``, its name or location can be passed to the
18281828
``level`` keyword.
18291829

18301830
.. ipython:: python
18311831
1832-
df.resample("M", level="d").sum()
1832+
df.resample("M", level="d")[["a"]].sum()
18331833
18341834
.. _timeseries.iterating-label:
18351835

doc/source/whatsnew/v0.18.1.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ without using temporary variable.
166166
.. ipython:: python
167167
168168
bb = pd.read_csv("data/baseball.csv", index_col="id")
169-
(bb.groupby(["year", "team"]).sum().loc[lambda df: df.r > 100])
169+
(bb.groupby(["year", "team"]).sum(numeric_only=True).loc[lambda df: df.r > 100])
170170
171171
.. _whatsnew_0181.partial_string_indexing:
172172

doc/source/whatsnew/v0.19.0.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -497,8 +497,8 @@ Other enhancements
497497
),
498498
)
499499
df
500-
df.resample("M", on="date").sum()
501-
df.resample("M", level="d").sum()
500+
df.resample("M", on="date")[["a"]].sum()
501+
df.resample("M", level="d")[["a"]].sum()
502502
503503
- The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials <https://developers.google.com/identity/protocols/application-default-credentials>`__. See the docs for more details (:issue:`13577`).
504504
- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behavior remains to raising a ``NonExistentTimeError`` (:issue:`13057`)

doc/source/whatsnew/v1.4.3.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ Fixed regressions
1616
~~~~~~~~~~~~~~~~~
1717
- Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when ``np.nan`` in the sorting column (:issue:`46589`)
1818
- Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`)
19+
- Fixed regression in :meth:`.GroupBy.transform` and :meth:`.GroupBy.agg` failing with ``engine="numba"`` when the index was a :class:`MultiIndex` (:issue:`46867`)
1920
- Fixed regression in :meth:`.Styler.to_latex` and :meth:`.Styler.to_html` where ``buf`` failed in combination with ``encoding`` (:issue:`47053`)
2021

2122
.. ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)