Skip to content

Commit 824b9bd

Browse files
authored
Merge branch 'pandas-dev:main' into doc-concat
2 parents 982a839 + 1be9d38 commit 824b9bd

File tree

14 files changed

+156
-101
lines changed

14 files changed

+156
-101
lines changed

asv_bench/benchmarks/gil.py

Lines changed: 56 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
from functools import wraps
2+
import threading
3+
14
import numpy as np
25

36
from pandas import (
@@ -30,21 +33,57 @@
3033
from pandas._libs import algos
3134
except ImportError:
3235
from pandas import algos
33-
try:
34-
from pandas._testing import test_parallel # noqa: PDF014
3536

36-
have_real_test_parallel = True
37-
except ImportError:
38-
have_real_test_parallel = False
3937

40-
def test_parallel(num_threads=1):
41-
def wrapper(fname):
42-
return fname
38+
from .pandas_vb_common import BaseIO # isort:skip
4339

44-
return wrapper
4540

41+
def test_parallel(num_threads=2, kwargs_list=None):
42+
"""
43+
Decorator to run the same function multiple times in parallel.
4644
47-
from .pandas_vb_common import BaseIO # isort:skip
45+
Parameters
46+
----------
47+
num_threads : int, optional
48+
The number of times the function is run in parallel.
49+
kwargs_list : list of dicts, optional
50+
The list of kwargs to update original
51+
function kwargs on different threads.
52+
53+
Notes
54+
-----
55+
This decorator does not pass the return value of the decorated function.
56+
57+
Original from scikit-image:
58+
59+
https://github.com/scikit-image/scikit-image/pull/1519
60+
61+
"""
62+
assert num_threads > 0
63+
has_kwargs_list = kwargs_list is not None
64+
if has_kwargs_list:
65+
assert len(kwargs_list) == num_threads
66+
67+
def wrapper(func):
68+
@wraps(func)
69+
def inner(*args, **kwargs):
70+
if has_kwargs_list:
71+
update_kwargs = lambda i: dict(kwargs, **kwargs_list[i])
72+
else:
73+
update_kwargs = lambda i: kwargs
74+
threads = []
75+
for i in range(num_threads):
76+
updated_kwargs = update_kwargs(i)
77+
thread = threading.Thread(target=func, args=args, kwargs=updated_kwargs)
78+
threads.append(thread)
79+
for thread in threads:
80+
thread.start()
81+
for thread in threads:
82+
thread.join()
83+
84+
return inner
85+
86+
return wrapper
4887

4988

5089
class ParallelGroupbyMethods:
@@ -53,8 +92,7 @@ class ParallelGroupbyMethods:
5392
param_names = ["threads", "method"]
5493

5594
def setup(self, threads, method):
56-
if not have_real_test_parallel:
57-
raise NotImplementedError
95+
5896
N = 10**6
5997
ngroups = 10**3
6098
df = DataFrame(
@@ -86,8 +124,7 @@ class ParallelGroups:
86124
param_names = ["threads"]
87125

88126
def setup(self, threads):
89-
if not have_real_test_parallel:
90-
raise NotImplementedError
127+
91128
size = 2**22
92129
ngroups = 10**3
93130
data = Series(np.random.randint(0, ngroups, size=size))
@@ -108,8 +145,7 @@ class ParallelTake1D:
108145
param_names = ["dtype"]
109146

110147
def setup(self, dtype):
111-
if not have_real_test_parallel:
112-
raise NotImplementedError
148+
113149
N = 10**6
114150
df = DataFrame({"col": np.arange(N, dtype=dtype)})
115151
indexer = np.arange(100, len(df) - 100)
@@ -131,8 +167,7 @@ class ParallelKth:
131167
repeat = 5
132168

133169
def setup(self):
134-
if not have_real_test_parallel:
135-
raise NotImplementedError
170+
136171
N = 10**7
137172
k = 5 * 10**5
138173
kwargs_list = [{"arr": np.random.randn(N)}, {"arr": np.random.randn(N)}]
@@ -149,8 +184,7 @@ def time_kth_smallest(self):
149184

150185
class ParallelDatetimeFields:
151186
def setup(self):
152-
if not have_real_test_parallel:
153-
raise NotImplementedError
187+
154188
N = 10**6
155189
self.dti = date_range("1900-01-01", periods=N, freq="T")
156190
self.period = self.dti.to_period("D")
@@ -204,8 +238,7 @@ class ParallelRolling:
204238
param_names = ["method"]
205239

206240
def setup(self, method):
207-
if not have_real_test_parallel:
208-
raise NotImplementedError
241+
209242
win = 100
210243
arr = np.random.rand(100000)
211244
if hasattr(DataFrame, "rolling"):
@@ -248,8 +281,7 @@ class ParallelReadCSV(BaseIO):
248281
param_names = ["dtype"]
249282

250283
def setup(self, dtype):
251-
if not have_real_test_parallel:
252-
raise NotImplementedError
284+
253285
rows = 10000
254286
cols = 50
255287
data = {
@@ -284,8 +316,6 @@ class ParallelFactorize:
284316
param_names = ["threads"]
285317

286318
def setup(self, threads):
287-
if not have_real_test_parallel:
288-
raise NotImplementedError
289319

290320
strings = tm.makeStringIndex(100000)
291321

doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,11 +154,11 @@ The apply and combine steps are typically done together in pandas.
154154

155155
In the previous example, we explicitly selected the 2 columns first. If
156156
not, the ``mean`` method is applied to each column containing numerical
157-
columns:
157+
columns by passing ``numeric_only=True``:
158158

159159
.. ipython:: python
160160
161-
titanic.groupby("Sex").mean()
161+
titanic.groupby("Sex").mean(numeric_only=True)
162162
163163
It does not make much sense to get the average value of the ``Pclass``.
164164
If we are only interested in the average age for each gender, the

doc/source/user_guide/10min.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -532,7 +532,7 @@ groups:
532532

533533
.. ipython:: python
534534
535-
df.groupby("A").sum()
535+
df.groupby("A")[["C", "D"]].sum()
536536
537537
Grouping by multiple columns forms a hierarchical index, and again we can
538538
apply the :meth:`~pandas.core.groupby.GroupBy.sum` function:

doc/source/user_guide/groupby.rst

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -477,7 +477,7 @@ An obvious one is aggregation via the
477477
.. ipython:: python
478478
479479
grouped = df.groupby("A")
480-
grouped.aggregate(np.sum)
480+
grouped[["C", "D"]].aggregate(np.sum)
481481
482482
grouped = df.groupby(["A", "B"])
483483
grouped.aggregate(np.sum)
@@ -492,7 +492,7 @@ changed by using the ``as_index`` option:
492492
grouped = df.groupby(["A", "B"], as_index=False)
493493
grouped.aggregate(np.sum)
494494
495-
df.groupby("A", as_index=False).sum()
495+
df.groupby("A", as_index=False)[["C", "D"]].sum()
496496
497497
Note that you could use the ``reset_index`` DataFrame function to achieve the
498498
same result as the column names are stored in the resulting ``MultiIndex``:
@@ -730,7 +730,7 @@ optimized Cython implementations:
730730

731731
.. ipython:: python
732732
733-
df.groupby("A").sum()
733+
df.groupby("A")[["C", "D"]].sum()
734734
df.groupby(["A", "B"]).mean()
735735
736736
Of course ``sum`` and ``mean`` are implemented on pandas objects, so the above
@@ -1159,13 +1159,12 @@ Again consider the example DataFrame we've been looking at:
11591159
11601160
Suppose we wish to compute the standard deviation grouped by the ``A``
11611161
column. There is a slight problem, namely that we don't care about the data in
1162-
column ``B``. We refer to this as a "nuisance" column. If the passed
1163-
aggregation function can't be applied to some columns, the troublesome columns
1164-
will be (silently) dropped. Thus, this does not pose any problems:
1162+
column ``B``. We refer to this as a "nuisance" column. You can avoid nuisance
1163+
columns by specifying ``numeric_only=True``:
11651164

11661165
.. ipython:: python
11671166
1168-
df.groupby("A").std()
1167+
df.groupby("A").std(numeric_only=True)
11691168
11701169
Note that ``df.groupby('A').colname.std().`` is more efficient than
11711170
``df.groupby('A').std().colname``, so if the result of an aggregation function
@@ -1180,7 +1179,14 @@ is only interesting over one column (here ``colname``), it may be filtered
11801179
If you do wish to include decimal or object columns in an aggregation with
11811180
other non-nuisance data types, you must do so explicitly.
11821181

1182+
.. warning::
1183+
The automatic dropping of nuisance columns has been deprecated and will be removed
1184+
in a future version of pandas. If columns are included that cannot be operated
1185+
on, pandas will instead raise an error. In order to avoid this, either select
1186+
the columns you wish to operate on or specify ``numeric_only=True``.
1187+
11831188
.. ipython:: python
1189+
:okwarning:
11841190
11851191
from decimal import Decimal
11861192
@@ -1304,7 +1310,7 @@ Groupby a specific column with the desired frequency. This is like resampling.
13041310

13051311
.. ipython:: python
13061312
1307-
df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum()
1313+
df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"])[["Quantity"]].sum()
13081314
13091315
You have an ambiguous specification in that you have a named index and a column
13101316
that could be potential groupers.
@@ -1313,9 +1319,9 @@ that could be potential groupers.
13131319
13141320
df = df.set_index("Date")
13151321
df["Date"] = df.index + pd.offsets.MonthEnd(2)
1316-
df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"]).sum()
1322+
df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"])[["Quantity"]].sum()
13171323
1318-
df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"]).sum()
1324+
df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"])[["Quantity"]].sum()
13191325
13201326
13211327
Taking the first rows of each group

doc/source/user_guide/indexing.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -583,7 +583,7 @@ without using a temporary variable.
583583
.. ipython:: python
584584
585585
bb = pd.read_csv('data/baseball.csv', index_col='id')
586-
(bb.groupby(['year', 'team']).sum()
586+
(bb.groupby(['year', 'team']).sum(numeric_only=True)
587587
.loc[lambda df: df['r'] > 100])
588588
589589

doc/source/user_guide/reshaping.rst

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -414,12 +414,11 @@ We can produce pivot tables from this data very easily:
414414
415415
The result object is a :class:`DataFrame` having potentially hierarchical indexes on the
416416
rows and columns. If the ``values`` column name is not given, the pivot table
417-
will include all of the data that can be aggregated in an additional level of
418-
hierarchy in the columns:
417+
will include all of the data in an additional level of hierarchy in the columns:
419418

420419
.. ipython:: python
421420
422-
pd.pivot_table(df, index=["A", "B"], columns=["C"])
421+
pd.pivot_table(df[["A", "B", "C", "D", "E"]], index=["A", "B"], columns=["C"])
423422
424423
Also, you can use :class:`Grouper` for ``index`` and ``columns`` keywords. For detail of :class:`Grouper`, see :ref:`Grouping with a Grouper specification <groupby.specify>`.
425424

@@ -432,7 +431,7 @@ calling :meth:`~DataFrame.to_string` if you wish:
432431

433432
.. ipython:: python
434433
435-
table = pd.pivot_table(df, index=["A", "B"], columns=["C"])
434+
table = pd.pivot_table(df, index=["A", "B"], columns=["C"], values=["D", "E"])
436435
print(table.to_string(na_rep=""))
437436
438437
Note that :meth:`~DataFrame.pivot_table` is also available as an instance method on DataFrame,
@@ -449,7 +448,13 @@ rows and columns:
449448

450449
.. ipython:: python
451450
452-
table = df.pivot_table(index=["A", "B"], columns="C", margins=True, aggfunc=np.std)
451+
table = df.pivot_table(
452+
index=["A", "B"],
453+
columns="C",
454+
values=["D", "E"],
455+
margins=True,
456+
aggfunc=np.std
457+
)
453458
table
454459
455460
Additionally, you can call :meth:`DataFrame.stack` to display a pivoted DataFrame

doc/source/user_guide/timeseries.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1821,15 +1821,15 @@ to resample based on datetimelike column in the frame, it can passed to the
18211821
),
18221822
)
18231823
df
1824-
df.resample("M", on="date").sum()
1824+
df.resample("M", on="date")[["a"]].sum()
18251825
18261826
Similarly, if you instead want to resample by a datetimelike
18271827
level of ``MultiIndex``, its name or location can be passed to the
18281828
``level`` keyword.
18291829

18301830
.. ipython:: python
18311831
1832-
df.resample("M", level="d").sum()
1832+
df.resample("M", level="d")[["a"]].sum()
18331833
18341834
.. _timeseries.iterating-label:
18351835

doc/source/whatsnew/v0.18.1.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ without using temporary variable.
166166
.. ipython:: python
167167
168168
bb = pd.read_csv("data/baseball.csv", index_col="id")
169-
(bb.groupby(["year", "team"]).sum().loc[lambda df: df.r > 100])
169+
(bb.groupby(["year", "team"]).sum(numeric_only=True).loc[lambda df: df.r > 100])
170170
171171
.. _whatsnew_0181.partial_string_indexing:
172172

doc/source/whatsnew/v0.19.0.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -497,8 +497,8 @@ Other enhancements
497497
),
498498
)
499499
df
500-
df.resample("M", on="date").sum()
501-
df.resample("M", level="d").sum()
500+
df.resample("M", on="date")[["a"]].sum()
501+
df.resample("M", level="d")[["a"]].sum()
502502
503503
- The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials <https://developers.google.com/identity/protocols/application-default-credentials>`__. See the docs for more details (:issue:`13577`).
504504
- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behavior remains to raising a ``NonExistentTimeError`` (:issue:`13057`)

doc/source/whatsnew/v1.4.3.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ Fixed regressions
1616
~~~~~~~~~~~~~~~~~
1717
- Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when ``np.nan`` in the sorting column (:issue:`46589`)
1818
- Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`)
19+
- Fixed regression in :meth:`.GroupBy.transform` and :meth:`.GroupBy.agg` failing with ``engine="numba"`` when the index was a :class:`MultiIndex` (:issue:`46867`)
1920
- Fixed regression in :meth:`.Styler.to_latex` and :meth:`.Styler.to_html` where ``buf`` failed in combination with ``encoding`` (:issue:`47053`)
2021

2122
.. ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)