
DOC: Improve reshape\concat #47061


Merged · 20 commits · May 27, 2022
Changes from 4 commits
82 changes: 26 additions & 56 deletions asv_bench/benchmarks/gil.py
@@ -1,6 +1,3 @@
from functools import wraps
import threading

import numpy as np

from pandas import (
@@ -33,57 +30,21 @@
from pandas._libs import algos
except ImportError:
from pandas import algos
try:
from pandas._testing import test_parallel # noqa: PDF014

have_real_test_parallel = True
except ImportError:
have_real_test_parallel = False

from .pandas_vb_common import BaseIO # isort:skip


def test_parallel(num_threads=2, kwargs_list=None):
"""
Decorator to run the same function multiple times in parallel.

Parameters
----------
num_threads : int, optional
The number of times the function is run in parallel.
kwargs_list : list of dicts, optional
The list of kwargs to update original
function kwargs on different threads.

Notes
-----
This decorator does not pass the return value of the decorated function.

Original from scikit-image:

https://github.com/scikit-image/scikit-image/pull/1519

"""
assert num_threads > 0
has_kwargs_list = kwargs_list is not None
if has_kwargs_list:
assert len(kwargs_list) == num_threads
def test_parallel(num_threads=1):
def wrapper(fname):
return fname

def wrapper(func):
@wraps(func)
def inner(*args, **kwargs):
if has_kwargs_list:
update_kwargs = lambda i: dict(kwargs, **kwargs_list[i])
else:
update_kwargs = lambda i: kwargs
threads = []
for i in range(num_threads):
updated_kwargs = update_kwargs(i)
thread = threading.Thread(target=func, args=args, kwargs=updated_kwargs)
threads.append(thread)
for thread in threads:
thread.start()
for thread in threads:
thread.join()
return wrapper

return inner

return wrapper
from .pandas_vb_common import BaseIO # isort:skip


class ParallelGroupbyMethods:
@@ -92,7 +53,8 @@ class ParallelGroupbyMethods:
param_names = ["threads", "method"]

def setup(self, threads, method):

if not have_real_test_parallel:
raise NotImplementedError
N = 10**6
ngroups = 10**3
df = DataFrame(
@@ -124,7 +86,8 @@ class ParallelGroups:
param_names = ["threads"]

def setup(self, threads):

if not have_real_test_parallel:
raise NotImplementedError
size = 2**22
ngroups = 10**3
data = Series(np.random.randint(0, ngroups, size=size))
@@ -145,7 +108,8 @@ class ParallelTake1D:
param_names = ["dtype"]

def setup(self, dtype):

if not have_real_test_parallel:
raise NotImplementedError
N = 10**6
df = DataFrame({"col": np.arange(N, dtype=dtype)})
indexer = np.arange(100, len(df) - 100)
@@ -167,7 +131,8 @@ class ParallelKth:
repeat = 5

def setup(self):

if not have_real_test_parallel:
raise NotImplementedError
N = 10**7
k = 5 * 10**5
kwargs_list = [{"arr": np.random.randn(N)}, {"arr": np.random.randn(N)}]
@@ -184,7 +149,8 @@ def time_kth_smallest(self):

class ParallelDatetimeFields:
def setup(self):

if not have_real_test_parallel:
raise NotImplementedError
N = 10**6
self.dti = date_range("1900-01-01", periods=N, freq="T")
self.period = self.dti.to_period("D")
@@ -238,7 +204,8 @@ class ParallelRolling:
param_names = ["method"]

def setup(self, method):

if not have_real_test_parallel:
raise NotImplementedError
win = 100
arr = np.random.rand(100000)
if hasattr(DataFrame, "rolling"):
@@ -281,7 +248,8 @@ class ParallelReadCSV(BaseIO):
param_names = ["dtype"]

def setup(self, dtype):

if not have_real_test_parallel:
raise NotImplementedError
rows = 10000
cols = 50
data = {
@@ -316,6 +284,8 @@ class ParallelFactorize:
param_names = ["threads"]

def setup(self, threads):
if not have_real_test_parallel:
raise NotImplementedError

strings = tm.makeStringIndex(100000)

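For context, the ``test_parallel`` helper moved around in this file is a decorator that runs the wrapped function in several threads at once and discards its return value; per the hunks above, each benchmark's ``setup`` raises ``NotImplementedError`` when ``have_real_test_parallel`` is false. A minimal runnable sketch of that pattern follows; the name ``run_parallel`` and the NumPy workload are illustrative, not taken from the PR:

    from functools import wraps
    import threading

    import numpy as np


    def run_parallel(num_threads=2, kwargs_list=None):
        # Run the decorated function num_threads times in parallel threads.
        # kwargs_list, if given, supplies per-thread keyword overrides.
        has_kwargs_list = kwargs_list is not None
        if has_kwargs_list:
            assert len(kwargs_list) == num_threads

        def wrapper(func):
            @wraps(func)
            def inner(*args, **kwargs):
                if has_kwargs_list:
                    update_kwargs = lambda i: dict(kwargs, **kwargs_list[i])
                else:
                    update_kwargs = lambda i: kwargs
                threads = [
                    threading.Thread(target=func, args=args, kwargs=update_kwargs(i))
                    for i in range(num_threads)
                ]
                for t in threads:
                    t.start()
                for t in threads:
                    t.join()

            return inner

        return wrapper


    @run_parallel(num_threads=2)
    def kth_smallest(arr):
        # NumPy releases the GIL in compiled code, so both threads can make progress.
        np.partition(arr, 5)


    kth_smallest(np.random.randn(10**6))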
@@ -154,11 +154,11 @@ The apply and combine steps are typically done together in pandas.

In the previous example, we explicitly selected the 2 columns first. If
not, the ``mean`` method is applied to each column containing numerical
columns by passing ``numeric_only=True``:
columns:

.. ipython:: python

titanic.groupby("Sex").mean(numeric_only=True)
titanic.groupby("Sex").mean()

It does not make much sense to get the average value of the ``Pclass``.
If we are only interested in the average age for each gender, the
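The getting-started hunk above toggles whether ``numeric_only=True`` is spelled out when averaging per group. A small sketch of the two unambiguous spellings, using an invented frame rather than the Titanic dataset from the docs:

    import pandas as pd

    df = pd.DataFrame(
        {
            "Sex": ["female", "male", "female", "male"],
            "Age": [29.0, 35.0, 22.0, 41.0],
            "Fare": [71.3, 8.05, 7.25, 13.0],
            "Name": ["a", "b", "c", "d"],  # non-numeric "nuisance" column
        }
    )

    # Restrict the aggregation to numeric columns explicitly ...
    print(df.groupby("Sex").mean(numeric_only=True))

    # ... or select the columns of interest before aggregating.
    print(df.groupby("Sex")[["Age", "Fare"]].mean())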
2 changes: 1 addition & 1 deletion doc/source/user_guide/10min.rst
@@ -532,7 +532,7 @@ groups:

.. ipython:: python

df.groupby("A")[["C", "D"]].sum()
df.groupby("A").sum()

Grouping by multiple columns forms a hierarchical index, and again we can
apply the :meth:`~pandas.core.groupby.GroupBy.sum` function:
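As a quick illustration of the surrounding 10min text, grouping by several columns produces a hierarchical (MultiIndex) result. A toy sketch with invented data:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar"],
            "B": ["one", "one", "two", "two"],
            "C": np.arange(4.0),
            "D": np.arange(4.0) * 10,
        }
    )

    result = df.groupby(["A", "B"]).sum()
    print(result.index)  # MultiIndex with levels "A" and "B"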
26 changes: 10 additions & 16 deletions doc/source/user_guide/groupby.rst
@@ -477,7 +477,7 @@ An obvious one is aggregation via the
.. ipython:: python

grouped = df.groupby("A")
grouped[["C", "D"]].aggregate(np.sum)
grouped.aggregate(np.sum)

grouped = df.groupby(["A", "B"])
grouped.aggregate(np.sum)
@@ -492,7 +492,7 @@ changed by using the ``as_index`` option:
grouped = df.groupby(["A", "B"], as_index=False)
grouped.aggregate(np.sum)

df.groupby("A", as_index=False)[["C", "D"]].sum()
df.groupby("A", as_index=False).sum()

Note that you could use the ``reset_index`` DataFrame function to achieve the
same result as the column names are stored in the resulting ``MultiIndex``:
@@ -730,7 +730,7 @@ optimized Cython implementations:

.. ipython:: python

df.groupby("A")[["C", "D"]].sum()
df.groupby("A").sum()
df.groupby(["A", "B"]).mean()

Of course ``sum`` and ``mean`` are implemented on pandas objects, so the above
@@ -1159,12 +1159,13 @@ Again consider the example DataFrame we've been looking at:

Suppose we wish to compute the standard deviation grouped by the ``A``
column. There is a slight problem, namely that we don't care about the data in
column ``B``. We refer to this as a "nuisance" column. You can avoid nuisance
columns by specifying ``numeric_only=True``:
column ``B``. We refer to this as a "nuisance" column. If the passed
aggregation function can't be applied to some columns, the troublesome columns
will be (silently) dropped. Thus, this does not pose any problems:

.. ipython:: python

df.groupby("A").std(numeric_only=True)
df.groupby("A").std()

Note that ``df.groupby('A').colname.std().`` is more efficient than
``df.groupby('A').std().colname``, so if the result of an aggregation function
@@ -1179,14 +1180,7 @@ is only interesting over one column (here ``colname``), it may be filtered
If you do wish to include decimal or object columns in an aggregation with
other non-nuisance data types, you must do so explicitly.

.. warning::
The automatic dropping of nuisance columns has been deprecated and will be removed
in a future version of pandas. If columns are included that cannot be operated
on, pandas will instead raise an error. In order to avoid this, either select
the columns you wish to operate on or specify ``numeric_only=True``.

.. ipython:: python
:okwarning:

from decimal import Decimal

@@ -1310,7 +1304,7 @@ Groupby a specific column with the desired frequency. This is like resampling.

.. ipython:: python

df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"])[["Quantity"]].sum()
df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum()

You have an ambiguous specification in that you have a named index and a column
that could be potential groupers.
@@ -1319,9 +1313,9 @@ that could be potential groupers.

df = df.set_index("Date")
df["Date"] = df.index + pd.offsets.MonthEnd(2)
df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"])[["Quantity"]].sum()
df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"]).sum()

df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"])[["Quantity"]].sum()
df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"]).sum()


Taking the first rows of each group
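The last groupby.rst hunks switch between summing every aggregable column and explicitly selecting ``Quantity`` when grouping with ``pd.Grouper``. A runnable sketch of the pattern; column names mirror the docs, the data is invented:

    import pandas as pd

    df = pd.DataFrame(
        {
            "Date": pd.date_range("2022-01-01", periods=6, freq="15D"),
            "Buyer": ["Carl", "Mark", "Carl", "Carl", "Mark", "Carl"],
            "Quantity": [18, 3, 5, 1, 9, 3],
        }
    )

    # Grouper buckets the datetime column by frequency while "Buyer" is grouped
    # as an ordinary key -- effectively a per-buyer resample.
    monthly = df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"])[["Quantity"]].sum()
    print(monthly)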
2 changes: 1 addition & 1 deletion doc/source/user_guide/indexing.rst
@@ -583,7 +583,7 @@ without using a temporary variable.
.. ipython:: python

bb = pd.read_csv('data/baseball.csv', index_col='id')
(bb.groupby(['year', 'team']).sum(numeric_only=True)
(bb.groupby(['year', 'team']).sum()
.loc[lambda df: df['r'] > 100])


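The indexing hunk above changes only whether ``numeric_only=True`` appears in the chain; the point of the docs example is filtering an aggregated result with a callable passed to ``.loc`` without a temporary variable. A self-contained sketch in which the baseball CSV is replaced by an invented frame:

    import pandas as pd

    bb = pd.DataFrame(
        {
            "year": [2006, 2006, 2007, 2007],
            "team": ["CHN", "CHN", "BOS", "BOS"],
            "r": [60, 70, 55, 30],
        }
    )

    out = (
        bb.groupby(["year", "team"]).sum(numeric_only=True)
        .loc[lambda df: df["r"] > 100]
    )
    print(out)  # keeps only (2006, CHN), whose summed "r" exceeds 100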
15 changes: 5 additions & 10 deletions doc/source/user_guide/reshaping.rst
@@ -414,11 +414,12 @@ We can produce pivot tables from this data very easily:

The result object is a :class:`DataFrame` having potentially hierarchical indexes on the
rows and columns. If the ``values`` column name is not given, the pivot table
will include all of the data in an additional level of hierarchy in the columns:
will include all of the data that can be aggregated in an additional level of
hierarchy in the columns:

.. ipython:: python

pd.pivot_table(df[["A", "B", "C", "D", "E"]], index=["A", "B"], columns=["C"])
pd.pivot_table(df, index=["A", "B"], columns=["C"])

Also, you can use :class:`Grouper` for ``index`` and ``columns`` keywords. For detail of :class:`Grouper`, see :ref:`Grouping with a Grouper specification <groupby.specify>`.

@@ -431,7 +432,7 @@ calling :meth:`~DataFrame.to_string` if you wish:

.. ipython:: python

table = pd.pivot_table(df, index=["A", "B"], columns=["C"], values=["D", "E"])
table = pd.pivot_table(df, index=["A", "B"], columns=["C"])
print(table.to_string(na_rep=""))

Note that :meth:`~DataFrame.pivot_table` is also available as an instance method on DataFrame,
@@ -448,13 +449,7 @@ rows and columns:

.. ipython:: python

table = df.pivot_table(
index=["A", "B"],
columns="C",
values=["D", "E"],
margins=True,
aggfunc=np.std
)
table = df.pivot_table(index=["A", "B"], columns="C", margins=True, aggfunc=np.std)
table

Additionally, you can call :meth:`DataFrame.stack` to display a pivoted DataFrame
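The reshaping.rst hunks add or drop the explicit ``values=["D", "E"]`` selection in the ``pivot_table`` calls. A sketch with invented data showing how ``values``, ``margins`` and ``aggfunc`` interact; column names mirror the docs:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "A": ["one", "one", "two", "two"] * 3,
            "B": ["x", "y", "z"] * 4,
            "C": ["foo", "bar"] * 6,
            "D": np.random.randn(12),
            "E": np.random.randn(12),
        }
    )

    # Only the listed `values` columns are aggregated; margins=True adds an
    # "All" row/column computed with the same aggfunc (here the standard deviation).
    table = df.pivot_table(
        index=["A", "B"], columns="C", values=["D", "E"], margins=True, aggfunc=np.std
    )
    print(table)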
4 changes: 2 additions & 2 deletions doc/source/user_guide/timeseries.rst
@@ -1821,15 +1821,15 @@ to resample based on datetimelike column in the frame, it can passed to the
),
)
df
df.resample("M", on="date")[["a"]].sum()
df.resample("M", on="date").sum()

Similarly, if you instead want to resample by a datetimelike
level of ``MultiIndex``, its name or location can be passed to the
``level`` keyword.

.. ipython:: python

df.resample("M", level="d")[["a"]].sum()
df.resample("M", level="d").sum()

.. _timeseries.iterating-label:

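The timeseries.rst hunk changes whether column ``a`` is selected explicitly before summing. A runnable sketch of resampling on a datetime-like column versus a datetime-like ``MultiIndex`` level; the data is invented but mirrors the shape of the docs example:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "date": pd.date_range("2015-01-01", freq="W", periods=5),
            "a": np.arange(5),
        },
        index=pd.MultiIndex.from_arrays(
            [np.arange(5), pd.date_range("2015-01-01", freq="W", periods=5)],
            names=["v", "d"],
        ),
    )

    # Resample on a datetime-like column ...
    print(df.resample("M", on="date")[["a"]].sum())

    # ... or on a datetime-like level of the MultiIndex.
    print(df.resample("M", level="d")[["a"]].sum())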
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.18.1.rst
@@ -166,7 +166,7 @@ without using temporary variable.
.. ipython:: python

bb = pd.read_csv("data/baseball.csv", index_col="id")
(bb.groupby(["year", "team"]).sum(numeric_only=True).loc[lambda df: df.r > 100])
(bb.groupby(["year", "team"]).sum().loc[lambda df: df.r > 100])

.. _whatsnew_0181.partial_string_indexing:

4 changes: 2 additions & 2 deletions doc/source/whatsnew/v0.19.0.rst
@@ -497,8 +497,8 @@ Other enhancements
),
)
df
df.resample("M", on="date")[["a"]].sum()
df.resample("M", level="d")[["a"]].sum()
df.resample("M", on="date").sum()
df.resample("M", level="d").sum()

- The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials <https://developers.google.com/identity/protocols/application-default-credentials>`__. See the docs for more details (:issue:`13577`).
- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behavior remains to raising a ``NonExistentTimeError`` (:issue:`13057`)
1 change: 0 additions & 1 deletion doc/source/whatsnew/v1.4.3.rst
@@ -16,7 +16,6 @@ Fixed regressions
~~~~~~~~~~~~~~~~~
- Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when ``np.nan`` in the sorting column (:issue:`46589`)
- Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`)
- Fixed regression in :meth:`.Groupby.transform` and :meth:`.Groupby.agg` failing with ``engine="numba"`` when the index was a :class:`MultiIndex` (:issue:`46867`)
- Fixed regression is :meth:`.Styler.to_latex` and :meth:`.Styler.to_html` where ``buf`` failed in combination with ``encoding`` (:issue:`47053`)

.. ---------------------------------------------------------------------------