Skip to content

Commit 0a5cb8f

Browse files
DOC: update query/eval figures on performance comparison (#48368)
Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent 3020b8d commit 0a5cb8f

File tree

7 files changed

+132
-24
lines changed

7 files changed

+132
-24
lines changed

doc/scripts/eval_performance.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
from timeit import repeat as timeit
2+
3+
import numpy as np
4+
import seaborn as sns
5+
6+
from pandas import DataFrame
7+
8+
# Template for the ``setup`` code handed to ``timeit``: it builds a random
# 3-column frame ``df`` with columns ``a``, ``b`` and ``c``.
# ``%d`` is filled with the number of rows and ``%s`` with one extra setup
# statement (the assignment of the expression string ``s``).
setup_common = """from pandas import DataFrame
from numpy.random import randn
df = DataFrame(randn(%d, 3), columns=list('abc'))
%s"""
12+
13+
# Arithmetic expression evaluated by ``DataFrame.eval`` in ``bench_with``.
setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'"


def bench_with(n, times=10, repeat=3, engine="numexpr"):
    """
    Time ``df.eval(s)`` on a frame with ``n`` rows.

    Parameters
    ----------
    n : int
        Number of rows of the benchmarked frame.
    times : int, default 10
        Number of executions per timing run (``number`` for ``timeit.repeat``).
    repeat : int, default 3
        Number of timing runs.
    engine : str, default "numexpr"
        Value passed as ``engine`` to ``DataFrame.eval``.

    Returns
    -------
    numpy.ndarray
        One entry per timing run: the run's total time divided by ``times``.
    """
    statement = "df.eval(s, engine=%r)" % engine
    setup_code = setup_common % (n, setup_with)
    run_totals = timeit(statement, setup=setup_code, repeat=repeat, number=times)
    return np.array(run_totals) / times
28+
29+
30+
# Boolean expression evaluated by ``DataFrame.query`` in ``bench_subset``.
setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'"


def bench_subset(n, times=20, repeat=3, engine="numexpr"):
    """
    Time ``df.query(s)`` on a frame with ``n`` rows.

    Parameters
    ----------
    n : int
        Number of rows of the benchmarked frame.
    times : int, default 20
        Number of executions per timing run (``number`` for ``timeit.repeat``).
    repeat : int, default 3
        Number of timing runs.
    engine : str, default "numexpr"
        Value passed as ``engine`` to ``DataFrame.query``.

    Returns
    -------
    numpy.ndarray
        One entry per timing run: the run's total time divided by ``times``.
    """
    statement = "df.query(s, engine=%r)" % engine
    setup_code = setup_common % (n, setup_subset)
    run_totals = timeit(statement, setup=setup_code, repeat=repeat, number=times)
    return np.array(run_totals) / times
45+
46+
47+
def bench(mn=3, mx=7, num=100, engines=("python", "numexpr"), verbose=False):
    """
    Benchmark ``DataFrame.eval`` and ``DataFrame.query`` over a range of sizes.

    Parameters
    ----------
    mn, mx : int, default 3 and 7
        Base-10 exponents of the smallest and largest number of rows.
    num : int, default 100
        Number of logarithmically spaced frame sizes between ``10**mn`` and
        ``10**mx``.
    engines : sequence of str, default ("python", "numexpr")
        Engines to benchmark; each becomes a column of the results.
    verbose : bool, default False
        If truthy, print progress on every tenth size.

    Returns
    -------
    ev, qu : DataFrame
        Timings for ``eval`` and ``query`` respectively: one row per frame
        size, one column per engine, plus a ``"size"`` column.
    """
    r = np.logspace(mn, mx, num=num).round().astype(int)

    # np.empty leaves the cells uninitialised; every (row, engine) cell is
    # overwritten in the loop below.
    ev = DataFrame(np.empty((num, len(engines))), columns=engines)
    qu = ev.copy(deep=True)

    ev["size"] = qu["size"] = r

    for engine in engines:
        for i, n in enumerate(r):
            # Logical ``and`` rather than bitwise ``&``: with ``&`` a truthy
            # but even ``verbose`` (e.g. 2) would never print.
            if verbose and i % 10 == 0:
                print("engine: %r, i == %d" % (engine, i))
            # A single execution per size keeps the full sweep affordable.
            ev_times = bench_with(n, times=1, repeat=1, engine=engine)
            ev.loc[i, engine] = np.mean(ev_times)
            qu_times = bench_subset(n, times=1, repeat=1, engine=engine)
            qu.loc[i, engine] = np.mean(qu_times)

    return ev, qu
65+
66+
67+
def plot_perf(df, engines, title, filename=None):
    """
    Draw a log-log plot of running time against frame size, one line per engine.

    Parameters
    ----------
    df : DataFrame
        Timings with a ``"size"`` column and one column per engine.
    engines : iterable of str
        Column names of ``df`` to plot.
    title : str
        Title of the plot.
    filename : str, optional
        If given, the figure is saved to this path.
    """
    from matplotlib.pyplot import figure

    # Apply seaborn's default theme, then switch to the "Set2" palette.
    sns.set()
    sns.set_palette("Set2")

    canvas = figure(figsize=(4, 3), dpi=120)
    axes = canvas.add_subplot(111)

    for engine_name in engines:
        axes.loglog(df["size"], df[engine_name], label=engine_name, lw=2)

    axes.set_xlabel("Number of Rows")
    axes.set_ylabel("Time (s)")
    axes.set_title(title)
    axes.legend(loc="best")
    axes.tick_params(top=False, right=False)

    canvas.tight_layout()

    if filename is None:
        return
    canvas.savefig(filename)
89+
90+
91+
if __name__ == "__main__":
    import os

    # Go two directories up from this file's directory, then down into
    # doc/source/_static, where the generated figures are written.
    pandas_dir = os.path.dirname(
        os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
    )
    static_path = os.path.join(pandas_dir, "doc", "source", "_static")

    def join(p):
        """Return the path of ``p`` inside ``static_path``."""
        return os.path.join(static_path, p)

    engines = "python", "numexpr"

    # Benchmark exactly the engines that are plotted below, so the two
    # cannot drift apart.
    ev, qu = bench(engines=engines, verbose=True)

    plot_perf(ev, engines, "DataFrame.eval()", filename=join("eval-perf.png"))
    plot_perf(qu, engines, "DataFrame.query()", filename=join("query-perf.png"))
-24.7 KB
Binary file not shown.

doc/source/_static/eval-perf.png

10.8 KB
Loading
-21.2 KB
Binary file not shown.

doc/source/_static/query-perf.png

8.79 KB
Loading

doc/source/user_guide/enhancingperf.rst

Lines changed: 5 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -690,21 +690,12 @@ The equivalent in standard Python would be
690690
df["a"] = 1
691691
df
692692
693-
The :class:`DataFrame.query` method has a ``inplace`` keyword which determines
694-
whether the query modifies the original frame.
695-
696-
.. ipython:: python
697-
698-
df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
699-
df.query("a > 2")
700-
df.query("a > 2", inplace=True)
701-
df
702-
703693
Local variables
704694
~~~~~~~~~~~~~~~
705695

706696
You must *explicitly reference* any local variable that you want to use in an
707-
expression by placing the ``@`` character in front of the name. For example,
697+
expression by placing the ``@`` character in front of the name. This mechanism is
698+
the same for both :meth:`DataFrame.query` and :meth:`DataFrame.eval`. For example,
708699

709700
.. ipython:: python
710701
@@ -820,17 +811,12 @@ significant performance benefit. Here is a plot showing the running time of
820811
:func:`pandas.eval` as a function of the size of the frame involved in the
821812
computation. The two lines are two different engines.
822813

814+
..
815+
The eval-perf.png figure below was generated with /doc/scripts/eval_performance.py
823816
824817
.. image:: ../_static/eval-perf.png
825818

826-
827-
.. note::
828-
829-
Operations with smallish objects (around 15k-20k rows) are faster using
830-
plain Python:
831-
832-
.. image:: ../_static/eval-perf-small.png
833-
819+
You will only see the performance benefits of using the ``numexpr`` engine with :func:`pandas.eval` if your frame has more than approximately 100,000 rows.
834820

835821
This plot was created using a :class:`DataFrame` with 3 columns each containing
836822
floating point values generated using ``numpy.random.randn()``.

doc/source/user_guide/indexing.rst

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1240,6 +1240,17 @@ If instead you don't want to or cannot name your index, you can use the name
12401240
renaming your columns to something less ambiguous.
12411241

12421242

1243+
The :meth:`DataFrame.query` method has an ``inplace`` keyword which determines
1244+
whether the query modifies the original frame.
1245+
1246+
.. ipython:: python
1247+
1248+
df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
1249+
df.query("a > 2")
1250+
df.query("a > 2", inplace=True)
1251+
df
1252+
1253+
12431254
:class:`~pandas.MultiIndex` :meth:`~pandas.DataFrame.query` Syntax
12441255
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
12451256

@@ -1438,15 +1449,18 @@ Performance of :meth:`~pandas.DataFrame.query`
14381449
``DataFrame.query()`` using ``numexpr`` is slightly faster than Python for
14391450
large frames.
14401451

1452+
..
1453+
The query-perf.png figure below was generated with /doc/scripts/eval_performance.py
1454+
14411455
.. image:: ../_static/query-perf.png
14421456

1443-
.. note::
14441457

1445-
You will only see the performance benefits of using the ``numexpr`` engine
1446-
with ``DataFrame.query()`` if your frame has more than approximately 200,000
1447-
rows.
14481458

1449-
.. image:: ../_static/query-perf-small.png
1459+
You will only see the performance benefits of using the ``numexpr`` engine
1460+
with ``DataFrame.query()`` if your frame has more than approximately 100,000
1461+
rows.
1462+
1463+
14501464

14511465
This plot was created using a ``DataFrame`` with 3 columns each containing
14521466
floating point values generated using ``numpy.random.randn()``.

0 commit comments

Comments
 (0)