From 3d5db57c76781ae3e2aeaa43e511566d71694107 Mon Sep 17 00:00:00 2001
From: dlovell
Date: Sun, 28 Sep 2014 13:47:35 -0400
Subject: [PATCH 1/2] BENCH: programmatically create benchmarks for large
 ngroups (GH6787)

---
 doc/source/v0.15.0.txt |  1 +
 vb_suite/groupby.py    | 72 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
index 8c0e193ec6348..0d003b9f80588 100644
--- a/doc/source/v0.15.0.txt
+++ b/doc/source/v0.15.0.txt
@@ -813,6 +813,7 @@ Performance
 - Performance and memory usage improvements in multi-key ``groupby`` (:issue:`8128`)
 - Performance improvements in groupby ``.agg`` and ``.apply`` where builtins max/min were not mapped to numpy/cythonized versions (:issue:`7722`)
 - Performance improvement in writing to sql (``to_sql``) of up to 50% (:issue:`8208`).
+- Performance benchmarking of groupby for large value of ngroups (:issue:`6787`)
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
index c9746359b6ecd..dddb559a86dca 100644
--- a/vb_suite/groupby.py
+++ b/vb_suite/groupby.py
@@ -484,3 +484,75 @@ def f(g):
 
 groupby_agg_builtins1 = Benchmark("df.groupby('jim').agg([sum, min, max])", setup)
 groupby_agg_builtins2 = Benchmark("df.groupby(['jim', 'joe']).agg([sum, min, max])", setup)
+
+#----------------------------------------------------------------------
+# groupby with a large value for ngroups
+
+setup = common_setup + """
+np.random.seed(1234)
+ngroups = 10000
+size = ngroups * 10
+rng = np.arange(ngroups)
+df = DataFrame(dict(
+    timestamp=rng.take(np.random.randint(0, ngroups, size=size)),
+    value=np.random.randint(0, size, size=size)
+))
+"""
+
+no_arg_func_list = [
+    'all',
+    'any',
+    'count',
+    'cumcount',
+    'cummax',
+    'cummin',
+    'cumprod',
+    'cumsum',
+    'describe',
+    'diff',
+    'first',
+    'head',
+    'last',
+    'mad',
+    'max',
+    'mean',
+    'median',
+    'min',
+    'nunique',
+    'pct_change',
+    'prod',
+    'rank',
+    'sem',
+    'size',
+    'skew',
+    'std',
+    'sum',
+    'tail',
+    'unique',
+    'var',
+    'value_counts',
+]
+
+
+_stmt_template = "df.groupby('value')['timestamp'].%s"
+START_DATE = datetime(2011, 7, 1)
+
+
+def make_large_ngroups_bmark(func_name, func_args=''):
+    bmark_name = 'groupby_large_ngroups_%s' % func_name
+    stmt = _stmt_template % ('%s(%s)' % (func_name, func_args))
+    bmark = Benchmark(stmt, setup, start_date=START_DATE)
+    # MUST set name
+    bmark.name = bmark_name
+    return bmark
+
+
+def inject_bmark_into_globals(bmark):
+    if not bmark.name:
+        raise AssertionError('benchmark must have a name')
+    globals()[bmark.name] = bmark
+
+
+for func_name in no_arg_func_list:
+    bmark = make_large_ngroups_bmark(func_name)
+    inject_bmark_into_globals(bmark)

From c494c03165ae1e1195893d6567729327021bb8ae Mon Sep 17 00:00:00 2001
From: dlovell
Date: Tue, 30 Sep 2014 07:21:49 -0400
Subject: [PATCH 2/2] make running multiple values of ngroups easy

---
 vb_suite/groupby.py | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
index dddb559a86dca..ec1befa53d383 100644
--- a/vb_suite/groupby.py
+++ b/vb_suite/groupby.py
@@ -486,19 +486,10 @@ def f(g):
 groupby_agg_builtins2 = Benchmark("df.groupby(['jim', 'joe']).agg([sum, min, max])", setup)
 
 #----------------------------------------------------------------------
-# groupby with a large value for ngroups
+# groupby with a variable value for ngroups
 
-setup = common_setup + """
-np.random.seed(1234)
-ngroups = 10000
-size = ngroups * 10
-rng = np.arange(ngroups)
-df = DataFrame(dict(
-    timestamp=rng.take(np.random.randint(0, ngroups, size=size)),
-    value=np.random.randint(0, size, size=size)
-))
-"""
+ngroups_list = [100, 10000]
 
 no_arg_func_list = [
     'all',
     'any',
@@ -535,12 +526,23 @@ def f(g):
 
 
 _stmt_template = "df.groupby('value')['timestamp'].%s"
+_setup_template = common_setup + """
+np.random.seed(1234)
+ngroups = %s
+size = ngroups * 10
+rng = np.arange(ngroups)
+df = DataFrame(dict(
+    timestamp=rng.take(np.random.randint(0, ngroups, size=size)),
+    value=np.random.randint(0, size, size=size)
+))
+"""
 START_DATE = datetime(2011, 7, 1)
 
 
-def make_large_ngroups_bmark(func_name, func_args=''):
-    bmark_name = 'groupby_large_ngroups_%s' % func_name
+def make_large_ngroups_bmark(ngroups, func_name, func_args=''):
+    bmark_name = 'groupby_ngroups_%s_%s' % (ngroups, func_name)
     stmt = _stmt_template % ('%s(%s)' % (func_name, func_args))
+    setup = _setup_template % ngroups
     bmark = Benchmark(stmt, setup, start_date=START_DATE)
     # MUST set name
     bmark.name = bmark_name
@@ -553,6 +555,7 @@ def inject_bmark_into_globals(bmark):
     globals()[bmark.name] = bmark
 
 
-for func_name in no_arg_func_list:
-    bmark = make_large_ngroups_bmark(func_name)
-    inject_bmark_into_globals(bmark)
+for ngroups in ngroups_list:
+    for func_name in no_arg_func_list:
+        bmark = make_large_ngroups_bmark(ngroups, func_name)
+        inject_bmark_into_globals(bmark)
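
Note on the generated benchmark names: the nested loop at the end of PATCH 2/2 injects one Benchmark object per (ngroups, function) pair into the module namespace, presumably so the vbench suite, which collects Benchmark instances from vb_suite modules, can pick them up (hence the "MUST set name" comment). The sketch below is illustrative only and is not part of either patch; it simply mirrors the naming scheme from make_large_ngroups_bmark, and the truncated no_arg_func_list is an assumption for brevity (the full list of 31 methods is in the diff above).

    # Illustrative sketch, not part of the patch: prints the module-level
    # attribute names the nested loop in PATCH 2/2 generates.
    ngroups_list = [100, 10000]                 # values taken from the patch
    no_arg_func_list = ['all', 'any', 'count']  # truncated; the patch lists 31 methods

    for ngroups in ngroups_list:
        for func_name in no_arg_func_list:
            # e.g. groupby_ngroups_100_all, groupby_ngroups_10000_count, ...
            print('groupby_ngroups_%s_%s' % (ngroups, func_name))

Each printed name corresponds to one injected Benchmark, so benchmarking additional group cardinalities only requires adding entries to ngroups_list.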