From 85b4ac670437b519413e4e340e3303c567c23fb2 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 20 Nov 2017 19:14:32 -0800 Subject: [PATCH 1/4] CLN: ASV Algorithms Try setup_cache Add base ASV class and test Hashing rework algorithms benchmarks improve algorithms benchmark Benchmarks working! cleanup --- asv_bench/benchmarks/algorithms.py | 191 +++++++++++++++++------------ 1 file changed, 114 insertions(+), 77 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 40cfec1bcd4c7..0dbcec02367e9 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -1,9 +1,9 @@ from importlib import import_module import numpy as np - import pandas as pd from pandas.util import testing as tm +from pandas.core.algorithms import checked_add_with_arr for imp in ['pandas.util', 'pandas.tools.hashing']: try: @@ -12,113 +12,150 @@ except: pass -class Algorithms(object): + +class Factorize(object): + + goal_time = 0.2 + + def setup(self): + N = 10**5 + np.random.seed(1234) + self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) + self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) + self.string_idx = tm.makeStringIndex(N) + + def time_factorize_int(self): + self.int_idx.factorize() + + def time_factorize_float(self): + self.float_idx.factorize() + + def time_factorize_string(self): + self.string_idx.factorize() + + +class Duplicated(object): + goal_time = 0.2 def setup(self): - N = 100000 + N = 10**5 np.random.seed(1234) + self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) + self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) + + def time_duplicated_int(self): + self.int_idx.duplicated() + + def time_duplicated_float(self): + self.float_idx.duplicated() - self.int_unique = pd.Int64Index(np.arange(N * 5)) + +class DuplicatedUniqueIndex(object): + + goal_time = 0.2 + + def setup(self): + N = 10**5 + self.idx_int_dup = pd.Int64Index(np.arange(N * 5)) # cache is_unique - self.int_unique.is_unique + self.idx_int_dup.is_unique - self.int = pd.Int64Index(np.arange(N).repeat(5)) - self.float = pd.Float64Index(np.random.randn(N).repeat(5)) + def time_duplicated_unique_int(self): + self.idx_int_dup.duplicated() - # Convenience naming. 
- self.checked_add = pd.core.algorithms.checked_add_with_arr - self.arr = np.arange(1000000) - self.arrpos = np.arange(1000000) - self.arrneg = np.arange(-1000000, 0) - self.arrmixed = np.array([1, -1]).repeat(500000) - self.strings = tm.makeStringIndex(100000) +class Match(object): - self.arr_nan = np.random.choice([True, False], size=1000000) - self.arrmixed_nan = np.random.choice([True, False], size=1000000) + goal_time = 0.2 - # match + def setup(self): + np.random.seed(1234) self.uniques = tm.makeStringIndex(1000).values self.all = self.uniques.repeat(10) - def time_factorize_string(self): - self.strings.factorize() - - def time_factorize_int(self): - self.int.factorize() + def time_match_string(self): + pd.match(self.all, self.uniques) - def time_factorize_float(self): - self.int.factorize() - def time_duplicated_int_unique(self): - self.int_unique.duplicated() +class AddOverflowScalar(object): - def time_duplicated_int(self): - self.int.duplicated() + goal_time = 0.2 - def time_duplicated_float(self): - self.float.duplicated() + params = [1, -1, 0] - def time_match_strings(self): - pd.match(self.all, self.uniques) + def setup(self, scalar): + N = 10**6 + self.arr = np.arange(N) - def time_add_overflow_pos_scalar(self): - self.checked_add(self.arr, 1) + def time_add_overflow_scalar(self, scalar): + checked_add_with_arr(self.arr, scalar) - def time_add_overflow_neg_scalar(self): - self.checked_add(self.arr, -1) - def time_add_overflow_zero_scalar(self): - self.checked_add(self.arr, 0) +class AddOverflowArray(object): - def time_add_overflow_pos_arr(self): - self.checked_add(self.arr, self.arrpos) + goal_time = 0.2 - def time_add_overflow_neg_arr(self): - self.checked_add(self.arr, self.arrneg) + def setup(self): + np.random.seed(1234) + N = 10**6 + self.arr = np.arange(N) + self.arr_rev = np.arange(-N, 0) + self.arr_mixed = np.array([1, -1]).repeat(N / 2) + self.arr_nan_1 = np.random.choice([True, False], size=N) + self.arr_nan_2 = np.random.choice([True, False], size=N) - def time_add_overflow_mixed_arr(self): - self.checked_add(self.arr, self.arrmixed) + def time_add_overflow_arr_rev(self): + checked_add_with_arr(self.arr, self.arr_rev) - def time_add_overflow_first_arg_nan(self): - self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan) + def time_add_overflow_arr_mask_nan(self): + checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1) - def time_add_overflow_second_arg_nan(self): - self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_nan) + def time_add_overflow_b_mask_nan(self): + checked_add_with_arr(self.arr, self.arr_mixed, + b_mask=self.arr_nan_1) def time_add_overflow_both_arg_nan(self): - self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan, - b_mask=self.arrmixed_nan) + checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, + b_mask=self.arr_nan_2) class Hashing(object): + goal_time = 0.2 - def setup(self): - N = 100000 - - self.df = pd.DataFrame( - {'A': pd.Series(tm.makeStringIndex(100).take( - np.random.randint(0, 100, size=N))), - 'B': pd.Series(tm.makeStringIndex(10000).take( - np.random.randint(0, 10000, size=N))), - 'D': np.random.randn(N), - 'E': np.arange(N), - 'F': pd.date_range('20110101', freq='s', periods=N), - 'G': pd.timedelta_range('1 day', freq='s', periods=N), - }) - self.df['C'] = self.df['B'].astype('category') - self.df.iloc[10:20] = np.nan - - def time_frame(self): - hashing.hash_pandas_object(self.df) - - def time_series_int(self): - hashing.hash_pandas_object(self.df.E) - - def 
time_series_string(self): - hashing.hash_pandas_object(self.df.B) - - def time_series_categorical(self): - hashing.hash_pandas_object(self.df.C) + def setup_cache(self): + np.random.seed(1234) + N = 10**5 + + df = pd.DataFrame( + {'strings': pd.Series(tm.makeStringIndex(10000).take( + np.random.randint(0, 10000, size=N))), + 'floats': np.random.randn(N), + 'ints': np.arange(N), + 'dates': pd.date_range('20110101', freq='s', periods=N), + 'timedeltas': pd.timedelta_range('1 day', freq='s', periods=N)}) + df['categories'] = df['strings'].astype('category') + df.iloc[10:20] = np.nan + return df + + def time_frame(self, df): + hashing.hash_pandas_object(df) + + def time_series_int(self, df): + hashing.hash_pandas_object(df['ints']) + + def time_series_string(self, df): + hashing.hash_pandas_object(df['strings']) + + def time_series_float(self, df): + hashing.hash_pandas_object(df['floats']) + + def time_series_categorical(self, df): + hashing.hash_pandas_object(df['categories']) + + def time_series_timedeltas(self, df): + hashing.hash_pandas_object(df['timedeltas']) + + def time_series_dates(self, df): + hashing.hash_pandas_object(df['dates']) From 0d5704e595ace5b2807212d188b0c35ba5fc6598 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 23 Nov 2017 17:06:27 -0800 Subject: [PATCH 2/4] Add param names --- asv_bench/benchmarks/algorithms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 0dbcec02367e9..0689751392489 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -83,6 +83,7 @@ class AddOverflowScalar(object): goal_time = 0.2 params = [1, -1, 0] + param_names = ['scalar'] def setup(self, scalar): N = 10**6 From d9237c9bcc318f1f81bbaf7c4233461616358ab7 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Fri, 24 Nov 2017 17:06:22 -0800 Subject: [PATCH 3/4] Add params for Duplicated and Factorize --- asv_bench/benchmarks/algorithms.py | 34 +++++++++++++++++++----------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 0689751392489..9c62808825f32 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -17,38 +17,48 @@ class Factorize(object): goal_time = 0.2 - def setup(self): + params = [True, False] + param_names = ['sort'] + + def setup(self, sort): N = 10**5 np.random.seed(1234) self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) self.string_idx = tm.makeStringIndex(N) - def time_factorize_int(self): - self.int_idx.factorize() + def time_factorize_int(self, sort): + self.int_idx.factorize(sort=sort) - def time_factorize_float(self): - self.float_idx.factorize() + def time_factorize_float(self, sort): + self.float_idx.factorize(sort=sort) - def time_factorize_string(self): - self.string_idx.factorize() + def time_factorize_string(self, sort): + self.string_idx.factorize(sort=sort) class Duplicated(object): goal_time = 0.2 - def setup(self): + params = ['first', 'last', False] + param_names = ['keep'] + + def setup(self, keep): N = 10**5 np.random.seed(1234) self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) + self.string_idx = tm.makeStringIndex(N) + + def time_duplicated_int(self, keep): + self.int_idx.duplicated(keep=keep) - def time_duplicated_int(self): - self.int_idx.duplicated() + def 
time_duplicated_float(self, keep): + self.float_idx.duplicated(keep=keep) - def time_duplicated_float(self): - self.float_idx.duplicated() + def time_duplicated_string(self, keep): + self.string_idx.duplicated(keep=keep) class DuplicatedUniqueIndex(object): From 35c7af42dc0f7e9ad4204b9306544dde6c6a316d Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sat, 25 Nov 2017 10:56:19 -0800 Subject: [PATCH 4/4] Move AddOverflow to binary_ops --- asv_bench/benchmarks/algorithms.py | 44 ------------------------------ asv_bench/benchmarks/binary_ops.py | 44 ++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 9c62808825f32..7ffb180b49e09 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -3,7 +3,6 @@ import numpy as np import pandas as pd from pandas.util import testing as tm -from pandas.core.algorithms import checked_add_with_arr for imp in ['pandas.util', 'pandas.tools.hashing']: try: @@ -88,49 +87,6 @@ def time_match_string(self): pd.match(self.all, self.uniques) -class AddOverflowScalar(object): - - goal_time = 0.2 - - params = [1, -1, 0] - param_names = ['scalar'] - - def setup(self, scalar): - N = 10**6 - self.arr = np.arange(N) - - def time_add_overflow_scalar(self, scalar): - checked_add_with_arr(self.arr, scalar) - - -class AddOverflowArray(object): - - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - N = 10**6 - self.arr = np.arange(N) - self.arr_rev = np.arange(-N, 0) - self.arr_mixed = np.array([1, -1]).repeat(N / 2) - self.arr_nan_1 = np.random.choice([True, False], size=N) - self.arr_nan_2 = np.random.choice([True, False], size=N) - - def time_add_overflow_arr_rev(self): - checked_add_with_arr(self.arr, self.arr_rev) - - def time_add_overflow_arr_mask_nan(self): - checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1) - - def time_add_overflow_b_mask_nan(self): - checked_add_with_arr(self.arr, self.arr_mixed, - b_mask=self.arr_nan_1) - - def time_add_overflow_both_arg_nan(self): - checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, - b_mask=self.arr_nan_2) - - class Hashing(object): goal_time = 0.2 diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index 429965c06cb48..14169ced4b71f 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -1,5 +1,6 @@ import numpy as np from pandas import DataFrame, Series, date_range +from pandas.core.algorithms import checked_add_with_arr try: import pandas.core.computation.expressions as expr except ImportError: @@ -108,3 +109,46 @@ def time_timestamp_ops_diff(self, tz): def time_timestamp_ops_diff_with_shift(self, tz): self.s - self.s.shift() + + +class AddOverflowScalar(object): + + goal_time = 0.2 + + params = [1, -1, 0] + param_names = ['scalar'] + + def setup(self, scalar): + N = 10**6 + self.arr = np.arange(N) + + def time_add_overflow_scalar(self, scalar): + checked_add_with_arr(self.arr, scalar) + + +class AddOverflowArray(object): + + goal_time = 0.2 + + def setup(self): + np.random.seed(1234) + N = 10**6 + self.arr = np.arange(N) + self.arr_rev = np.arange(-N, 0) + self.arr_mixed = np.array([1, -1]).repeat(N / 2) + self.arr_nan_1 = np.random.choice([True, False], size=N) + self.arr_nan_2 = np.random.choice([True, False], size=N) + + def time_add_overflow_arr_rev(self): + checked_add_with_arr(self.arr, self.arr_rev) + + def time_add_overflow_arr_mask_nan(self): + 
checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1) + + def time_add_overflow_b_mask_nan(self): + checked_add_with_arr(self.arr, self.arr_mixed, + b_mask=self.arr_nan_1) + + def time_add_overflow_both_arg_nan(self): + checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, + b_mask=self.arr_nan_2)
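
Note (not part of the patch series): the reworked Hashing class in patch 1 relies on asv's setup_cache convention. setup_cache runs once per benchmark class rather than before every timed call, and whatever it returns is handed to setup and to each time_* method as their first argument, which is why the methods are written as time_frame(self, df). Below is a minimal standalone sketch of that calling convention; it uses hash_pandas_object from pandas.util directly and a much smaller frame than the benchmark's 10**5 rows, and the hand-written driver at the bottom only imitates what asv does internally.

import numpy as np
import pandas as pd
from pandas.util import hash_pandas_object


class Hashing(object):

    def setup_cache(self):
        # Built once; asv reuses the returned frame for every timed repeat.
        np.random.seed(1234)
        N = 10**3
        return pd.DataFrame({'floats': np.random.randn(N),
                             'ints': np.arange(N)})

    def time_frame(self, df):
        hash_pandas_object(df)


bench = Hashing()
cached = bench.setup_cache()  # asv calls this once, outside the timed loop
bench.time_frame(cached)      # each timed call receives the cached frame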
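
Note (not part of the patch series): patches 2 and 3 switch AddOverflowScalar, Factorize and Duplicated to asv's parametrized form. With params and param_names set on the class, asv runs each time_* method once per listed value and passes that value as an extra argument to setup and to the benchmark method, so time_duplicated_int(self, keep) is timed separately for 'first', 'last' and False. The sketch below only mimics that calling convention with a plain-Python driver (asv itself does the driving in the benchmarks above); pd.Index stands in for Int64Index purely so the snippet also runs on newer pandas.

import timeit

import numpy as np
import pandas as pd


class Duplicated(object):
    # asv benchmarks the class once per entry in params, passing the value
    # through to setup and to every time_* method as ``keep``.
    params = ['first', 'last', False]
    param_names = ['keep']

    def setup(self, keep):
        N = 10**5
        self.int_idx = pd.Index(np.arange(N).repeat(5))

    def time_duplicated_int(self, keep):
        self.int_idx.duplicated(keep=keep)


for keep in Duplicated.params:
    bench = Duplicated()
    bench.setup(keep)
    elapsed = timeit.timeit(lambda: bench.time_duplicated_int(keep), number=10)
    print('keep=%r: %.4f s for 10 calls' % (keep, elapsed))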