From 85b4ac670437b519413e4e340e3303c567c23fb2 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 20 Nov 2017 19:14:32 -0800 Subject: [PATCH 1/4] CLN: ASV Algorithms Try setup_cache Add base ASV class and test Hashing rework algorithms benchmarks improve algorithms benchmark Benchmarks working! cleanup --- asv_bench/benchmarks/algorithms.py | 191 +++++++++++++++++------------ 1 file changed, 114 insertions(+), 77 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 40cfec1bcd4c7..0dbcec02367e9 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -1,9 +1,9 @@ from importlib import import_module import numpy as np - import pandas as pd from pandas.util import testing as tm +from pandas.core.algorithms import checked_add_with_arr for imp in ['pandas.util', 'pandas.tools.hashing']: try: @@ -12,113 +12,150 @@ except: pass -class Algorithms(object): + +class Factorize(object): + + goal_time = 0.2 + + def setup(self): + N = 10**5 + np.random.seed(1234) + self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) + self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) + self.string_idx = tm.makeStringIndex(N) + + def time_factorize_int(self): + self.int_idx.factorize() + + def time_factorize_float(self): + self.float_idx.factorize() + + def time_factorize_string(self): + self.string_idx.factorize() + + +class Duplicated(object): + goal_time = 0.2 def setup(self): - N = 100000 + N = 10**5 np.random.seed(1234) + self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) + self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) + + def time_duplicated_int(self): + self.int_idx.duplicated() + + def time_duplicated_float(self): + self.float_idx.duplicated() - self.int_unique = pd.Int64Index(np.arange(N * 5)) + +class DuplicatedUniqueIndex(object): + + goal_time = 0.2 + + def setup(self): + N = 10**5 + self.idx_int_dup = pd.Int64Index(np.arange(N * 5)) # cache is_unique - self.int_unique.is_unique + self.idx_int_dup.is_unique - self.int = pd.Int64Index(np.arange(N).repeat(5)) - self.float = pd.Float64Index(np.random.randn(N).repeat(5)) + def time_duplicated_unique_int(self): + self.idx_int_dup.duplicated() - # Convenience naming. 
- self.checked_add = pd.core.algorithms.checked_add_with_arr - self.arr = np.arange(1000000) - self.arrpos = np.arange(1000000) - self.arrneg = np.arange(-1000000, 0) - self.arrmixed = np.array([1, -1]).repeat(500000) - self.strings = tm.makeStringIndex(100000) +class Match(object): - self.arr_nan = np.random.choice([True, False], size=1000000) - self.arrmixed_nan = np.random.choice([True, False], size=1000000) + goal_time = 0.2 - # match + def setup(self): + np.random.seed(1234) self.uniques = tm.makeStringIndex(1000).values self.all = self.uniques.repeat(10) - def time_factorize_string(self): - self.strings.factorize() - - def time_factorize_int(self): - self.int.factorize() + def time_match_string(self): + pd.match(self.all, self.uniques) - def time_factorize_float(self): - self.int.factorize() - def time_duplicated_int_unique(self): - self.int_unique.duplicated() +class AddOverflowScalar(object): - def time_duplicated_int(self): - self.int.duplicated() + goal_time = 0.2 - def time_duplicated_float(self): - self.float.duplicated() + params = [1, -1, 0] - def time_match_strings(self): - pd.match(self.all, self.uniques) + def setup(self, scalar): + N = 10**6 + self.arr = np.arange(N) - def time_add_overflow_pos_scalar(self): - self.checked_add(self.arr, 1) + def time_add_overflow_scalar(self, scalar): + checked_add_with_arr(self.arr, scalar) - def time_add_overflow_neg_scalar(self): - self.checked_add(self.arr, -1) - def time_add_overflow_zero_scalar(self): - self.checked_add(self.arr, 0) +class AddOverflowArray(object): - def time_add_overflow_pos_arr(self): - self.checked_add(self.arr, self.arrpos) + goal_time = 0.2 - def time_add_overflow_neg_arr(self): - self.checked_add(self.arr, self.arrneg) + def setup(self): + np.random.seed(1234) + N = 10**6 + self.arr = np.arange(N) + self.arr_rev = np.arange(-N, 0) + self.arr_mixed = np.array([1, -1]).repeat(N / 2) + self.arr_nan_1 = np.random.choice([True, False], size=N) + self.arr_nan_2 = np.random.choice([True, False], size=N) - def time_add_overflow_mixed_arr(self): - self.checked_add(self.arr, self.arrmixed) + def time_add_overflow_arr_rev(self): + checked_add_with_arr(self.arr, self.arr_rev) - def time_add_overflow_first_arg_nan(self): - self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan) + def time_add_overflow_arr_mask_nan(self): + checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1) - def time_add_overflow_second_arg_nan(self): - self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_nan) + def time_add_overflow_b_mask_nan(self): + checked_add_with_arr(self.arr, self.arr_mixed, + b_mask=self.arr_nan_1) def time_add_overflow_both_arg_nan(self): - self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan, - b_mask=self.arrmixed_nan) + checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, + b_mask=self.arr_nan_2) class Hashing(object): + goal_time = 0.2 - def setup(self): - N = 100000 - - self.df = pd.DataFrame( - {'A': pd.Series(tm.makeStringIndex(100).take( - np.random.randint(0, 100, size=N))), - 'B': pd.Series(tm.makeStringIndex(10000).take( - np.random.randint(0, 10000, size=N))), - 'D': np.random.randn(N), - 'E': np.arange(N), - 'F': pd.date_range('20110101', freq='s', periods=N), - 'G': pd.timedelta_range('1 day', freq='s', periods=N), - }) - self.df['C'] = self.df['B'].astype('category') - self.df.iloc[10:20] = np.nan - - def time_frame(self): - hashing.hash_pandas_object(self.df) - - def time_series_int(self): - hashing.hash_pandas_object(self.df.E) - - def 
time_series_string(self): - hashing.hash_pandas_object(self.df.B) - - def time_series_categorical(self): - hashing.hash_pandas_object(self.df.C) + def setup_cache(self): + np.random.seed(1234) + N = 10**5 + + df = pd.DataFrame( + {'strings': pd.Series(tm.makeStringIndex(10000).take( + np.random.randint(0, 10000, size=N))), + 'floats': np.random.randn(N), + 'ints': np.arange(N), + 'dates': pd.date_range('20110101', freq='s', periods=N), + 'timedeltas': pd.timedelta_range('1 day', freq='s', periods=N)}) + df['categories'] = df['strings'].astype('category') + df.iloc[10:20] = np.nan + return df + + def time_frame(self, df): + hashing.hash_pandas_object(df) + + def time_series_int(self, df): + hashing.hash_pandas_object(df['ints']) + + def time_series_string(self, df): + hashing.hash_pandas_object(df['strings']) + + def time_series_float(self, df): + hashing.hash_pandas_object(df['floats']) + + def time_series_categorical(self, df): + hashing.hash_pandas_object(df['categories']) + + def time_series_timedeltas(self, df): + hashing.hash_pandas_object(df['timedeltas']) + + def time_series_dates(self, df): + hashing.hash_pandas_object(df['dates']) From 0d5704e595ace5b2807212d188b0c35ba5fc6598 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 23 Nov 2017 17:06:27 -0800 Subject: [PATCH 2/4] Add param names --- asv_bench/benchmarks/algorithms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 0dbcec02367e9..0689751392489 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -83,6 +83,7 @@ class AddOverflowScalar(object): goal_time = 0.2 params = [1, -1, 0] + param_names = ['scalar'] def setup(self, scalar): N = 10**6 From d9237c9bcc318f1f81bbaf7c4233461616358ab7 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Fri, 24 Nov 2017 17:06:22 -0800 Subject: [PATCH 3/4] Add params for Duplicated and Factorize --- asv_bench/benchmarks/algorithms.py | 34 +++++++++++++++++++----------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 0689751392489..9c62808825f32 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -17,38 +17,48 @@ class Factorize(object): goal_time = 0.2 - def setup(self): + params = [True, False] + param_names = ['sort'] + + def setup(self, sort): N = 10**5 np.random.seed(1234) self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) self.string_idx = tm.makeStringIndex(N) - def time_factorize_int(self): - self.int_idx.factorize() + def time_factorize_int(self, sort): + self.int_idx.factorize(sort=sort) - def time_factorize_float(self): - self.float_idx.factorize() + def time_factorize_float(self, sort): + self.float_idx.factorize(sort=sort) - def time_factorize_string(self): - self.string_idx.factorize() + def time_factorize_string(self, sort): + self.string_idx.factorize(sort=sort) class Duplicated(object): goal_time = 0.2 - def setup(self): + params = ['first', 'last', False] + param_names = ['keep'] + + def setup(self, keep): N = 10**5 np.random.seed(1234) self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) + self.string_idx = tm.makeStringIndex(N) + + def time_duplicated_int(self, keep): + self.int_idx.duplicated(keep=keep) - def time_duplicated_int(self): - self.int_idx.duplicated() + def 
time_duplicated_float(self, keep): + self.float_idx.duplicated(keep=keep) - def time_duplicated_float(self): - self.float_idx.duplicated() + def time_duplicated_string(self, keep): + self.string_idx.duplicated(keep=keep) class DuplicatedUniqueIndex(object): From 35c7af42dc0f7e9ad4204b9306544dde6c6a316d Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sat, 25 Nov 2017 10:56:19 -0800 Subject: [PATCH 4/4] Move AddOverflow to binary_ops --- asv_bench/benchmarks/algorithms.py | 44 ------------------------------ asv_bench/benchmarks/binary_ops.py | 44 ++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 9c62808825f32..7ffb180b49e09 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -3,7 +3,6 @@ import numpy as np import pandas as pd from pandas.util import testing as tm -from pandas.core.algorithms import checked_add_with_arr for imp in ['pandas.util', 'pandas.tools.hashing']: try: @@ -88,49 +87,6 @@ def time_match_string(self): pd.match(self.all, self.uniques) -class AddOverflowScalar(object): - - goal_time = 0.2 - - params = [1, -1, 0] - param_names = ['scalar'] - - def setup(self, scalar): - N = 10**6 - self.arr = np.arange(N) - - def time_add_overflow_scalar(self, scalar): - checked_add_with_arr(self.arr, scalar) - - -class AddOverflowArray(object): - - goal_time = 0.2 - - def setup(self): - np.random.seed(1234) - N = 10**6 - self.arr = np.arange(N) - self.arr_rev = np.arange(-N, 0) - self.arr_mixed = np.array([1, -1]).repeat(N / 2) - self.arr_nan_1 = np.random.choice([True, False], size=N) - self.arr_nan_2 = np.random.choice([True, False], size=N) - - def time_add_overflow_arr_rev(self): - checked_add_with_arr(self.arr, self.arr_rev) - - def time_add_overflow_arr_mask_nan(self): - checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1) - - def time_add_overflow_b_mask_nan(self): - checked_add_with_arr(self.arr, self.arr_mixed, - b_mask=self.arr_nan_1) - - def time_add_overflow_both_arg_nan(self): - checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, - b_mask=self.arr_nan_2) - - class Hashing(object): goal_time = 0.2 diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index 429965c06cb48..14169ced4b71f 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -1,5 +1,6 @@ import numpy as np from pandas import DataFrame, Series, date_range +from pandas.core.algorithms import checked_add_with_arr try: import pandas.core.computation.expressions as expr except ImportError: @@ -108,3 +109,46 @@ def time_timestamp_ops_diff(self, tz): def time_timestamp_ops_diff_with_shift(self, tz): self.s - self.s.shift() + + +class AddOverflowScalar(object): + + goal_time = 0.2 + + params = [1, -1, 0] + param_names = ['scalar'] + + def setup(self, scalar): + N = 10**6 + self.arr = np.arange(N) + + def time_add_overflow_scalar(self, scalar): + checked_add_with_arr(self.arr, scalar) + + +class AddOverflowArray(object): + + goal_time = 0.2 + + def setup(self): + np.random.seed(1234) + N = 10**6 + self.arr = np.arange(N) + self.arr_rev = np.arange(-N, 0) + self.arr_mixed = np.array([1, -1]).repeat(N / 2) + self.arr_nan_1 = np.random.choice([True, False], size=N) + self.arr_nan_2 = np.random.choice([True, False], size=N) + + def time_add_overflow_arr_rev(self): + checked_add_with_arr(self.arr, self.arr_rev) + + def time_add_overflow_arr_mask_nan(self): + 
checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1) + + def time_add_overflow_b_mask_nan(self): + checked_add_with_arr(self.arr, self.arr_mixed, + b_mask=self.arr_nan_1) + + def time_add_overflow_both_arg_nan(self): + checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, + b_mask=self.arr_nan_2)
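
Note (not part of the patch series): the reworked Hashing class in patch 1 relies on asv's setup_cache convention. setup_cache runs once per benchmark class rather than before every timed call, and whatever it returns is handed to setup and to each time_* method as their first argument, which is why the methods are written as time_frame(self, df). Below is a minimal standalone sketch of that calling convention; it uses hash_pandas_object from pandas.util directly and a much smaller frame than the benchmark's 10**5 rows, and the hand-written driver at the bottom only imitates what asv does internally.

import numpy as np
import pandas as pd
from pandas.util import hash_pandas_object


class Hashing(object):

    def setup_cache(self):
        # Built once; asv reuses the returned frame for every timed repeat.
        np.random.seed(1234)
        N = 10**3
        return pd.DataFrame({'floats': np.random.randn(N),
                             'ints': np.arange(N)})

    def time_frame(self, df):
        hash_pandas_object(df)


bench = Hashing()
cached = bench.setup_cache()  # asv calls this once, outside the timed loop
bench.time_frame(cached)      # each timed call receives the cached frame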
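
Note (not part of the patch series): patches 2 and 3 switch AddOverflowScalar, Factorize and Duplicated to asv's parametrized form. With params and param_names set on the class, asv runs each time_* method once per listed value and passes that value as an extra argument to setup and to the benchmark method, so time_duplicated_int(self, keep) is timed separately for 'first', 'last' and False. The sketch below only mimics that calling convention with a plain-Python driver (asv itself does the driving in the benchmarks above); pd.Index stands in for Int64Index purely so the snippet also runs on newer pandas.

import timeit

import numpy as np
import pandas as pd


class Duplicated(object):
    # asv benchmarks the class once per entry in params, passing the value
    # through to setup and to every time_* method as ``keep``.
    params = ['first', 'last', False]
    param_names = ['keep']

    def setup(self, keep):
        N = 10**5
        self.int_idx = pd.Index(np.arange(N).repeat(5))

    def time_duplicated_int(self, keep):
        self.int_idx.duplicated(keep=keep)


for keep in Duplicated.params:
    bench = Duplicated()
    bench.setup(keep)
    elapsed = timeit.timeit(lambda: bench.time_duplicated_int(keep), number=10)
    print('keep=%r: %.4f s for 10 calls' % (keep, elapsed))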