diff --git a/asv_bench/benchmarks/algos/__init__.py b/asv_bench/benchmarks/algos/__init__.py
new file mode 100644
index 0000000000000..97c9ab09b9c6b
--- /dev/null
+++ b/asv_bench/benchmarks/algos/__init__.py
@@ -0,0 +1,12 @@
+"""
+algos/ directory is intended for individual functions from core.algorithms
+
+In many cases these algorithms are reachable in multiple ways:
+    algos.foo(x, y)
+    Series(x).foo(y)
+    Index(x).foo(y)
+    pd.array(x).foo(y)
+
+In most cases we profile the Series variant directly, trusting the performance
+of the others to be highly correlated.
+"""
diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py
new file mode 100644
index 0000000000000..5d81d9d0d45a3
--- /dev/null
+++ b/asv_bench/benchmarks/algos/isin.py
@@ -0,0 +1,325 @@
+import numpy as np
+
+from pandas.compat.numpy import np_version_under1p20
+
+from pandas import (
+    Categorical,
+    NaT,
+    Series,
+    date_range,
+)
+
+
+class IsIn:
+    """Series.isin per dtype with list, Categorical, empty and mismatched values."""
+
+    params = [
+        "int64",
+        "uint64",
+        "object",
+        "Int64",
+        "boolean",
+        "bool",
+        "datetime64[ns]",
+        "category[object]",
+        "category[int]",
+    ]
+    param_names = ["dtype"]
+
+    def setup(self, dtype):
+        N = 10000
+
+        self.mismatched = [NaT.to_datetime64()] * 2
+
+        if dtype in ["boolean", "bool"]:
+            self.series = Series(np.random.randint(0, 2, N)).astype(dtype)
+            self.values = [True, False]
+
+        elif dtype == "datetime64[ns]":
+            # Note: values here is much larger than non-dt64ns cases
+
+            # dti has length=115777
+            dti = date_range(start="2015-10-26", end="2016-01-01", freq="50s")
+            self.series = Series(dti)
+            self.values = self.series._values[::3]
+            self.mismatched = [1, 2]
+
+        elif dtype in ["category[object]", "category[int]"]:
+            # Note: sizes are different in this case than others
+            np.random.seed(1234)
+
+            n = 5 * 10 ** 5
+            sample_size = 100
+
+            arr = list(np.random.randint(0, n // 10, size=n))
+            if dtype == "category[object]":
+                arr = [f"s{i:04d}" for i in arr]
+
+            self.values = np.random.choice(arr, sample_size)
+            self.series = Series(arr).astype("category")
+
+        else:
+            self.series = Series(np.random.randint(1, 10, N)).astype(dtype)
+            self.values = [1, 2]
+
+        self.cat_values = Categorical(self.values)
+
+    def time_isin(self, dtype):
+        self.series.isin(self.values)
+
+    def time_isin_categorical(self, dtype):
+        self.series.isin(self.cat_values)
+
+    def time_isin_empty(self, dtype):
+        self.series.isin([])
+
+    def time_isin_mismatched_dtype(self, dtype):
+        self.series.isin(self.mismatched)
+
+
+class IsinAlmostFullWithRandomInt:
+    """Series.isin on random ints, with values inside or shifted outside [0, M)."""
+    params = [
+        [np.float64, np.int64, np.uint64, np.object_],
+        range(10, 21),
+        ["inside", "outside"],
+    ]
+    param_names = ["dtype", "exponent", "title"]
+
+    def setup(self, dtype, exponent, title):
+        M = 3 * 2 ** (exponent - 2)
+        # 0.77-the maximal share of occupied buckets
+        np.random.seed(42)
+        self.series = Series(np.random.randint(0, M, M)).astype(dtype)
+
+        values = np.random.randint(0, M, M).astype(dtype)
+        if title == "inside":
+            self.values = values
+        elif title == "outside":
+            self.values = values + M
+        else:
+            raise ValueError(title)
+
+    def time_isin(self, dtype, exponent, title):
+        self.series.isin(self.values)
+
+
+class IsinWithRandomFloat:
+    """Series.isin on random floats against the same values shuffled (or offset)."""
+    params = [
+        [np.float64, np.object_],
+        [
+            1_300,
+            2_000,
+            7_000,
+            8_000,
+            70_000,
+            80_000,
+            750_000,
+            900_000,
+        ],
+        ["inside", "outside"],
+    ]
+    param_names = ["dtype", "size", "title"]
+
+    def setup(self, dtype, size, title):
+        np.random.seed(42)
+        self.values = np.random.rand(size)
+        self.series = Series(self.values).astype(dtype)
+        np.random.shuffle(self.values)
+
+        if title == "outside":
+            self.values = self.values + 0.1
+
+    def time_isin(self, dtype, size, title):
+        self.series.isin(self.values)
+
+
+class IsinWithArangeSorted:
+    """Series.isin where series and values are the same sorted arange."""
+    params = [
+        [np.float64, np.int64, np.uint64, np.object_],
+        [
+            1_000,
+            2_000,
+            8_000,
+            100_000,
+            1_000_000,
+        ],
+    ]
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        self.series = Series(np.arange(size)).astype(dtype)
+        self.values = np.arange(size).astype(dtype)
+
+    def time_isin(self, dtype, size):
+        self.series.isin(self.values)
+
+
+class IsinWithArange:
+    """Series.isin of 10**6 random ints from an offset range against arange(M)."""
+    params = [
+        [np.float64, np.int64, np.uint64, np.object_],
+        [
+            1_000,
+            2_000,
+            8_000,
+        ],
+        [-2, 0, 2],
+    ]
+    param_names = ["dtype", "M", "offset_factor"]
+
+    def setup(self, dtype, M, offset_factor):
+        offset = int(M * offset_factor)
+        np.random.seed(42)
+        tmp = Series(np.random.randint(offset, M + offset, 10 ** 6))
+        self.series = tmp.astype(dtype)
+        self.values = np.arange(M).astype(dtype)
+
+    def time_isin(self, dtype, M, offset_factor):
+        self.series.isin(self.values)
+
+
+class IsInFloat64:
+    """Series.isin of a two-element series against large float64 value arrays."""
+
+    params = [
+        [np.float64, "Float64"],
+        ["many_different_values", "few_different_values", "only_nans_values"],
+    ]
+    param_names = ["dtype", "title"]
+
+    def setup(self, dtype, title):
+        N_many = 10 ** 5
+        N_few = 10 ** 6
+        self.series = Series([1, 2], dtype=dtype)
+
+        if title == "many_different_values":
+            # runtime is dominated by creation of the lookup-table
+            self.values = np.arange(N_many, dtype=np.float64)
+        elif title == "few_different_values":
+            # runtime is dominated by creation of the lookup-table
+            self.values = np.zeros(N_few, dtype=np.float64)
+        elif title == "only_nans_values":
+            # runtime is dominated by creation of the lookup-table
+            self.values = np.full(N_few, np.nan, dtype=np.float64)
+        else:
+            raise ValueError(title)
+
+    def time_isin(self, dtype, title):
+        self.series.isin(self.values)
+
+
+class IsInForObjects:
+    """
+    A subset of the cartesian product of cases have special motivations:
+
+    "nans" x "nans"
+        if nan-objects are different objects,
+        this has the potential to trigger O(n^2) running time
+
+    "short" x "long"
+        running time dominated by the preprocessing
+
+    "long" x "short"
+        running time dominated by look-up
+
+    "long" x "long"
+        no dominating part
+
+    "long_floats" x "long_floats"
+        because of nans floats are special
+        no dominating part
+
+    """
+
+    variants = ["nans", "short", "long", "long_floats"]
+
+    params = [variants, variants]
+    param_names = ["series_type", "vals_type"]
+
+    def setup(self, series_type, vals_type):
+        N_many = 10 ** 5
+
+        if series_type == "nans":
+            ser_vals = np.full(10 ** 4, np.nan)
+        elif series_type == "short":
+            ser_vals = np.arange(2)
+        elif series_type == "long":
+            ser_vals = np.arange(N_many)
+        elif series_type == "long_floats":
+            ser_vals = np.arange(N_many, dtype=np.float64)
+
+        self.series = Series(ser_vals).astype(object)
+
+        if vals_type == "nans":
+            values = np.full(10 ** 4, np.nan)
+        elif vals_type == "short":
+            values = np.arange(2)
+        elif vals_type == "long":
+            values = np.arange(N_many)
+        elif vals_type == "long_floats":
+            values = np.arange(N_many, dtype=np.float64)
+
+        self.values = values.astype(object)
+
+    def time_isin(self, series_type, vals_type):
+        self.series.isin(self.values)
+
+
+class IsInLongSeriesLookUpDominates:
+    """Series.isin of a long (10**7) series against the short arange(MaxNumber)."""
+    params = [
+        ["int64", "int32", "float64", "float32", "object", "Int64", "Float64"],
+        [5, 1000],
+        ["random_hits", "random_misses", "monotone_hits", "monotone_misses"],
+    ]
+    param_names = ["dtype", "MaxNumber", "series_type"]
+
+    def setup(self, dtype, MaxNumber, series_type):
+        N = 10 ** 7
+
+        if not np_version_under1p20 and dtype in ("Int64", "Float64"):
+            raise NotImplementedError
+
+        if series_type == "random_hits":
+            np.random.seed(42)
+            array = np.random.randint(0, MaxNumber, N)
+        if series_type == "random_misses":
+            np.random.seed(42)
+            array = np.random.randint(0, MaxNumber, N) + MaxNumber
+        if series_type == "monotone_hits":
+            array = np.repeat(np.arange(MaxNumber), N // MaxNumber)
+        if series_type == "monotone_misses":
+            array = np.arange(N) + MaxNumber
+
+        self.series = Series(array).astype(dtype)
+        self.values = np.arange(MaxNumber).astype(dtype)
+
+    def time_isin(self, dtype, MaxNumber, series_type):
+        self.series.isin(self.values)
+
+
+class IsInLongSeriesValuesDominate:
+    """Series.isin of a 10**6 + 1 element series against 10**7 values."""
+    params = [
+        ["int64", "int32", "float64", "float32", "object", "Int64", "Float64"],
+        ["random", "monotone"],
+    ]
+    param_names = ["dtype", "series_type"]
+
+    def setup(self, dtype, series_type):
+        N = 10 ** 7
+        if series_type == "random":
+            np.random.seed(42)
+            vals = np.random.randint(0, 10 * N, N)
+        if series_type == "monotone":
+            vals = np.arange(N)
+
+        self.values = vals.astype(dtype)
+        M = 10 ** 6 + 1
+        self.series = Series(np.arange(M)).astype(dtype)
+
+    def time_isin(self, dtype, series_type):
+        self.series.isin(self.values)
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
index 4e32b6e496929..268f25c3d12e3 100644
--- a/asv_bench/benchmarks/categoricals.py
+++ b/asv_bench/benchmarks/categoricals.py
@@ -220,25 +220,6 @@ def time_rank_int_cat_ordered(self):
         self.s_int_cat_ordered.rank()
 
 
-class Isin:
-
-    params = ["object", "int64"]
-    param_names = ["dtype"]
-
-    def setup(self, dtype):
-        np.random.seed(1234)
-        n = 5 * 10 ** 5
-        sample_size = 100
-        arr = list(np.random.randint(0, n // 10, size=n))
-        if dtype == "object":
-            arr = [f"s{i:04d}" for i in arr]
-        self.sample = np.random.choice(arr, sample_size)
-        self.series = pd.Series(arr).astype("category")
-
-    def time_isin_categorical(self, dtype):
-        self.series.isin(self.sample)
-
-
 class IsMonotonic:
     def setup(self):
         N = 1000
diff --git a/asv_bench/benchmarks/hash_functions.py b/asv_bench/benchmarks/hash_functions.py
index 3743882b936e2..394433f7c8f99 100644
--- a/asv_bench/benchmarks/hash_functions.py
+++ b/asv_bench/benchmarks/hash_functions.py
@@ -3,28 +3,6 @@
 import pandas as pd
 
 
-class IsinAlmostFullWithRandomInt:
-    params = [
-        [np.float64, np.int64, np.uint64, np.object],
-        range(10, 21),
-    ]
-    param_names = ["dtype", "exponent"]
-
-    def setup(self, dtype, exponent):
-        M = 3 * 2 ** (exponent - 2)
-        # 0.77-the maximal share of occupied buckets
-        np.random.seed(42)
-        self.s = pd.Series(np.random.randint(0, M, M)).astype(dtype)
-        self.values = np.random.randint(0, M, M).astype(dtype)
-        self.values_outside = self.values + M
-
-    def time_isin(self, dtype, exponent):
-        self.s.isin(self.values)
-
-    def time_isin_outside(self, dtype, exponent):
-        self.s.isin(self.values_outside)
-
-
 class UniqueForLargePyObjectInts:
     def setup(self):
         lst = [x << 32 for x in range(5000)]
@@ -34,80 +12,6 @@ def time_unique(self):
         pd.unique(self.arr)
 
 
-class IsinWithRandomFloat:
-    params = [
-        [np.float64, np.object],
-        [
-            1_300,
-            2_000,
-            7_000,
-            8_000,
-            70_000,
-            80_000,
-            750_000,
-            900_000,
-        ],
-    ]
-    param_names = ["dtype", "M"]
-
-    def setup(self, dtype, M):
-        np.random.seed(42)
-        self.values = np.random.rand(M)
-        self.s = pd.Series(self.values).astype(dtype)
-        np.random.shuffle(self.values)
-        self.values_outside = self.values + 0.1
-
-    def time_isin(self, dtype, M):
-        self.s.isin(self.values)
-
-    def time_isin_outside(self, dtype, M):
-        self.s.isin(self.values_outside)
-
-
-class IsinWithArangeSorted:
-    params = [
-        [np.float64, np.int64, np.uint64, np.object],
-        [
-            1_000,
-            2_000,
-            8_000,
-            100_000,
-            1_000_000,
-        ],
-    ]
-    param_names = ["dtype", "M"]
-
-    def setup(self, dtype, M):
-        self.s = pd.Series(np.arange(M)).astype(dtype)
-        self.values = np.arange(M).astype(dtype)
-
-    def time_isin(self, dtype, M):
-        self.s.isin(self.values)
-
-
-class IsinWithArange:
-    params = [
-        [np.float64, np.int64, np.uint64, np.object],
-        [
-            1_000,
-            2_000,
-            8_000,
-        ],
-        [-2, 0, 2],
-    ]
-    param_names = ["dtype", "M", "offset_factor"]
-
-    def setup(self, dtype, M, offset_factor):
-        offset = int(M * offset_factor)
-        np.random.seed(42)
-        tmp = pd.Series(np.random.randint(offset, M + offset, 10 ** 6))
-        self.s = tmp.astype(dtype)
-        self.values = np.arange(M).astype(dtype)
-
-    def time_isin(self, dtype, M, offset_factor):
-        self.s.isin(self.values)
-
-
 class Float64GroupIndex:
     # GH28303
     def setup(self):
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index a6bffb1585f2a..d05a28e0873d0 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -2,10 +2,7 @@
 
 import numpy as np
 
-from pandas.compat.numpy import np_version_under1p20
-
 from pandas import (
-    Categorical,
     NaT,
     Series,
     date_range,
@@ -30,169 +27,6 @@ def time_constructor(self, data):
         Series(data=self.data, index=self.idx)
 
 
-class IsIn:
-
-    params = ["int64", "uint64", "object", "Int64"]
-    param_names = ["dtype"]
-
-    def setup(self, dtype):
-        N = 10000
-        self.s = Series(np.random.randint(1, 10, N)).astype(dtype)
-        self.values = [1, 2]
-
-    def time_isin(self, dtypes):
-        self.s.isin(self.values)
-
-
-class IsInBoolean:
-
-    params = ["boolean", "bool"]
-    param_names = ["dtype"]
-
-    def setup(self, dtype):
-        N = 10000
-        self.s = Series(np.random.randint(0, 2, N)).astype(dtype)
-        self.values = [True, False]
-
-    def time_isin(self, dtypes):
-        self.s.isin(self.values)
-
-
-class IsInDatetime64:
-    def setup(self):
-        dti = date_range(
-            start=datetime(2015, 10, 26), end=datetime(2016, 1, 1), freq="50s"
-        )
-        self.ser = Series(dti)
-        self.subset = self.ser._values[::3]
-        self.cat_subset = Categorical(self.subset)
-
-    def time_isin(self):
-        self.ser.isin(self.subset)
-
-    def time_isin_cat_values(self):
-        self.ser.isin(self.cat_subset)
-
-    def time_isin_mismatched_dtype(self):
-        self.ser.isin([1, 2])
-
-    def time_isin_empty(self):
-        self.ser.isin([])
-
-
-class IsInFloat64:
-
-    params = [np.float64, "Float64"]
-    param_names = ["dtype"]
-
-    def setup(self, dtype):
-        N_many = 10 ** 5
-        N_few = 10 ** 6
-        self.small = Series([1, 2], dtype=dtype)
-        self.many_different_values = np.arange(N_many, dtype=np.float64)
-        self.few_different_values = np.zeros(N_few, dtype=np.float64)
-        self.only_nans_values = np.full(N_few, np.nan, dtype=np.float64)
-
-    def time_isin_many_different(self, dtypes):
-        # runtime is dominated by creation of the lookup-table
-        self.small.isin(self.many_different_values)
-
-    def time_isin_few_different(self, dtypes):
-        # runtime is dominated by creation of the lookup-table
-        self.small.isin(self.few_different_values)
-
-    def time_isin_nan_values(self, dtypes):
-        # runtime is dominated by creation of the lookup-table
-        self.small.isin(self.few_different_values)
-
-
-class IsInForObjects:
-    def setup(self):
-        self.s_nans = Series(np.full(10 ** 4, np.nan)).astype(object)
-        self.vals_nans = np.full(10 ** 4, np.nan).astype(object)
-        self.s_short = Series(np.arange(2)).astype(object)
-        self.s_long = Series(np.arange(10 ** 5)).astype(object)
-        self.vals_short = np.arange(2).astype(object)
-        self.vals_long = np.arange(10 ** 5).astype(object)
-        # because of nans floats are special:
-        self.s_long_floats = Series(np.arange(10 ** 5, dtype=np.float_)).astype(object)
-        self.vals_long_floats = np.arange(10 ** 5, dtype=np.float_).astype(object)
-
-    def time_isin_nans(self):
-        # if nan-objects are different objects,
-        # this has the potential to trigger O(n^2) running time
-        self.s_nans.isin(self.vals_nans)
-
-    def time_isin_short_series_long_values(self):
-        # running time dominated by the preprocessing
-        self.s_short.isin(self.vals_long)
-
-    def time_isin_long_series_short_values(self):
-        # running time dominated by look-up
-        self.s_long.isin(self.vals_short)
-
-    def time_isin_long_series_long_values(self):
-        # no dominating part
-        self.s_long.isin(self.vals_long)
-
-    def time_isin_long_series_long_values_floats(self):
-        # no dominating part
-        self.s_long_floats.isin(self.vals_long_floats)
-
-
-class IsInLongSeriesLookUpDominates:
-    params = [
-        ["int64", "int32", "float64", "float32", "object", "Int64", "Float64"],
-        [5, 1000],
-        ["random_hits", "random_misses", "monotone_hits", "monotone_misses"],
-    ]
-    param_names = ["dtype", "MaxNumber", "series_type"]
-
-    def setup(self, dtype, MaxNumber, series_type):
-        N = 10 ** 7
-
-        if not np_version_under1p20 and dtype in ("Int64", "Float64"):
-            raise NotImplementedError
-
-        if series_type == "random_hits":
-            np.random.seed(42)
-            array = np.random.randint(0, MaxNumber, N)
-        if series_type == "random_misses":
-            np.random.seed(42)
-            array = np.random.randint(0, MaxNumber, N) + MaxNumber
-        if series_type == "monotone_hits":
-            array = np.repeat(np.arange(MaxNumber), N // MaxNumber)
-        if series_type == "monotone_misses":
-            array = np.arange(N) + MaxNumber
-        self.series = Series(array).astype(dtype)
-        self.values = np.arange(MaxNumber).astype(dtype)
-
-    def time_isin(self, dtypes, MaxNumber, series_type):
-        self.series.isin(self.values)
-
-
-class IsInLongSeriesValuesDominate:
-    params = [
-        ["int64", "int32", "float64", "float32", "object", "Int64", "Float64"],
-        ["random", "monotone"],
-    ]
-    param_names = ["dtype", "series_type"]
-
-    def setup(self, dtype, series_type):
-        N = 10 ** 7
-        if series_type == "random":
-            np.random.seed(42)
-            vals = np.random.randint(0, 10 * N, N)
-        if series_type == "monotone":
-            vals = np.arange(N)
-        self.values = vals.astype(dtype)
-        M = 10 ** 6 + 1
-        self.series = Series(np.arange(M)).astype(dtype)
-
-    def time_isin(self, dtypes, series_type):
-        self.series.isin(self.values)
-
-
 class NSort:
 
     params = ["first", "last", "all"]