|
1 | 1 | import itertools
|
2 | 2 |
|
3 |
| -from .pandas_vb_common import * |
| 3 | +import numpy as np |
4 | 4 | import scipy.sparse
|
5 |
| -from pandas import SparseSeries, SparseDataFrame, SparseArray |
| 5 | +from pandas import (SparseSeries, SparseDataFrame, SparseArray, Series, |
| 6 | + date_range, MultiIndex) |
6 | 7 |
|
| 8 | +from .pandas_vb_common import setup # noqa |
7 | 9 |
|
8 |
| -class sparse_series_to_frame(object): |
9 |
| - goal_time = 0.2 |
10 | 10 |
|
11 |
| - def setup(self): |
12 |
| - self.K = 50 |
13 |
| - self.N = 50000 |
14 |
| - self.rng = np.asarray(date_range('1/1/2000', periods=self.N, freq='T')) |
15 |
| - self.series = {} |
16 |
| - for i in range(1, (self.K + 1)): |
17 |
| - self.data = np.random.randn(self.N)[:(- i)] |
18 |
| - self.this_rng = self.rng[:(- i)] |
19 |
| - self.data[100:] = np.nan |
20 |
| - self.series[i] = SparseSeries(self.data, index=self.this_rng) |
| 11 | +def make_array(size, dense_proportion, fill_value, dtype): |
| 12 | + dense_size = int(size * dense_proportion) |
| 13 | + arr = np.full(size, fill_value, dtype) |
| 14 | + indexer = np.random.choice(np.arange(size), dense_size, replace=False) |
| 15 | + arr[indexer] = np.random.choice(np.arange(100, dtype=dtype), dense_size) |
| 16 | + return arr |
21 | 17 |
|
22 |
| - def time_sparse_series_to_frame(self): |
23 |
| - SparseDataFrame(self.series) |
24 | 18 |
|
| 19 | +class SparseSeriesToFrame(object): |
25 | 20 |
|
26 |
| -class sparse_array_constructor(object): |
27 | 21 | goal_time = 0.2
|
28 | 22 |
|
29 | 23 | def setup(self):
|
30 |
| - np.random.seed(1) |
31 |
| - self.int64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=0, dtype=np.int64) |
32 |
| - self.int64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=0, dtype=np.int64) |
33 |
| - |
34 |
| - self.float64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=np.nan, dtype=np.float64) |
35 |
| - self.float64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=np.nan, dtype=np.float64) |
36 |
| - |
37 |
| - self.object_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=np.nan) |
38 |
| - self.object_nan_fill_value_1percent = self.make_object_array(length=1000000, dense_size=10000, fill_value=np.nan) |
39 |
| - |
40 |
| - self.object_non_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=0) |
41 |
| - self.object_non_nan_fill_value_1percent = self.make_object_array(length=1000000, dense_size=10000, fill_value=0) |
42 |
| - |
43 |
| - def make_numeric_array(self, length, dense_size, fill_value, dtype): |
44 |
| - arr = np.array([fill_value] * length, dtype=dtype) |
45 |
| - indexer = np.unique(np.random.randint(0, length, dense_size)) |
46 |
| - arr[indexer] = np.random.randint(0, 100, len(indexer)) |
47 |
| - return (arr, fill_value, dtype) |
48 |
| - |
49 |
| - def make_object_array(self, length, dense_size, fill_value): |
50 |
| - elems = np.array(['a', 0.0, False, 1, 2], dtype=np.object) |
51 |
| - arr = np.array([fill_value] * length, dtype=np.object) |
52 |
| - indexer = np.unique(np.random.randint(0, length, dense_size)) |
53 |
| - arr[indexer] = np.random.choice(elems, len(indexer)) |
54 |
| - return (arr, fill_value, np.object) |
55 |
| - |
56 |
| - def time_sparse_array_constructor_int64_10percent(self): |
57 |
| - arr, fill_value, dtype = self.int64_10percent |
58 |
| - SparseArray(arr, fill_value=fill_value, dtype=dtype) |
59 |
| - |
60 |
| - def time_sparse_array_constructor_int64_1percent(self): |
61 |
| - arr, fill_value, dtype = self.int64_1percent |
62 |
| - SparseArray(arr, fill_value=fill_value, dtype=dtype) |
63 |
| - |
64 |
| - def time_sparse_array_constructor_float64_10percent(self): |
65 |
| - arr, fill_value, dtype = self.float64_10percent |
66 |
| - SparseArray(arr, fill_value=fill_value, dtype=dtype) |
67 |
| - |
68 |
| - def time_sparse_array_constructor_float64_1percent(self): |
69 |
| - arr, fill_value, dtype = self.float64_1percent |
70 |
| - SparseArray(arr, fill_value=fill_value, dtype=dtype) |
71 |
| - |
72 |
| - def time_sparse_array_constructor_object_nan_fill_value_10percent(self): |
73 |
| - arr, fill_value, dtype = self.object_nan_fill_value_10percent |
74 |
| - SparseArray(arr, fill_value=fill_value, dtype=dtype) |
75 |
| - |
76 |
| - def time_sparse_array_constructor_object_nan_fill_value_1percent(self): |
77 |
| - arr, fill_value, dtype = self.object_nan_fill_value_1percent |
78 |
| - SparseArray(arr, fill_value=fill_value, dtype=dtype) |
| 24 | + K = 50 |
| 25 | + N = 50001 |
| 26 | + rng = date_range('1/1/2000', periods=N, freq='T') |
| 27 | + self.series = {} |
| 28 | + for i in range(1, K): |
| 29 | + data = np.random.randn(N)[:-i] |
| 30 | + idx = rng[:-i] |
| 31 | + data[100:] = np.nan |
| 32 | + self.series[i] = SparseSeries(data, index=idx) |
79 | 33 |
|
80 |
| - def time_sparse_array_constructor_object_non_nan_fill_value_10percent(self): |
81 |
| - arr, fill_value, dtype = self.object_non_nan_fill_value_10percent |
82 |
| - SparseArray(arr, fill_value=fill_value, dtype=dtype) |
| 34 | + def time_series_to_frame(self): |
| 35 | + SparseDataFrame(self.series) |
83 | 36 |
|
84 |
| - def time_sparse_array_constructor_object_non_nan_fill_value_1percent(self): |
85 |
| - arr, fill_value, dtype = self.object_non_nan_fill_value_1percent |
86 |
| - SparseArray(arr, fill_value=fill_value, dtype=dtype) |
87 | 37 |
|
| 38 | +class SparseArrayConstructor(object): |
88 | 39 |
|
89 |
| -class sparse_frame_constructor(object): |
90 | 40 | goal_time = 0.2
|
| 41 | + params = ([0.1, 0.01], [0, np.nan], |
| 42 | + [np.int64, np.float64, np.object]) |
| 43 | + param_names = ['dense_proportion', 'fill_value', 'dtype'] |
91 | 44 |
|
92 |
| - def time_sparse_frame_constructor(self): |
93 |
| - SparseDataFrame(columns=np.arange(100), index=np.arange(1000)) |
| 45 | + def setup(self, dense_proportion, fill_value, dtype): |
| 46 | + N = 10**6 |
| 47 | + self.array = make_array(N, dense_proportion, fill_value, dtype) |
94 | 48 |
|
95 |
| - def time_sparse_from_scipy(self): |
96 |
| - SparseDataFrame(scipy.sparse.rand(1000, 1000, 0.005)) |
| 49 | + def time_sparse_array(self, dense_proportion, fill_value, dtype): |
| 50 | + SparseArray(self.array, fill_value=fill_value, dtype=dtype) |
97 | 51 |
|
98 |
| - def time_sparse_from_dict(self): |
99 |
| - SparseDataFrame(dict(zip(range(1000), itertools.repeat([0])))) |
100 | 52 |
|
| 53 | +class SparseDataFrameConstructor(object): |
101 | 54 |
|
102 |
| -class sparse_series_from_coo(object): |
103 | 55 | goal_time = 0.2
|
104 | 56 |
|
105 | 57 | def setup(self):
|
106 |
| - self.A = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(100, 100)) |
| 58 | + N = 1000 |
| 59 | + self.arr = np.arange(N) |
| 60 | + self.sparse = scipy.sparse.rand(N, N, 0.005) |
| 61 | + self.dict = dict(zip(range(N), itertools.repeat([0]))) |
107 | 62 |
|
108 |
| - def time_sparse_series_from_coo(self): |
109 |
| - self.ss = SparseSeries.from_coo(self.A) |
| 63 | + def time_constructor(self): |
| 64 | + SparseDataFrame(columns=self.arr, index=self.arr) |
110 | 65 |
|
| 66 | + def time_from_scipy(self): |
| 67 | + SparseDataFrame(self.sparse) |
111 | 68 |
|
112 |
| -class sparse_series_to_coo(object): |
113 |
| - goal_time = 0.2 |
| 69 | + def time_from_dict(self): |
| 70 | + SparseDataFrame(self.dict) |
114 | 71 |
|
115 |
| - def setup(self): |
116 |
| - self.s = pd.Series(([np.nan] * 10000)) |
117 |
| - self.s[0] = 3.0 |
118 |
| - self.s[100] = (-1.0) |
119 |
| - self.s[999] = 12.1 |
120 |
| - self.s.index = pd.MultiIndex.from_product((range(10), range(10), range(10), range(10))) |
121 |
| - self.ss = self.s.to_sparse() |
122 |
| - |
123 |
| - def time_sparse_series_to_coo(self): |
124 |
| - self.ss.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True) |
125 | 72 |
|
| 73 | +class FromCoo(object): |
126 | 74 |
|
127 |
| -class sparse_arithmetic_int(object): |
128 | 75 | goal_time = 0.2
|
129 | 76 |
|
130 | 77 | def setup(self):
|
131 |
| - np.random.seed(1) |
132 |
| - self.a_10percent = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=np.nan) |
133 |
| - self.b_10percent = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=np.nan) |
134 |
| - |
135 |
| - self.a_10percent_zero = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=0) |
136 |
| - self.b_10percent_zero = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=0) |
137 |
| - |
138 |
| - self.a_1percent = self.make_sparse_array(length=1000000, dense_size=10000, fill_value=np.nan) |
139 |
| - self.b_1percent = self.make_sparse_array(length=1000000, dense_size=10000, fill_value=np.nan) |
140 |
| - |
141 |
| - def make_sparse_array(self, length, dense_size, fill_value): |
142 |
| - arr = np.array([fill_value] * length, dtype=np.float64) |
143 |
| - indexer = np.unique(np.random.randint(0, length, dense_size)) |
144 |
| - arr[indexer] = np.random.randint(0, 100, len(indexer)) |
145 |
| - return pd.SparseArray(arr, fill_value=fill_value) |
146 |
| - |
147 |
| - def time_sparse_make_union(self): |
148 |
| - self.a_10percent.sp_index.make_union(self.b_10percent.sp_index) |
| 78 | + self.matrix = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], |
| 79 | + ([1, 0, 0], [0, 2, 3])), |
| 80 | + shape=(100, 100)) |
149 | 81 |
|
150 |
| - def time_sparse_intersect(self): |
151 |
| - self.a_10percent.sp_index.intersect(self.b_10percent.sp_index) |
152 |
| - |
153 |
| - def time_sparse_addition_10percent(self): |
154 |
| - self.a_10percent + self.b_10percent |
| 82 | + def time_sparse_series_from_coo(self): |
| 83 | + SparseSeries.from_coo(self.matrix) |
155 | 84 |
|
156 |
| - def time_sparse_addition_10percent_zero(self): |
157 |
| - self.a_10percent_zero + self.b_10percent_zero |
158 | 85 |
|
159 |
| - def time_sparse_addition_1percent(self): |
160 |
| - self.a_1percent + self.b_1percent |
| 86 | +class ToCoo(object): |
161 | 87 |
|
162 |
| - def time_sparse_division_10percent(self): |
163 |
| - self.a_10percent / self.b_10percent |
| 88 | + goal_time = 0.2 |
164 | 89 |
|
165 |
| - def time_sparse_division_10percent_zero(self): |
166 |
| - self.a_10percent_zero / self.b_10percent_zero |
| 90 | + def setup(self): |
| 91 | + s = Series([np.nan] * 10000) |
| 92 | + s[0] = 3.0 |
| 93 | + s[100] = -1.0 |
| 94 | + s[999] = 12.1 |
| 95 | + s.index = MultiIndex.from_product([range(10)] * 4) |
| 96 | + self.ss = s.to_sparse() |
167 | 97 |
|
168 |
| - def time_sparse_division_1percent(self): |
169 |
| - self.a_1percent / self.b_1percent |
| 98 | + def time_sparse_series_to_coo(self): |
| 99 | + self.ss.to_coo(row_levels=[0, 1], |
| 100 | + column_levels=[2, 3], |
| 101 | + sort_labels=True) |
170 | 102 |
|
171 | 103 |
|
| 104 | +class Arithmetic(object): |
172 | 105 |
|
173 |
| -class sparse_arithmetic_block(object): |
174 | 106 | goal_time = 0.2
|
| 107 | + params = ([0.1, 0.01], [0, np.nan]) |
| 108 | + param_names = ['dense_proportion', 'fill_value'] |
175 | 109 |
|
176 |
| - def setup(self): |
177 |
| - np.random.seed(1) |
178 |
| - self.a = self.make_sparse_array(length=1000000, num_blocks=1000, |
179 |
| - block_size=10, fill_value=np.nan) |
180 |
| - self.b = self.make_sparse_array(length=1000000, num_blocks=1000, |
181 |
| - block_size=10, fill_value=np.nan) |
182 |
| - |
183 |
| - self.a_zero = self.make_sparse_array(length=1000000, num_blocks=1000, |
184 |
| - block_size=10, fill_value=0) |
185 |
| - self.b_zero = self.make_sparse_array(length=1000000, num_blocks=1000, |
186 |
| - block_size=10, fill_value=np.nan) |
| 110 | + def setup(self, dense_proportion, fill_value): |
| 111 | + N = 10**6 |
| 112 | + arr1 = make_array(N, dense_proportion, fill_value, np.int64) |
| 113 | + self.array1 = SparseArray(arr1, fill_value=fill_value) |
| 114 | + arr2 = make_array(N, dense_proportion, fill_value, np.int64) |
| 115 | + self.array2 = SparseArray(arr2, fill_value=fill_value) |
187 | 116 |
|
188 |
| - def make_sparse_array(self, length, num_blocks, block_size, fill_value): |
189 |
| - a = np.array([fill_value] * length) |
190 |
| - for block in range(num_blocks): |
191 |
| - i = np.random.randint(0, length) |
192 |
| - a[i:i + block_size] = np.random.randint(0, 100, len(a[i:i + block_size])) |
193 |
| - return pd.SparseArray(a, fill_value=fill_value) |
| 117 | + def time_make_union(self, dense_proportion, fill_value): |
| 118 | + self.array1.sp_index.make_union(self.array2.sp_index) |
194 | 119 |
|
195 |
| - def time_sparse_make_union(self): |
196 |
| - self.a.sp_index.make_union(self.b.sp_index) |
| 120 | + def time_intersect(self, dense_proportion, fill_value): |
| 121 | + self.array1.sp_index.intersect(self.array2.sp_index) |
197 | 122 |
|
198 |
| - def time_sparse_intersect(self): |
199 |
| - self.a.sp_index.intersect(self.b.sp_index) |
| 123 | + def time_add(self, dense_proportion, fill_value): |
| 124 | + self.array1 + self.array2 |
200 | 125 |
|
201 |
| - def time_sparse_addition(self): |
202 |
| - self.a + self.b |
| 126 | + def time_divide(self, dense_proportion, fill_value): |
| 127 | + self.array1 / self.array2 |
203 | 128 |
|
204 |
| - def time_sparse_addition_zero(self): |
205 |
| - self.a_zero + self.b_zero |
206 | 129 |
|
207 |
| - def time_sparse_division(self): |
208 |
| - self.a / self.b |
| 130 | +class ArithmeticBlock(object): |
209 | 131 |
|
210 |
| - def time_sparse_division_zero(self): |
211 |
| - self.a_zero / self.b_zero |
| 132 | + goal_time = 0.2 |
| 133 | + params = [np.nan, 0] |
| 134 | + param_names = ['fill_value'] |
| 135 | + |
| 136 | + def setup(self, fill_value): |
| 137 | + N = 10**6 |
| 138 | + self.arr1 = self.make_block_array(length=N, num_blocks=1000, |
| 139 | + block_size=10, fill_value=fill_value) |
| 140 | + self.arr2 = self.make_block_array(length=N, num_blocks=1000, |
| 141 | + block_size=10, fill_value=fill_value) |
| 142 | + |
| 143 | + def make_block_array(self, length, num_blocks, block_size, fill_value): |
| 144 | + arr = np.full(length, fill_value) |
| 145 | + indices = np.random.choice(np.arange(0, length, block_size), |
| 146 | + num_blocks, |
| 147 | + replace=False) |
| 148 | + for ind in indices: |
| 149 | + arr[ind:ind + block_size] = np.random.randint(0, 100, block_size) |
| 150 | + return SparseArray(arr, fill_value=fill_value) |
| 151 | + |
| 152 | + def time_make_union(self, fill_value): |
| 153 | + self.arr1.sp_index.make_union(self.arr2.sp_index) |
| 154 | + |
| 155 | + def time_intersect(self, fill_value): |
| 156 | + self.arr1.sp_index.intersect(self.arr2.sp_index) |
| 157 | + |
| 158 | + def time_addition(self, fill_value): |
| 159 | + self.arr1 + self.arr2 |
| 160 | + |
| 161 | + def time_division(self, fill_value): |
| 162 | + self.arr1 / self.arr2 |
0 commit comments