-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
CLN: ASV sparse #19047
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
CLN: ASV sparse #19047
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,211 +1,162 @@ | ||
import itertools | ||
|
||
from .pandas_vb_common import * | ||
import numpy as np | ||
import scipy.sparse | ||
from pandas import SparseSeries, SparseDataFrame, SparseArray | ||
from pandas import (SparseSeries, SparseDataFrame, SparseArray, Series, | ||
date_range, MultiIndex) | ||
|
||
from .pandas_vb_common import setup # noqa | ||
|
||
class sparse_series_to_frame(object): | ||
goal_time = 0.2 | ||
|
||
def setup(self): | ||
self.K = 50 | ||
self.N = 50000 | ||
self.rng = np.asarray(date_range('1/1/2000', periods=self.N, freq='T')) | ||
self.series = {} | ||
for i in range(1, (self.K + 1)): | ||
self.data = np.random.randn(self.N)[:(- i)] | ||
self.this_rng = self.rng[:(- i)] | ||
self.data[100:] = np.nan | ||
self.series[i] = SparseSeries(self.data, index=self.this_rng) | ||
def make_array(size, dense_proportion, fill_value, dtype):
    """Return a 1-D array of ``fill_value`` with random entries overwritten.

    Parameters
    ----------
    size : int
        Length of the returned array.
    dense_proportion : float
        Fraction of positions to overwrite; the count is
        ``int(size * dense_proportion)``.
    fill_value : scalar
        Value every position starts with.
    dtype : numpy dtype
        dtype of the returned array and of the values written into it.

    Returns
    -------
    numpy.ndarray
        Array of length ``size`` in which ``int(size * dense_proportion)``
        distinct positions hold values drawn from ``0..99``.
    """
    dense_size = int(size * dense_proportion)
    arr = np.full(size, fill_value, dtype)
    # An int population is sampled as if it were np.arange(size), so the
    # positions drawn are the same without allocating a throwaway index array.
    indexer = np.random.choice(size, dense_size, replace=False)
    arr[indexer] = np.random.choice(np.arange(100, dtype=dtype), dense_size)
    return arr
|
||
def time_sparse_series_to_frame(self): | ||
SparseDataFrame(self.series) | ||
|
||
class SparseSeriesToFrame(object): | ||
|
||
class sparse_array_constructor(object): | ||
goal_time = 0.2 | ||
|
||
def setup(self): | ||
np.random.seed(1) | ||
self.int64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=0, dtype=np.int64) | ||
self.int64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=0, dtype=np.int64) | ||
|
||
self.float64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=np.nan, dtype=np.float64) | ||
self.float64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=np.nan, dtype=np.float64) | ||
|
||
self.object_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=np.nan) | ||
self.object_nan_fill_value_1percent = self.make_object_array(length=1000000, dense_size=10000, fill_value=np.nan) | ||
|
||
self.object_non_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=0) | ||
self.object_non_nan_fill_value_1percent = self.make_object_array(length=1000000, dense_size=10000, fill_value=0) | ||
|
||
def make_numeric_array(self, length, dense_size, fill_value, dtype): | ||
arr = np.array([fill_value] * length, dtype=dtype) | ||
indexer = np.unique(np.random.randint(0, length, dense_size)) | ||
arr[indexer] = np.random.randint(0, 100, len(indexer)) | ||
return (arr, fill_value, dtype) | ||
|
||
def make_object_array(self, length, dense_size, fill_value): | ||
elems = np.array(['a', 0.0, False, 1, 2], dtype=np.object) | ||
arr = np.array([fill_value] * length, dtype=np.object) | ||
indexer = np.unique(np.random.randint(0, length, dense_size)) | ||
arr[indexer] = np.random.choice(elems, len(indexer)) | ||
return (arr, fill_value, np.object) | ||
|
||
def time_sparse_array_constructor_int64_10percent(self): | ||
arr, fill_value, dtype = self.int64_10percent | ||
SparseArray(arr, fill_value=fill_value, dtype=dtype) | ||
|
||
def time_sparse_array_constructor_int64_1percent(self): | ||
arr, fill_value, dtype = self.int64_1percent | ||
SparseArray(arr, fill_value=fill_value, dtype=dtype) | ||
|
||
def time_sparse_array_constructor_float64_10percent(self): | ||
arr, fill_value, dtype = self.float64_10percent | ||
SparseArray(arr, fill_value=fill_value, dtype=dtype) | ||
|
||
def time_sparse_array_constructor_float64_1percent(self): | ||
arr, fill_value, dtype = self.float64_1percent | ||
SparseArray(arr, fill_value=fill_value, dtype=dtype) | ||
|
||
def time_sparse_array_constructor_object_nan_fill_value_10percent(self): | ||
arr, fill_value, dtype = self.object_nan_fill_value_10percent | ||
SparseArray(arr, fill_value=fill_value, dtype=dtype) | ||
|
||
def time_sparse_array_constructor_object_nan_fill_value_1percent(self): | ||
arr, fill_value, dtype = self.object_nan_fill_value_1percent | ||
SparseArray(arr, fill_value=fill_value, dtype=dtype) | ||
K = 50 | ||
N = 50001 | ||
rng = date_range('1/1/2000', periods=N, freq='T') | ||
self.series = {} | ||
for i in range(1, K): | ||
data = np.random.randn(N)[:-i] | ||
idx = rng[:-i] | ||
data[100:] = np.nan | ||
self.series[i] = SparseSeries(data, index=idx) | ||
|
||
def time_sparse_array_constructor_object_non_nan_fill_value_10percent(self): | ||
arr, fill_value, dtype = self.object_non_nan_fill_value_10percent | ||
SparseArray(arr, fill_value=fill_value, dtype=dtype) | ||
def time_series_to_frame(self): | ||
SparseDataFrame(self.series) | ||
|
||
def time_sparse_array_constructor_object_non_nan_fill_value_1percent(self): | ||
arr, fill_value, dtype = self.object_non_nan_fill_value_1percent | ||
SparseArray(arr, fill_value=fill_value, dtype=dtype) | ||
|
||
class SparseArrayConstructor(object): | ||
|
||
class sparse_frame_constructor(object): | ||
goal_time = 0.2 | ||
params = ([0.1, 0.01], [0, np.nan], | ||
[np.int64, np.float64, np.object]) | ||
param_names = ['dense_proportion', 'fill_value', 'dtype'] | ||
|
||
def time_sparse_frame_constructor(self): | ||
SparseDataFrame(columns=np.arange(100), index=np.arange(1000)) | ||
def setup(self, dense_proportion, fill_value, dtype): | ||
N = 10**6 | ||
self.array = make_array(N, dense_proportion, fill_value, dtype) | ||
|
||
def time_sparse_from_scipy(self): | ||
SparseDataFrame(scipy.sparse.rand(1000, 1000, 0.005)) | ||
def time_sparse_array(self, dense_proportion, fill_value, dtype): | ||
SparseArray(self.array, fill_value=fill_value, dtype=dtype) | ||
|
||
def time_sparse_from_dict(self): | ||
SparseDataFrame(dict(zip(range(1000), itertools.repeat([0])))) | ||
|
||
class SparseDataFrameConstructor(object): | ||
|
||
class sparse_series_from_coo(object): | ||
goal_time = 0.2 | ||
|
||
def setup(self): | ||
self.A = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(100, 100)) | ||
N = 1000 | ||
self.arr = np.arange(N) | ||
self.sparse = scipy.sparse.rand(N, N, 0.005) | ||
self.dict = dict(zip(range(N), itertools.repeat([0]))) | ||
|
||
def time_sparse_series_from_coo(self): | ||
self.ss = SparseSeries.from_coo(self.A) | ||
def time_constructor(self): | ||
SparseDataFrame(columns=self.arr, index=self.arr) | ||
|
||
def time_from_scipy(self): | ||
SparseDataFrame(self.sparse) | ||
|
||
class sparse_series_to_coo(object): | ||
goal_time = 0.2 | ||
def time_from_dict(self): | ||
SparseDataFrame(self.dict) | ||
|
||
def setup(self): | ||
self.s = pd.Series(([np.nan] * 10000)) | ||
self.s[0] = 3.0 | ||
self.s[100] = (-1.0) | ||
self.s[999] = 12.1 | ||
self.s.index = pd.MultiIndex.from_product((range(10), range(10), range(10), range(10))) | ||
self.ss = self.s.to_sparse() | ||
|
||
def time_sparse_series_to_coo(self): | ||
self.ss.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True) | ||
|
||
class FromCoo(object): | ||
|
||
class sparse_arithmetic_int(object): | ||
goal_time = 0.2 | ||
|
||
def setup(self): | ||
np.random.seed(1) | ||
self.a_10percent = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=np.nan) | ||
self.b_10percent = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=np.nan) | ||
|
||
self.a_10percent_zero = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=0) | ||
self.b_10percent_zero = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=0) | ||
|
||
self.a_1percent = self.make_sparse_array(length=1000000, dense_size=10000, fill_value=np.nan) | ||
self.b_1percent = self.make_sparse_array(length=1000000, dense_size=10000, fill_value=np.nan) | ||
|
||
def make_sparse_array(self, length, dense_size, fill_value): | ||
arr = np.array([fill_value] * length, dtype=np.float64) | ||
indexer = np.unique(np.random.randint(0, length, dense_size)) | ||
arr[indexer] = np.random.randint(0, 100, len(indexer)) | ||
return pd.SparseArray(arr, fill_value=fill_value) | ||
|
||
def time_sparse_make_union(self): | ||
self.a_10percent.sp_index.make_union(self.b_10percent.sp_index) | ||
self.matrix = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], | ||
([1, 0, 0], [0, 2, 3])), | ||
shape=(100, 100)) | ||
|
||
def time_sparse_intersect(self): | ||
self.a_10percent.sp_index.intersect(self.b_10percent.sp_index) | ||
|
||
def time_sparse_addition_10percent(self): | ||
self.a_10percent + self.b_10percent | ||
def time_sparse_series_from_coo(self): | ||
SparseSeries.from_coo(self.matrix) | ||
|
||
def time_sparse_addition_10percent_zero(self): | ||
self.a_10percent_zero + self.b_10percent_zero | ||
|
||
def time_sparse_addition_1percent(self): | ||
self.a_1percent + self.b_1percent | ||
class ToCoo(object): | ||
|
||
def time_sparse_division_10percent(self): | ||
self.a_10percent / self.b_10percent | ||
goal_time = 0.2 | ||
|
||
def time_sparse_division_10percent_zero(self): | ||
self.a_10percent_zero / self.b_10percent_zero | ||
def setup(self): | ||
s = Series([np.nan] * 10000) | ||
s[0] = 3.0 | ||
s[100] = -1.0 | ||
s[999] = 12.1 | ||
s.index = MultiIndex.from_product([range(10)] * 4) | ||
self.ss = s.to_sparse() | ||
|
||
def time_sparse_division_1percent(self): | ||
self.a_1percent / self.b_1percent | ||
def time_sparse_series_to_coo(self): | ||
self.ss.to_coo(row_levels=[0, 1], | ||
column_levels=[2, 3], | ||
sort_labels=True) | ||
|
||
|
||
class Arithmetic(object): | ||
|
||
class sparse_arithmetic_block(object): | ||
goal_time = 0.2 | ||
params = ([0.1, 0.01], [0, np.nan]) | ||
param_names = ['dense_proportion', 'fill_value'] | ||
|
||
def setup(self): | ||
np.random.seed(1) | ||
self.a = self.make_sparse_array(length=1000000, num_blocks=1000, | ||
block_size=10, fill_value=np.nan) | ||
self.b = self.make_sparse_array(length=1000000, num_blocks=1000, | ||
block_size=10, fill_value=np.nan) | ||
|
||
self.a_zero = self.make_sparse_array(length=1000000, num_blocks=1000, | ||
block_size=10, fill_value=0) | ||
self.b_zero = self.make_sparse_array(length=1000000, num_blocks=1000, | ||
block_size=10, fill_value=np.nan) | ||
def setup(self, dense_proportion, fill_value): | ||
N = 10**6 | ||
arr1 = make_array(N, dense_proportion, fill_value, np.int64) | ||
self.array1 = SparseArray(arr1, fill_value=fill_value) | ||
arr2 = make_array(N, dense_proportion, fill_value, np.int64) | ||
self.array2 = SparseArray(arr2, fill_value=fill_value) | ||
|
||
def make_sparse_array(self, length, num_blocks, block_size, fill_value): | ||
a = np.array([fill_value] * length) | ||
for block in range(num_blocks): | ||
i = np.random.randint(0, length) | ||
a[i:i + block_size] = np.random.randint(0, 100, len(a[i:i + block_size])) | ||
return pd.SparseArray(a, fill_value=fill_value) | ||
def time_make_union(self, dense_proportion, fill_value): | ||
self.array1.sp_index.make_union(self.array2.sp_index) | ||
|
||
def time_sparse_make_union(self): | ||
self.a.sp_index.make_union(self.b.sp_index) | ||
def time_intersect(self, dense_proportion, fill_value): | ||
self.array1.sp_index.intersect(self.array2.sp_index) | ||
|
||
def time_sparse_intersect(self): | ||
self.a.sp_index.intersect(self.b.sp_index) | ||
def time_add(self, dense_proportion, fill_value): | ||
self.array1 + self.array2 | ||
|
||
def time_sparse_addition(self): | ||
self.a + self.b | ||
def time_divide(self, dense_proportion, fill_value): | ||
self.array1 / self.array2 | ||
|
||
def time_sparse_addition_zero(self): | ||
self.a_zero + self.b_zero | ||
|
||
def time_sparse_division(self): | ||
self.a / self.b | ||
class ArithmeticBlock(object): | ||
|
||
def time_sparse_division_zero(self): | ||
self.a_zero / self.b_zero | ||
goal_time = 0.2 | ||
params = [np.nan, 0] | ||
param_names = ['fill_value'] | ||
|
||
def setup(self, fill_value): | ||
N = 10**6 | ||
self.arr1 = self.make_block_array(length=N, num_blocks=1000, | ||
block_size=10, fill_value=fill_value) | ||
self.arr2 = self.make_block_array(length=N, num_blocks=1000, | ||
block_size=10, fill_value=fill_value) | ||
|
||
def make_block_array(self, length, num_blocks, block_size, fill_value):
    """Build a SparseArray whose non-fill values sit in contiguous blocks.

    Parameters
    ----------
    length : int
        Length of the dense array that backs the result.
    num_blocks : int
        Number of blocks to fill with random integers.
    block_size : int
        Number of consecutive positions in each block.
    fill_value : scalar
        Initial value of every position; also passed to ``SparseArray``
        as its ``fill_value``.

    Returns
    -------
    SparseArray
        Sparse wrapper around the generated dense array.
    """
    arr = np.full(length, fill_value)
    # Block starts are drawn without replacement from multiples of
    # block_size, so two blocks never begin at the same position.
    block_starts = np.random.choice(np.arange(0, length, block_size),
                                    num_blocks,
                                    replace=False)
    for start in block_starts:
        arr[start:start + block_size] = np.random.randint(0, 100, block_size)
    return SparseArray(arr, fill_value=fill_value)
|
||
def time_make_union(self, fill_value): | ||
self.arr1.sp_index.make_union(self.arr2.sp_index) | ||
|
||
def time_intersect(self, fill_value):
    """Time intersecting the sparse indexes of the two block arrays.

    ``fill_value`` is the benchmark parameter; it is not used in the body.
    """
    # Intersect arr1 with arr2, as time_make_union does for the union;
    # intersecting arr2 with itself would only time the degenerate case.
    self.arr1.sp_index.intersect(self.arr2.sp_index)
|
||
def time_addition(self, fill_value): | ||
self.arr1 + self.arr2 | ||
|
||
def time_division(self, fill_value): | ||
self.arr1 / self.arr2 |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@mroeschke Looking at this now: can we turn the Arithmetic class above into a special case of ArithmeticBlock?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, I suppose so, though I didn't write the original benchmarks.