Skip to content

CLN: ASV sparse #19047

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 3, 2018
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
269 changes: 110 additions & 159 deletions asv_bench/benchmarks/sparse.py
Original file line number Diff line number Diff line change
@@ -1,211 +1,162 @@
import itertools

from .pandas_vb_common import *
import numpy as np
import scipy.sparse
from pandas import SparseSeries, SparseDataFrame, SparseArray
from pandas import (SparseSeries, SparseDataFrame, SparseArray, Series,
date_range, MultiIndex)

from .pandas_vb_common import setup # noqa

class sparse_series_to_frame(object):
goal_time = 0.2

def setup(self):
self.K = 50
self.N = 50000
self.rng = np.asarray(date_range('1/1/2000', periods=self.N, freq='T'))
self.series = {}
for i in range(1, (self.K + 1)):
self.data = np.random.randn(self.N)[:(- i)]
self.this_rng = self.rng[:(- i)]
self.data[100:] = np.nan
self.series[i] = SparseSeries(self.data, index=self.this_rng)
def make_array(size, dense_proportion, fill_value, dtype):
dense_size = int(size * dense_proportion)
arr = np.full(size, fill_value, dtype)
indexer = np.random.choice(np.arange(size), dense_size, replace=False)
arr[indexer] = np.random.choice(np.arange(100, dtype=dtype), dense_size)
return arr

def time_sparse_series_to_frame(self):
SparseDataFrame(self.series)

class SparseSeriesToFrame(object):

class sparse_array_constructor(object):
goal_time = 0.2

def setup(self):
np.random.seed(1)
self.int64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=0, dtype=np.int64)
self.int64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=0, dtype=np.int64)

self.float64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=np.nan, dtype=np.float64)
self.float64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=np.nan, dtype=np.float64)

self.object_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=np.nan)
self.object_nan_fill_value_1percent = self.make_object_array(length=1000000, dense_size=10000, fill_value=np.nan)

self.object_non_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=0)
self.object_non_nan_fill_value_1percent = self.make_object_array(length=1000000, dense_size=10000, fill_value=0)

def make_numeric_array(self, length, dense_size, fill_value, dtype):
arr = np.array([fill_value] * length, dtype=dtype)
indexer = np.unique(np.random.randint(0, length, dense_size))
arr[indexer] = np.random.randint(0, 100, len(indexer))
return (arr, fill_value, dtype)

def make_object_array(self, length, dense_size, fill_value):
elems = np.array(['a', 0.0, False, 1, 2], dtype=np.object)
arr = np.array([fill_value] * length, dtype=np.object)
indexer = np.unique(np.random.randint(0, length, dense_size))
arr[indexer] = np.random.choice(elems, len(indexer))
return (arr, fill_value, np.object)

def time_sparse_array_constructor_int64_10percent(self):
arr, fill_value, dtype = self.int64_10percent
SparseArray(arr, fill_value=fill_value, dtype=dtype)

def time_sparse_array_constructor_int64_1percent(self):
arr, fill_value, dtype = self.int64_1percent
SparseArray(arr, fill_value=fill_value, dtype=dtype)

def time_sparse_array_constructor_float64_10percent(self):
arr, fill_value, dtype = self.float64_10percent
SparseArray(arr, fill_value=fill_value, dtype=dtype)

def time_sparse_array_constructor_float64_1percent(self):
arr, fill_value, dtype = self.float64_1percent
SparseArray(arr, fill_value=fill_value, dtype=dtype)

def time_sparse_array_constructor_object_nan_fill_value_10percent(self):
arr, fill_value, dtype = self.object_nan_fill_value_10percent
SparseArray(arr, fill_value=fill_value, dtype=dtype)

def time_sparse_array_constructor_object_nan_fill_value_1percent(self):
arr, fill_value, dtype = self.object_nan_fill_value_1percent
SparseArray(arr, fill_value=fill_value, dtype=dtype)
K = 50
N = 50001
rng = date_range('1/1/2000', periods=N, freq='T')
self.series = {}
for i in range(1, K):
data = np.random.randn(N)[:-i]
idx = rng[:-i]
data[100:] = np.nan
self.series[i] = SparseSeries(data, index=idx)

def time_sparse_array_constructor_object_non_nan_fill_value_10percent(self):
arr, fill_value, dtype = self.object_non_nan_fill_value_10percent
SparseArray(arr, fill_value=fill_value, dtype=dtype)
def time_series_to_frame(self):
SparseDataFrame(self.series)

def time_sparse_array_constructor_object_non_nan_fill_value_1percent(self):
arr, fill_value, dtype = self.object_non_nan_fill_value_1percent
SparseArray(arr, fill_value=fill_value, dtype=dtype)

class SparseArrayConstructor(object):

class sparse_frame_constructor(object):
goal_time = 0.2
params = ([0.1, 0.01], [0, np.nan],
[np.int64, np.float64, np.object])
param_names = ['dense_proportion', 'fill_value', 'dtype']

def time_sparse_frame_constructor(self):
SparseDataFrame(columns=np.arange(100), index=np.arange(1000))
def setup(self, dense_proportion, fill_value, dtype):
N = 10**6
self.array = make_array(N, dense_proportion, fill_value, dtype)

def time_sparse_from_scipy(self):
SparseDataFrame(scipy.sparse.rand(1000, 1000, 0.005))
def time_sparse_array(self, dense_proportion, fill_value, dtype):
SparseArray(self.array, fill_value=fill_value, dtype=dtype)

def time_sparse_from_dict(self):
SparseDataFrame(dict(zip(range(1000), itertools.repeat([0]))))

class SparseDataFrameConstructor(object):

class sparse_series_from_coo(object):
goal_time = 0.2

def setup(self):
self.A = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(100, 100))
N = 1000
self.arr = np.arange(N)
self.sparse = scipy.sparse.rand(N, N, 0.005)
self.dict = dict(zip(range(N), itertools.repeat([0])))

def time_sparse_series_from_coo(self):
self.ss = SparseSeries.from_coo(self.A)
def time_constructor(self):
SparseDataFrame(columns=self.arr, index=self.arr)

def time_from_scipy(self):
SparseDataFrame(self.sparse)

class sparse_series_to_coo(object):
goal_time = 0.2
def time_from_dict(self):
SparseDataFrame(self.dict)

def setup(self):
self.s = pd.Series(([np.nan] * 10000))
self.s[0] = 3.0
self.s[100] = (-1.0)
self.s[999] = 12.1
self.s.index = pd.MultiIndex.from_product((range(10), range(10), range(10), range(10)))
self.ss = self.s.to_sparse()

def time_sparse_series_to_coo(self):
self.ss.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True)

class FromCoo(object):

class sparse_arithmetic_int(object):
goal_time = 0.2

def setup(self):
np.random.seed(1)
self.a_10percent = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=np.nan)
self.b_10percent = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=np.nan)

self.a_10percent_zero = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=0)
self.b_10percent_zero = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=0)

self.a_1percent = self.make_sparse_array(length=1000000, dense_size=10000, fill_value=np.nan)
self.b_1percent = self.make_sparse_array(length=1000000, dense_size=10000, fill_value=np.nan)

def make_sparse_array(self, length, dense_size, fill_value):
arr = np.array([fill_value] * length, dtype=np.float64)
indexer = np.unique(np.random.randint(0, length, dense_size))
arr[indexer] = np.random.randint(0, 100, len(indexer))
return pd.SparseArray(arr, fill_value=fill_value)

def time_sparse_make_union(self):
self.a_10percent.sp_index.make_union(self.b_10percent.sp_index)
self.matrix = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0],
([1, 0, 0], [0, 2, 3])),
shape=(100, 100))

def time_sparse_intersect(self):
self.a_10percent.sp_index.intersect(self.b_10percent.sp_index)

def time_sparse_addition_10percent(self):
self.a_10percent + self.b_10percent
def time_sparse_series_from_coo(self):
SparseSeries.from_coo(self.matrix)

def time_sparse_addition_10percent_zero(self):
self.a_10percent_zero + self.b_10percent_zero

def time_sparse_addition_1percent(self):
self.a_1percent + self.b_1percent
class ToCoo(object):

def time_sparse_division_10percent(self):
self.a_10percent / self.b_10percent
goal_time = 0.2

def time_sparse_division_10percent_zero(self):
self.a_10percent_zero / self.b_10percent_zero
def setup(self):
s = Series([np.nan] * 10000)
s[0] = 3.0
s[100] = -1.0
s[999] = 12.1
s.index = MultiIndex.from_product([range(10)] * 4)
self.ss = s.to_sparse()

def time_sparse_division_1percent(self):
self.a_1percent / self.b_1percent
def time_sparse_series_to_coo(self):
self.ss.to_coo(row_levels=[0, 1],
column_levels=[2, 3],
sort_labels=True)


class Arithmetic(object):

class sparse_arithmetic_block(object):
goal_time = 0.2
params = ([0.1, 0.01], [0, np.nan])
param_names = ['dense_proportion', 'fill_value']

def setup(self):
np.random.seed(1)
self.a = self.make_sparse_array(length=1000000, num_blocks=1000,
block_size=10, fill_value=np.nan)
self.b = self.make_sparse_array(length=1000000, num_blocks=1000,
block_size=10, fill_value=np.nan)

self.a_zero = self.make_sparse_array(length=1000000, num_blocks=1000,
block_size=10, fill_value=0)
self.b_zero = self.make_sparse_array(length=1000000, num_blocks=1000,
block_size=10, fill_value=np.nan)
def setup(self, dense_proportion, fill_value):
N = 10**6
arr1 = make_array(N, dense_proportion, fill_value, np.int64)
self.array1 = SparseArray(arr1, fill_value=fill_value)
arr2 = make_array(N, dense_proportion, fill_value, np.int64)
self.array2 = SparseArray(arr2, fill_value=fill_value)

def make_sparse_array(self, length, num_blocks, block_size, fill_value):
a = np.array([fill_value] * length)
for block in range(num_blocks):
i = np.random.randint(0, length)
a[i:i + block_size] = np.random.randint(0, 100, len(a[i:i + block_size]))
return pd.SparseArray(a, fill_value=fill_value)
def time_make_union(self, dense_proportion, fill_value):
self.array1.sp_index.make_union(self.array2.sp_index)

def time_sparse_make_union(self):
self.a.sp_index.make_union(self.b.sp_index)
def time_intersect(self, dense_proportion, fill_value):
self.array1.sp_index.intersect(self.array2.sp_index)

def time_sparse_intersect(self):
self.a.sp_index.intersect(self.b.sp_index)
def time_add(self, dense_proportion, fill_value):
self.array1 + self.array2

def time_sparse_addition(self):
self.a + self.b
def time_divide(self, dense_proportion, fill_value):
self.array1 / self.array2

def time_sparse_addition_zero(self):
self.a_zero + self.b_zero

def time_sparse_division(self):
self.a / self.b
class ArithmeticBlock(object):

def time_sparse_division_zero(self):
self.a_zero / self.b_zero
goal_time = 0.2
params = [np.nan, 0]
param_names = ['fill_value']

def setup(self, fill_value):
N = 10**6
self.arr1 = self.make_block_array(length=N, num_blocks=1000,
block_size=10, fill_value=fill_value)
self.arr2 = self.make_block_array(length=N, num_blocks=1000,
block_size=10, fill_value=fill_value)

def make_block_array(self, length, num_blocks, block_size, fill_value):
arr = np.full(length, fill_value)
indicies = np.random.choice(np.arange(0, length, block_size),
num_blocks,
replace=False)
for ind in indicies:
arr[ind:ind + block_size] = np.random.randint(0, 100, block_size)
return SparseArray(arr, fill_value=fill_value)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mroeschke looking at this now, can we turn the Arithmetic class above into a special case of ArithmeticBlock?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure I suppose so, though I didn't write the original benchmarks.


def time_make_union(self, fill_value):
self.arr1.sp_index.make_union(self.arr2.sp_index)

def time_intersect(self, fill_value):
self.arr2.sp_index.intersect(self.arr2.sp_index)

def time_addition(self, fill_value):
self.arr1 + self.arr2

def time_division(self, fill_value):
self.arr1 / self.arr2