Skip to content

CLN: ASV HDFStore benchmark #18641

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Dec 6, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 39 additions & 47 deletions asv_bench/benchmarks/hdfstore_bench.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,40 @@
from .pandas_vb_common import *
import os
import numpy as np
from pandas import DataFrame, Panel, date_range, HDFStore
import pandas.util.testing as tm

from .pandas_vb_common import BaseIO, setup # noqa

class HDF5(object):
goal_time = 0.2

def setup(self):
self.index = tm.makeStringIndex(25000)
self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000),},
index=self.index)

self.df_mixed = DataFrame(
{'float1': randn(25000), 'float2': randn(25000),
'string1': (['foo'] * 25000),
'bool1': ([True] * 25000),
'int1': np.random.randint(0, 250000, size=25000),},
index=self.index)
class HDF5(BaseIO):

self.df_wide = DataFrame(np.random.randn(25000, 100))

self.df2 = DataFrame({'float1': randn(25000), 'float2': randn(25000)},
index=date_range('1/1/2000', periods=25000))
self.df_wide2 = DataFrame(np.random.randn(25000, 100),
index=date_range('1/1/2000', periods=25000))
goal_time = 0.2

self.df_dc = DataFrame(np.random.randn(10000, 10),
columns=[('C%03d' % i) for i in range(10)])
def setup(self):
N = 25000
index = tm.makeStringIndex(N)
self.df = DataFrame({'float1': np.random.randn(N),
'float2': np.random.randn(N)},
index=index)
self.df_mixed = DataFrame({'float1': np.random.randn(N),
'float2': np.random.randn(N),
'string1': ['foo'] * N,
'bool1': [True] * N,
'int1': np.random.randint(0, N, size=N)},
index=index)
self.df_wide = DataFrame(np.random.randn(N, 100))
self.start_wide = self.df_wide.index[10000]
self.stop_wide = self.df_wide.index[15000]
self.df2 = DataFrame({'float1': np.random.randn(N),
'float2': np.random.randn(N)},
index=date_range('1/1/2000', periods=N))
self.start = self.df2.index[10000]
self.stop = self.df2.index[15000]
self.df_wide2 = DataFrame(np.random.randn(N, 100),
index=date_range('1/1/2000', periods=N))
self.df_dc = DataFrame(np.random.randn(N, 10),
columns=['C%03d' % i for i in range(10)])

self.f = '__test__.h5'
self.remove(self.f)

self.store = HDFStore(self.f)
self.store.put('fixed', self.df)
Expand All @@ -42,12 +48,6 @@ def teardown(self):
self.store.close()
self.remove(self.f)

def remove(self, f):
try:
os.remove(f)
except:
pass

def time_read_store(self):
self.store.get('fixed')

Expand Down Expand Up @@ -82,14 +82,12 @@ def time_write_store_table_dc(self):
self.store.append('table_dc_write', self.df_dc, data_columns=True)

def time_query_store_table_wide(self):
start = self.df_wide2.index[10000]
stop = self.df_wide2.index[15000]
self.store.select('table_wide', where="index > start and index < stop")
self.store.select('table_wide', where="index > self.start_wide and "
"index < self.stop_wide")

def time_query_store_table(self):
start = self.df2.index[10000]
stop = self.df2.index[15000]
self.store.select('table', where="index > start and index < stop")
self.store.select('table', where="index > self.start and "
"index < self.stop")

def time_store_repr(self):
repr(self.store)
Expand All @@ -101,29 +99,23 @@ def time_store_info(self):
self.store.info()


class HDF5Panel(object):
class HDF5Panel(BaseIO):

goal_time = 0.2

def setup(self):
self.f = '__test__.h5'
self.p = Panel(randn(20, 1000, 25),
items=[('Item%03d' % i) for i in range(20)],
self.p = Panel(np.random.randn(20, 1000, 25),
items=['Item%03d' % i for i in range(20)],
major_axis=date_range('1/1/2000', periods=1000),
minor_axis=[('E%03d' % i) for i in range(25)])
self.remove(self.f)
minor_axis=['E%03d' % i for i in range(25)])
self.store = HDFStore(self.f)
self.store.append('p1', self.p)

def teardown(self):
self.store.close()
self.remove(self.f)

def remove(self, f):
try:
os.remove(f)
except:
pass

def time_read_store_table_panel(self):
self.store.select('p1')

Expand Down
34 changes: 9 additions & 25 deletions asv_bench/benchmarks/io_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,7 @@
import timeit


class _BenchTeardown(object):
    """
    base class for teardown method implementation

    Subclasses set ``fname`` to the path of the file they write;
    ``teardown`` then deletes that file through ``remove``.
    """
    # Path removed by ``teardown``; ``None`` means there is nothing to remove.
    fname = None

    def remove(self, f):
        """Delete the file at path ``f``, ignoring filesystem errors.

        Parameters
        ----------
        f : str or None
            Path of the file to delete. ``None`` is accepted and is a no-op.
        """
        if f is None:
            # ``fname`` was never set, so there is no file to clean up.
            return
        try:
            os.remove(f)
        except OSError:
            # Best-effort cleanup: a missing or locked file must not fail
            # the benchmark. Only filesystem errors are silenced so that
            # KeyboardInterrupt / SystemExit still propagate.
            pass

    def teardown(self):
        """Remove the file named by ``self.fname``."""
        self.remove(self.fname)


class frame_to_csv(_BenchTeardown):
class frame_to_csv(BaseIO):
goal_time = 0.2
fname = '__test__.csv'

Expand All @@ -35,7 +19,7 @@ def time_frame_to_csv(self):
self.df.to_csv(self.fname)


class frame_to_csv2(_BenchTeardown):
class frame_to_csv2(BaseIO):
goal_time = 0.2
fname = '__test__.csv'

Expand All @@ -49,7 +33,7 @@ def time_frame_to_csv2(self):
self.df.to_csv(self.fname)


class frame_to_csv_date_formatting(_BenchTeardown):
class frame_to_csv_date_formatting(BaseIO):
goal_time = 0.2
fname = '__test__.csv'

Expand All @@ -61,7 +45,7 @@ def time_frame_to_csv_date_formatting(self):
self.data.to_csv(self.fname, date_format='%Y%m%d')


class frame_to_csv_mixed(_BenchTeardown):
class frame_to_csv_mixed(BaseIO):
goal_time = 0.2
fname = '__test__.csv'

Expand Down Expand Up @@ -114,7 +98,7 @@ def time_read_csv_infer_datetime_format_ymd(self):
read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True)


class read_csv_skiprows(_BenchTeardown):
class read_csv_skiprows(BaseIO):
goal_time = 0.2
fname = '__test__.csv'

Expand All @@ -127,7 +111,7 @@ def time_read_csv_skiprows(self):
read_csv(self.fname, skiprows=10000)


class read_csv_standard(_BenchTeardown):
class read_csv_standard(BaseIO):
goal_time = 0.2
fname = '__test__.csv'

Expand Down Expand Up @@ -174,7 +158,7 @@ def time_read_uint64_na_values(self):
read_csv(StringIO(self.data1), header=None, na_values=self.na_values)


class write_csv_standard(_BenchTeardown):
class write_csv_standard(BaseIO):
goal_time = 0.2
fname = '__test__.csv'

Expand Down Expand Up @@ -218,14 +202,14 @@ def time_read_nrows(self, compression, engine):
compression=compression, engine=engine)


class read_json_lines(_BenchTeardown):
class read_json_lines(BaseIO):
goal_time = 0.2
fname = "__test__.json"

def setup(self):
self.N = 100000
self.C = 5
self.df = DataFrame({('float{0}'.format(i), randn(self.N)) for i in range(self.C)})
self.df = DataFrame({'float{0}'.format(i): randn(self.N) for i in range(self.C)})
self.df.to_json(self.fname,orient="records",lines=True)

def time_read_json_lines(self):
Expand Down
20 changes: 20 additions & 0 deletions asv_bench/benchmarks/pandas_vb_common.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from pandas import *
import pandas as pd
from numpy.random import randn
Expand All @@ -19,6 +20,25 @@
def setup(*args, **kwargs):
    """Seed NumPy's global random number generator with a fixed value.

    Any positional or keyword arguments are accepted and ignored.
    """
    seed_value = 1234
    np.random.seed(seed_value)


class BaseIO(object):
    """
    Base class for IO benchmarks

    Subclasses set ``fname`` to the path of the file they create;
    ``teardown`` then deletes that file through ``remove``.
    """
    # Path removed by ``teardown``; ``None`` means there is nothing to remove.
    fname = None

    def remove(self, f):
        """Remove created files

        Parameters
        ----------
        f : str or None
            Path of the file to delete. ``None`` is accepted and is a no-op.
        """
        if f is None:
            # ``fname`` was never set, so there is no file to clean up.
            return
        try:
            os.remove(f)
        except OSError:
            # On Windows, attempting to remove a file that is in use
            # causes an exception to be raised. A missing file is ignored
            # the same way. Only filesystem errors are silenced so that
            # KeyboardInterrupt / SystemExit still propagate.
            pass

    def teardown(self):
        """Remove the file named by ``self.fname``."""
        self.remove(self.fname)

# try em until it works!
for imp in ['pandas._libs.lib', 'pandas.lib', 'pandas_tseries']:
try:
Expand Down