diff --git a/asv_bench/benchmarks/io/__init__.py b/asv_bench/benchmarks/io/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
new file mode 100644
index 0000000000000..bc4599436111f
--- /dev/null
+++ b/asv_bench/benchmarks/io/csv.py
@@ -0,0 +1,248 @@
+import random
+import timeit
+import string
+
+import numpy as np
+import pandas.util.testing as tm
+from pandas import DataFrame, Categorical, date_range, read_csv
+from pandas.compat import PY2
+from pandas.compat import cStringIO as StringIO
+
+from ..pandas_vb_common import setup, BaseIO  # noqa
+
+
+class ToCSV(BaseIO):
+
+    goal_time = 0.2
+    fname = '__test__.csv'
+    params = ['wide', 'long', 'mixed']
+    param_names = ['kind']
+
+    def setup(self, kind):
+        wide_frame = DataFrame(np.random.randn(3000, 30))
+        long_frame = DataFrame({'A': np.arange(50000),
+                                'B': np.arange(50000) + 1.,
+                                'C': np.arange(50000) + 2.,
+                                'D': np.arange(50000) + 3.})
+        mixed_frame = DataFrame({'float': np.random.randn(5000),
+                                 'int': np.random.randn(5000).astype(int),
+                                 'bool': (np.arange(5000) % 2) == 0,
+                                 'datetime': date_range('2001',
+                                                        freq='s',
+                                                        periods=5000),
+                                 'object': ['foo'] * 5000})
+        mixed_frame.loc[30:500, 'float'] = np.nan
+        data = {'wide': wide_frame,
+                'long': long_frame,
+                'mixed': mixed_frame}
+        self.df = data[kind]
+
+    def time_frame(self, kind):
+        self.df.to_csv(self.fname)
+
+
+class ToCSVDatetime(BaseIO):
+
+    goal_time = 0.2
+    fname = '__test__.csv'
+
+    def setup(self):
+        rng = date_range('1/1/2000', periods=1000)
+        self.data = DataFrame(rng, index=rng)
+
+    def time_frame_date_formatting(self):
+        self.data.to_csv(self.fname, date_format='%Y%m%d')
+
+
+class ReadCSVInferDatetimeFormat(object):
+
+    goal_time = 0.2
+    params = ([True, False], ['custom', 'iso8601', 'ymd'])
+    param_names = ['infer_datetime_format', 'format']
+
+    def setup(self, infer_datetime_format, format):
+        rng = date_range('1/1/2000', periods=1000)
+        formats = {'custom': '%m/%d/%Y %H:%M:%S.%f',
+                   'iso8601': '%Y-%m-%d %H:%M:%S',
+                   'ymd': '%Y%m%d'}
+        dt_format = formats[format]
+        self.data = StringIO('\n'.join(rng.strftime(dt_format).tolist()))
+
+    def time_read_csv(self, infer_datetime_format, format):
+        read_csv(self.data, header=None, names=['foo'], parse_dates=['foo'],
+                 infer_datetime_format=infer_datetime_format)
+
+
+class ReadCSVSkipRows(BaseIO):
+
+    goal_time = 0.2
+    fname = '__test__.csv'
+    params = [None, 10000]
+    param_names = ['skiprows']
+
+    def setup(self, skiprows):
+        N = 20000
+        index = tm.makeStringIndex(N)
+        df = DataFrame({'float1': np.random.randn(N),
+                        'float2': np.random.randn(N),
+                        'string1': ['foo'] * N,
+                        'bool1': [True] * N,
+                        'int1': np.random.randint(0, N, size=N)},
+                       index=index)
+        df.to_csv(self.fname)
+
+    def time_skiprows(self, skiprows):
+        read_csv(self.fname, skiprows=skiprows)
+
+
+class ReadUint64Integers(object):
+
+    goal_time = 0.2
+
+    def setup(self):
+        self.na_values = [2**63 + 500]
+        arr = np.arange(10000).astype('uint64') + 2**63
+        self.data1 = StringIO('\n'.join(arr.astype(str).tolist()))
+        arr = arr.astype(object)
+        arr[500] = -1
+        self.data2 = StringIO('\n'.join(arr.astype(str).tolist()))
+
+    def time_read_uint64(self):
+        read_csv(self.data1, header=None, names=['foo'])
+
+    def time_read_uint64_neg_values(self):
+        read_csv(self.data2, header=None, names=['foo'])
+
+    def time_read_uint64_na_values(self):
+        read_csv(self.data1, header=None, names=['foo'],
+                 na_values=self.na_values)
+
+
+class S3(object):
+    # Make sure that we can read part of a file from S3 without
+    # needing to download the entire thing. Use the timeit.default_timer
+    # to measure wall time instead of CPU time -- we want to see
+    # how long it takes to download the data.
+    timer = timeit.default_timer
+    params = ([None, "gzip", "bz2"], ["python", "c"])
+    param_names = ["compression", "engine"]
+
+    def setup(self, compression, engine):
+        if compression == "bz2" and engine == "c" and PY2:
+            # The Python 2 C parser can't read bz2 from open files.
+            raise NotImplementedError
+        try:
+            import s3fs
+        except ImportError:
+            # Skip these benchmarks if `s3fs` is not installed.
+            raise NotImplementedError
+
+        ext = ""
+        if compression == "gzip":
+            ext = ".gz"
+        elif compression == "bz2":
+            ext = ".bz2"
+        self.big_fname = "s3://pandas-test/large_random.csv" + ext
+
+    def time_read_csv_10_rows(self, compression, engine):
+        # Read a small number of rows from a huge (100,000 x 50) table.
+        read_csv(self.big_fname, nrows=10, compression=compression,
+                 engine=engine)
+
+
+class ReadCSVThousands(BaseIO):
+
+    goal_time = 0.2
+    fname = '__test__.csv'
+    params = ([',', '|'], [None, ','])
+    param_names = ['sep', 'thousands']
+
+    def setup(self, sep, thousands):
+        N = 10000
+        K = 8
+        data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))
+        df = DataFrame(data)
+        if thousands is not None:
+            fmt = ':{}'.format(thousands)
+            fmt = '{' + fmt + '}'
+            df = df.applymap(lambda x: fmt.format(x))
+        df.to_csv(self.fname, sep=sep)
+
+    def time_thousands(self, sep, thousands):
+        read_csv(self.fname, sep=sep, thousands=thousands)
+
+
+class ReadCSVComment(object):
+
+    goal_time = 0.2
+
+    def setup(self):
+        data = ['A,B,C'] + (['1,2,3 # comment'] * 100000)
+        self.s_data = StringIO('\n'.join(data))
+
+    def time_comment(self):
+        read_csv(self.s_data, comment='#', header=None, names=list('abc'))
+
+
+class ReadCSVFloatPrecision(object):
+
+    goal_time = 0.2
+    params = ([',', ';'], ['.', '_'], [None, 'high', 'round_trip'])
+    param_names = ['sep', 'decimal', 'float_precision']
+
+    def setup(self, sep, decimal, float_precision):
+        floats = [''.join(random.choice(string.digits) for _ in range(28))
+                  for _ in range(15)]
+        rows = sep.join(['0{}'.format(decimal) + '{}'] * 3) + '\n'
+        data = rows * 5
+        data = data.format(*floats) * 200  # 1000 x 3 strings csv
+        self.s_data = StringIO(data)
+
+    def time_read_csv(self, sep, decimal, float_precision):
+        read_csv(self.s_data, sep=sep, header=None, names=list('abc'),
+                 decimal=decimal, float_precision=float_precision)
+
+    def time_read_csv_python_engine(self, sep, decimal, float_precision):
+        read_csv(self.s_data, sep=sep, header=None, engine='python',
+                 decimal=decimal, float_precision=None, names=list('abc'))
+
+
+class ReadCSVCategorical(BaseIO):
+
+    goal_time = 0.2
+    fname = '__test__.csv'
+
+    def setup(self):
+        N = 100000
+        group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee']
+        df = DataFrame(np.random.choice(group1, (N, 3)), columns=list('abc'))
+        df.to_csv(self.fname, index=False)
+
+    def time_convert_post(self):
+        read_csv(self.fname).apply(Categorical)
+
+    def time_convert_direct(self):
+        read_csv(self.fname, dtype='category')
+
+
+class ReadCSVParseDates(object):
+
+    goal_time = 0.2
+
+    def setup(self):
+        data = ('{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n'
+                '{},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n'
+                '{},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n'
+                '{},21:00:00,21:18:00,-0.9900,2.0100,3.6000,0.0000,270.0000\n'
+                '{},22:00:00,21:56:00,-0.5900,1.7100,5.1000,0.0000,290.0000\n')
+        two_cols = ['KORD,19990127'] * 5
+        data = data.format(*two_cols)
+        self.s_data = StringIO(data)
+
+    def time_multiple_date(self):
+        read_csv(self.s_data, sep=',', header=None,
+                 names=list(string.digits[:9]), parse_dates=[[1, 2], [1, 3]])
+
+    def time_baseline(self):
+        read_csv(self.s_data, sep=',', header=None, parse_dates=[1],
+                 names=list(string.digits[:9]))
diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py
new file mode 100644
index 0000000000000..acfdd327c3b51
--- /dev/null
+++ b/asv_bench/benchmarks/io/json.py
@@ -0,0 +1,127 @@
+import numpy as np
+import pandas.util.testing as tm
+from pandas import DataFrame, date_range, timedelta_range, concat, read_json
+
+from ..pandas_vb_common import setup, BaseIO  # noqa
+
+
+class ReadJSON(BaseIO):
+
+    goal_time = 0.2
+    fname = "__test__.json"
+    params = (['split', 'index', 'records'], ['int', 'datetime'])
+    param_names = ['orient', 'index']
+
+    def setup(self, orient, index):
+        N = 100000
+        indexes = {'int': np.arange(N),
+                   'datetime': date_range('20000101', periods=N, freq='H')}
+        df = DataFrame(np.random.randn(N, 5),
+                       columns=['float_{}'.format(i) for i in range(5)],
+                       index=indexes[index])
+        df.to_json(self.fname, orient=orient)
+
+    def time_read_json(self, orient, index):
+        read_json(self.fname, orient=orient)
+
+
+class ReadJSONLines(BaseIO):
+
+    goal_time = 0.2
+    fname = "__test_lines__.json"
+    params = ['int', 'datetime']
+    param_names = ['index']
+
+    def setup(self, index):
+        N = 100000
+        indexes = {'int': np.arange(N),
+                   'datetime': date_range('20000101', periods=N, freq='H')}
+        df = DataFrame(np.random.randn(N, 5),
+                       columns=['float_{}'.format(i) for i in range(5)],
+                       index=indexes[index])
+        df.to_json(self.fname, orient='records', lines=True)
+
+    def time_read_json_lines(self, index):
+        read_json(self.fname, orient='records', lines=True)
+
+    def time_read_json_lines_concat(self, index):
+        concat(read_json(self.fname, orient='records', lines=True,
+                         chunksize=25000))
+
+    def peakmem_read_json_lines(self, index):
+        read_json(self.fname, orient='records', lines=True)
+
+    def peakmem_read_json_lines_concat(self, index):
+        concat(read_json(self.fname, orient='records', lines=True,
+                         chunksize=25000))
+
+
+class ToJSON(BaseIO):
+
+    goal_time = 0.2
+    fname = "__test__.json"
+    params = ['split', 'columns', 'index']
+    param_names = ['orient']
+
+    def setup(self, orient):
+        N = 10**5
+        ncols = 5
+        index = date_range('20000101', periods=N, freq='H')
+        timedeltas = timedelta_range(start=1, periods=N, freq='s')
+        datetimes = date_range(start=1, periods=N, freq='s')
+        ints = np.random.randint(100000000, size=N)
+        floats = np.random.randn(N)
+        strings = tm.makeStringIndex(N)
+        self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
+        self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index)
+        self.df_td_int_ts = DataFrame({'td_1': timedeltas,
+                                       'td_2': timedeltas,
+                                       'int_1': ints,
+                                       'int_2': ints,
+                                       'ts_1': datetimes,
+                                       'ts_2': datetimes},
+                                      index=index)
+        self.df_int_floats = DataFrame({'int_1': ints,
+                                        'int_2': ints,
+                                        'int_3': ints,
+                                        'float_1': floats,
+                                        'float_2': floats,
+                                        'float_3': floats},
+                                       index=index)
+        self.df_int_float_str = DataFrame({'int_1': ints,
+                                           'int_2': ints,
+                                           'float_1': floats,
+                                           'float_2': floats,
+                                           'str_1': strings,
+                                           'str_2': strings},
+                                          index=index)
+
+    def time_floats_with_int_index(self, orient):
+        self.df.to_json(self.fname, orient=orient)
+
+    def time_floats_with_dt_index(self, orient):
+        self.df_date_idx.to_json(self.fname, orient=orient)
+
+    def time_delta_int_tstamp(self, orient):
+        self.df_td_int_ts.to_json(self.fname, orient=orient)
+
+    def time_float_int(self, orient):
+        self.df_int_floats.to_json(self.fname, orient=orient)
+
+    def time_float_int_str(self, orient):
+        self.df_int_float_str.to_json(self.fname, orient=orient)
+
+    def time_floats_with_int_index_lines(self, orient):
+        self.df.to_json(self.fname, orient='records', lines=True)
+
+    def time_floats_with_dt_index_lines(self, orient):
+        self.df_date_idx.to_json(self.fname, orient='records', lines=True)
+
+    def time_delta_int_tstamp_lines(self, orient):
+        self.df_td_int_ts.to_json(self.fname, orient='records', lines=True)
+
+    def time_float_int_lines(self, orient):
+        self.df_int_floats.to_json(self.fname, orient='records', lines=True)
+
+    def time_float_int_str_lines(self, orient):
+        self.df_int_float_str.to_json(self.fname, orient='records', lines=True)
diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py
deleted file mode 100644
index e8112cc41f032..0000000000000
--- a/asv_bench/benchmarks/io_bench.py
+++ /dev/null
@@ -1,225 +0,0 @@
-import os
-from .pandas_vb_common import *
-from pandas import concat, Timestamp, compat
-try:
-    from StringIO import StringIO
-except ImportError:
-    from io import StringIO
-import timeit
-
-
-class frame_to_csv(BaseIO):
-    goal_time = 0.2
-    fname = '__test__.csv'
-
-    def setup(self):
-        self.df = DataFrame(np.random.randn(3000, 30))
-
-    def time_frame_to_csv(self):
-        self.df.to_csv(self.fname)
-
-
-class frame_to_csv2(BaseIO):
-    goal_time = 0.2
-    fname = '__test__.csv'
-
-    def setup(self):
-        self.df = DataFrame({'A': range(50000), })
-        self.df['B'] = (self.df.A + 1.0)
-        self.df['C'] = (self.df.A + 2.0)
-        self.df['D'] = (self.df.A + 3.0)
-
-    def time_frame_to_csv2(self):
-        self.df.to_csv(self.fname)
-
-
-class frame_to_csv_date_formatting(BaseIO):
-    goal_time = 0.2
-    fname = '__test__.csv'
-
-    def setup(self):
-        self.rng = date_range('1/1/2000', periods=1000)
-        self.data = DataFrame(self.rng, index=self.rng)
-
-    def time_frame_to_csv_date_formatting(self):
-        self.data.to_csv(self.fname, date_format='%Y%m%d')
-
-
-class frame_to_csv_mixed(BaseIO):
-    goal_time = 0.2
-    fname = '__test__.csv'
-
-    def setup(self):
-        self.df_float = DataFrame(np.random.randn(5000, 5), dtype='float64', columns=self.create_cols('float'))
-        self.df_int = DataFrame(np.random.randn(5000, 5), dtype='int64', columns=self.create_cols('int'))
-        self.df_bool = DataFrame(True, index=self.df_float.index, columns=self.create_cols('bool'))
-        self.df_object = DataFrame('foo', index=self.df_float.index, columns=self.create_cols('object'))
-        self.df_dt = DataFrame(Timestamp('20010101'), index=self.df_float.index, columns=self.create_cols('date'))
-        self.df_float.ix[30:500, 1:3] = np.nan
-        self.df = concat([self.df_float, self.df_int, self.df_bool, self.df_object, self.df_dt], axis=1)
-
-    def time_frame_to_csv_mixed(self):
-        self.df.to_csv(self.fname)
-
-    def create_cols(self, name):
-        return [('%s%03d' % (name, i)) for i in range(5)]
-
-
-class read_csv_infer_datetime_format_custom(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.rng = date_range('1/1/2000', periods=1000)
-        self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%m/%d/%Y %H:%M:%S.%f'))))
-
-    def time_read_csv_infer_datetime_format_custom(self):
-        read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True)
-
-
-class read_csv_infer_datetime_format_iso8601(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.rng = date_range('1/1/2000', periods=1000)
-        self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))))
-
-    def time_read_csv_infer_datetime_format_iso8601(self):
-        read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True)
-
-
-class read_csv_infer_datetime_format_ymd(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.rng = date_range('1/1/2000', periods=1000)
-        self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%Y%m%d'))))
-
-    def time_read_csv_infer_datetime_format_ymd(self):
-        read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True)
-
-
-class read_csv_skiprows(BaseIO):
-    goal_time = 0.2
-    fname = '__test__.csv'
-
-    def setup(self):
-        self.index = tm.makeStringIndex(20000)
-        self.df = DataFrame({'float1': randn(20000), 'float2': randn(20000), 'string1': (['foo'] * 20000), 'bool1': ([True] * 20000), 'int1': np.random.randint(0, 200000, size=20000), }, index=self.index)
-        self.df.to_csv(self.fname)
-
-    def time_read_csv_skiprows(self):
-        read_csv(self.fname, skiprows=10000)
-
-
-class read_csv_standard(BaseIO):
-    goal_time = 0.2
-    fname = '__test__.csv'
-
-    def setup(self):
-        self.index = tm.makeStringIndex(10000)
-        self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index)
-        self.df.to_csv(self.fname)
-
-    def time_read_csv_standard(self):
-        read_csv(self.fname)
-
-
-class read_parse_dates_iso8601(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.rng = date_range('1/1/2000', periods=1000)
-        self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))))
-
-    def time_read_parse_dates_iso8601(self):
-        read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'])
-
-
-class read_uint64_integers(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.na_values = [2**63 + 500]
-
-        self.arr1 = np.arange(10000).astype('uint64') + 2**63
-        self.data1 = '\n'.join(map(lambda x: str(x), self.arr1))
-
-        self.arr2 = self.arr1.copy().astype(object)
-        self.arr2[500] = -1
-        self.data2 = '\n'.join(map(lambda x: str(x), self.arr2))
-
-    def time_read_uint64(self):
-        read_csv(StringIO(self.data1), header=None)
-
-    def time_read_uint64_neg_values(self):
-        read_csv(StringIO(self.data2), header=None)
-
-    def time_read_uint64_na_values(self):
-        read_csv(StringIO(self.data1), header=None, na_values=self.na_values)
-
-
-class write_csv_standard(BaseIO):
-    goal_time = 0.2
-    fname = '__test__.csv'
-
-    def setup(self):
-        self.index = tm.makeStringIndex(10000)
-        self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index)
-
-    def time_write_csv_standard(self):
-        self.df.to_csv(self.fname)
-
-
-class read_csv_from_s3(object):
-    # Make sure that we can read part of a file from S3 without
-    # needing to download the entire thing. Use the timeit.default_timer
-    # to measure wall time instead of CPU time -- we want to see
-    # how long it takes to download the data.
-    timer = timeit.default_timer
-    params = ([None, "gzip", "bz2"], ["python", "c"])
-    param_names = ["compression", "engine"]
-
-    def setup(self, compression, engine):
-        if compression == "bz2" and engine == "c" and compat.PY2:
-            # The Python 2 C parser can't read bz2 from open files.
-            raise NotImplementedError
-        try:
-            import s3fs
-        except ImportError:
-            # Skip these benchmarks if `boto` is not installed.
-            raise NotImplementedError
-
-        self.big_fname = "s3://pandas-test/large_random.csv"
-
-    def time_read_nrows(self, compression, engine):
-        # Read a small number of rows from a huge (100,000 x 50) table.
-        ext = ""
-        if compression == "gzip":
-            ext = ".gz"
-        elif compression == "bz2":
-            ext = ".bz2"
-        pd.read_csv(self.big_fname + ext, nrows=10,
-                    compression=compression, engine=engine)
-
-
-class read_json_lines(BaseIO):
-    goal_time = 0.2
-    fname = "__test__.json"
-
-    def setup(self):
-        self.N = 100000
-        self.C = 5
-        self.df = DataFrame({'float{0}'.format(i): randn(self.N) for i in range(self.C)})
-        self.df.to_json(self.fname,orient="records",lines=True)
-
-    def time_read_json_lines(self):
-        pd.read_json(self.fname, lines=True)
-
-    def time_read_json_lines_chunk(self):
-        pd.concat(pd.read_json(self.fname, lines=True, chunksize=self.N//4))
-
-    def peakmem_read_json_lines(self):
-        pd.read_json(self.fname, lines=True)
-
-    def peakmem_read_json_lines_chunk(self):
-        pd.concat(pd.read_json(self.fname, lines=True, chunksize=self.N//4))
diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py
index 758162f000e8d..7b6cefc56f0da 100644
--- a/asv_bench/benchmarks/packers.py
+++ b/asv_bench/benchmarks/packers.py
@@ -77,28 +77,6 @@ def time_packers_read_hdf_table(self):
         pd.read_hdf(self.f, 'df')
 
 
-class packers_read_json(_Packers):
-
-    def setup(self):
-        self._setup()
-        self.df.to_json(self.f, orient='split')
-        self.df.index = np.arange(self.N)
-
-    def time_packers_read_json(self):
-        pd.read_json(self.f, orient='split')
-
-
-class packers_read_json_date_index(_Packers):
-
-    def setup(self):
-        self._setup()
-        self.remove(self.f)
-        self.df.to_json(self.f, orient='split')
-
-    def time_packers_read_json_date_index(self):
-        pd.read_json(self.f, orient='split')
-
-
 class packers_read_pack(_Packers):
 
     def setup(self):
@@ -219,46 +197,6 @@ def time_write_hdf_table(self):
         self.df2.to_hdf(self.f, 'df', table=True)
 
 
-class JSON(_Packers):
-
-    def setup(self):
-        self._setup()
-        self.df_date = self.df.copy()
-        self.df.index = np.arange(self.N)
-        self.cols = [(lambda i: ('{0}_timedelta'.format(i), [pd.Timedelta(('%d seconds' % randrange(1000000.0))) for _ in range(self.N)])), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N))), (lambda i: ('{0}_timestamp'.format(i), [pd.Timestamp((1418842918083256000 + randrange(1000000000.0, 1e+18, 200))) for _ in range(self.N)]))]
-        self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index)
-
-        self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N)))]
-        self.df_mixed2 = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index)
-
-        self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N))), (lambda i: ('{0}_str'.format(i), [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]))]
-        self.df_mixed3 = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index)
-
-    def time_write_json(self):
-        self.df.to_json(self.f, orient='split')
-
-    def time_write_json_T(self):
-        self.df.to_json(self.f, orient='columns')
-
-    def time_write_json_date_index(self):
-        self.df_date.to_json(self.f, orient='split')
-
-    def time_write_json_mixed_delta_int_tstamp(self):
-        self.df_mixed.to_json(self.f, orient='split')
-
-    def time_write_json_mixed_float_int(self):
-        self.df_mixed2.to_json(self.f, orient='index')
-
-    def time_write_json_mixed_float_int_T(self):
-        self.df_mixed2.to_json(self.f, orient='columns')
-
-    def time_write_json_mixed_float_int_str(self):
-        self.df_mixed3.to_json(self.f, orient='split')
-
-    def time_write_json_lines(self):
-        self.df.to_json(self.f, orient="records", lines=True)
-
-
 class MsgPack(_Packers):
 
     def setup(self):
diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py
deleted file mode 100644
index 32bf7e50d1a89..0000000000000
--- a/asv_bench/benchmarks/parser_vb.py
+++ /dev/null
@@ -1,121 +0,0 @@
-from .pandas_vb_common import *
-import os
-from pandas import read_csv
-try:
-    from cStringIO import StringIO
-except ImportError:
-    from io import StringIO
-
-
-class read_csv1(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.N = 10000
-        self.K = 8
-        self.df = DataFrame((np.random.randn(self.N, self.K) * np.random.randint(100, 10000, (self.N, self.K))))
-        self.df.to_csv('test.csv', sep='|')
-
-        self.format = (lambda x: '{:,}'.format(x))
-        self.df2 = self.df.applymap(self.format)
-        self.df2.to_csv('test2.csv', sep='|')
-
-    def time_sep(self):
-        read_csv('test.csv', sep='|')
-
-    def time_thousands(self):
-        read_csv('test.csv', sep='|', thousands=',')
-
-    def teardown(self):
-        os.remove('test.csv')
-        os.remove('test2.csv')
-
-
-class read_csv2(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.data = ['A,B,C']
-        self.data = (self.data + (['1,2,3 # comment'] * 100000))
-        self.data = '\n'.join(self.data)
-
-    def time_comment(self):
-        read_csv(StringIO(self.data), comment='#')
-
-
-class read_csv3(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n
-0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n
-0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n
-0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n
-0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n"""
-        self.data2 = self.data.replace(',', ';').replace('.', ',')
-        self.data = (self.data * 200)
-        self.data2 = (self.data2 * 200)
-
-    def time_default_converter(self):
-        read_csv(StringIO(self.data), sep=',', header=None,
-                 float_precision=None)
-
-    def time_default_converter_with_decimal(self):
-        read_csv(StringIO(self.data2), sep=';', header=None,
-                 float_precision=None, decimal=',')
-
-    def time_default_converter_python_engine(self):
-        read_csv(StringIO(self.data), sep=',', header=None,
-                 float_precision=None, engine='python')
-
-    def time_default_converter_with_decimal_python_engine(self):
-        read_csv(StringIO(self.data2), sep=';', header=None,
-                 float_precision=None, decimal=',', engine='python')
-
-    def time_precise_converter(self):
-        read_csv(StringIO(self.data), sep=',', header=None,
-                 float_precision='high')
-
-    def time_roundtrip_converter(self):
-        read_csv(StringIO(self.data), sep=',', header=None,
-                 float_precision='round_trip')
-
-
-class read_csv_categorical(object):
-    goal_time = 0.2
-
-    def setup(self):
-        N = 100000
-        group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee']
-        df = DataFrame({'a': np.random.choice(group1, N).astype('object'),
-                        'b': np.random.choice(group1, N).astype('object'),
-                        'c': np.random.choice(group1, N).astype('object')})
-        df.to_csv('strings.csv', index=False)
-
-    def time_convert_post(self):
-        read_csv('strings.csv').apply(pd.Categorical)
-
-    def time_convert_direct(self):
-        read_csv('strings.csv', dtype='category')
-
-    def teardown(self):
-        os.remove('strings.csv')
-
-
-class read_csv_dateparsing(object):
-    goal_time = 0.2
-
-    def setup(self):
-        self.N = 10000
-        self.K = 8
-        self.data = 'KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000\n KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000\n KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000\n KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000\n KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000\n '
-        self.data = (self.data * 200)
-        self.data2 = 'KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000\n KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000\n KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000\n KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000\n KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000\n '
-        self.data2 = (self.data2 * 200)
-
-    def time_multiple_date(self):
-        read_csv(StringIO(self.data), sep=',', header=None,
-                 parse_dates=[[1, 2], [1, 3]])
-
-    def time_baseline(self):
-        read_csv(StringIO(self.data2), sep=',', header=None, parse_dates=[1])
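
Note on the pattern the new io/ modules rely on: asv expands params/param_names into a cross-product and passes one combination positionally to setup and to every time_*/peakmem_* method, while BaseIO (imported from pandas_vb_common, which is not part of this diff) is relied on to remove fname after each run. The sketch below illustrates that contract; the BaseIO stand-in here is hypothetical, since the real class lives outside this patch and may differ.

import os

import numpy as np
from pandas import DataFrame, read_csv


class BaseIO(object):
    # Hypothetical stand-in for pandas_vb_common.BaseIO: asv calls
    # teardown() after timing, so the benchmark's temp file does not
    # leak between runs.
    fname = None

    def remove(self, f):
        try:
            os.remove(f)
        except OSError:
            # The file may never have been created (e.g. setup raised).
            pass

    def teardown(self, *args, **kwargs):
        # asv passes teardown() the same params it gave setup();
        # accept and ignore them.
        self.remove(self.fname)


class ReadCSVExample(BaseIO):
    # asv runs setup plus each time_* method once per entry in the
    # params cross-product: 2 separators x 2 nrows = 4 combinations.
    fname = '__example__.csv'
    params = ([',', '|'], [100, 10000])
    param_names = ['sep', 'nrows']

    def setup(self, sep, nrows):
        DataFrame(np.random.randn(10000, 4)).to_csv(self.fname, sep=sep)

    def time_read_csv(self, sep, nrows):
        read_csv(self.fname, sep=sep, nrows=nrows)

With the benchmarks grouped under asv_bench/benchmarks/io/, an invocation along the lines of "asv continuous -f 1.1 upstream/master HEAD -b ^io\." should select only these modules, though the exact command depends on the local asv setup.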