From dd369e037bca53dccd48a45f7dee6bdf0b2e7e2f Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sun, 12 May 2019 23:22:16 +0300 Subject: [PATCH 1/4] added 'dayfirst' parameter to 'ReadCSVParseSpecialDate' benchmark --- asv_bench/benchmarks/io/csv.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index c51fb09ad8671..818ce2ec3c0dc 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -272,22 +272,22 @@ def mem_parser_chunks(self): class ReadCSVParseSpecialDate(StringIORewind): - params = (['mY', 'mdY', 'hm'],) - params_name = ['value'] + params = (['mY', 'mdY', 'hm'], [False, True]) + params_name = ['value', 'dayfirst'] objects = { 'mY': '01-2019\n10-2019\n02/2000\n', 'mdY': '12/02/2010\n', 'hm': '21:34\n' } - def setup(self, value): + def setup(self, value, dayfirst): count_elem = 10000 data = self.objects[value] * count_elem self.StringIO_input = StringIO(data) - def time_read_special_date(self, value): + def time_read_special_date(self, value, dayfirst): read_csv(self.data(self.StringIO_input), sep=',', header=None, - names=['Date'], parse_dates=['Date']) + names=['Date'], parse_dates=['Date'], dayfirst=dayfirst) from ..pandas_vb_common import setup # noqa: F401 From 3b1034b8a72f9516bc807f8c233a2a45ae56a504 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sun, 12 May 2019 23:26:48 +0300 Subject: [PATCH 2/4] added benchmark for 'to_datetime' function with '%d-%m-%Y' format --- asv_bench/benchmarks/timeseries.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index eea1df35c7711..23abee5651a84 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -354,8 +354,10 @@ def time_infer_quarter(self): class ToDatetimeFormat: def setup(self): - self.s = Series(['19MAY11', '19MAY11:00:00:00'] * 100000) + self.count = 100000 + self.s = Series(['19MAY11', '19MAY11:00:00:00'] * self.count) self.s2 = self.s.str.replace(':\\S+$', '') + self.s3 = Series(['02-11-2000'] * self.count) def time_exact(self): to_datetime(self.s2, format='%d%b%y') @@ -363,6 +365,8 @@ def time_exact(self): def time_no_exact(self): to_datetime(self.s, format='%d%b%y', exact=False) + def time_DDMMYYYY(self): + to_datetime(self.s3, format='%d-%m-%Y') class ToDatetimeCache: From 31aa606ad7094128e0030dc134bfe15b1baef72d Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 14 May 2019 18:32:08 +0300 Subject: [PATCH 3/4] created 'ParseDateComparison' class for asv testing --- asv_bench/benchmarks/io/csv.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 818ce2ec3c0dc..df3ba3985304a 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -3,7 +3,7 @@ import numpy as np import pandas.util.testing as tm -from pandas import DataFrame, Categorical, date_range, read_csv +from pandas import DataFrame, Categorical, date_range, read_csv, to_datetime from pandas.io.parsers import _parser_defaults from io import StringIO @@ -273,7 +273,7 @@ def mem_parser_chunks(self): class ReadCSVParseSpecialDate(StringIORewind): params = (['mY', 'mdY', 'hm'], [False, True]) - params_name = ['value', 'dayfirst'] + param_names = ['value', 'dayfirst'] objects = { 'mY': '01-2019\n10-2019\n02/2000\n', 'mdY': '12/02/2010\n', @@ -290,4 +290,29 @@ def time_read_special_date(self, value, dayfirst): names=['Date'], parse_dates=['Date'], dayfirst=dayfirst) +class ParseDateComparison(StringIORewind): + params = ([False, True],) + param_names = ['cache_dates'] + + def setup(self, cache_dates): + count_elem = 10000 + data = '12-02-2010\n' * count_elem + self.StringIO_input = StringIO(data) + + def time_read_csv_dayfirst(self, cache_dates): + read_csv(self.data(self.StringIO_input), sep=',', header=None, + names=['Date'], parse_dates=['Date'], cache_dates=cache_dates, + dayfirst=True) + + def time_to_datetime_dayfirst(self, cache_dates): + df = read_csv(self.data(self.StringIO_input), + dtype={'date': str}, names=['date']) + to_datetime(df['date'], cache=cache_dates, dayfirst=True) + + def time_to_datetime_format_DD_MM_YYYY(self, cache_dates): + df = read_csv(self.data(self.StringIO_input), + dtype={'date': str}, names=['date']) + to_datetime(df['date'], cache=cache_dates, format='%d-%m-%Y') + + from ..pandas_vb_common import setup # noqa: F401 From e33284d0f3db92d70854751e4b95b7ee507d1f15 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 14 May 2019 18:39:40 +0300 Subject: [PATCH 4/4] rollback first benchmarks --- asv_bench/benchmarks/io/csv.py | 10 +++++----- asv_bench/benchmarks/timeseries.py | 6 +----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index df3ba3985304a..a12e603a6017f 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -272,22 +272,22 @@ def mem_parser_chunks(self): class ReadCSVParseSpecialDate(StringIORewind): - params = (['mY', 'mdY', 'hm'], [False, True]) - param_names = ['value', 'dayfirst'] + params = (['mY', 'mdY', 'hm'],) + param_names = ['value'] objects = { 'mY': '01-2019\n10-2019\n02/2000\n', 'mdY': '12/02/2010\n', 'hm': '21:34\n' } - def setup(self, value, dayfirst): + def setup(self, value): count_elem = 10000 data = self.objects[value] * count_elem self.StringIO_input = StringIO(data) - def time_read_special_date(self, value, dayfirst): + def time_read_special_date(self, value): read_csv(self.data(self.StringIO_input), sep=',', header=None, - names=['Date'], parse_dates=['Date'], dayfirst=dayfirst) + names=['Date'], parse_dates=['Date']) class ParseDateComparison(StringIORewind): diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 23abee5651a84..eea1df35c7711 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -354,10 +354,8 @@ def time_infer_quarter(self): class ToDatetimeFormat: def setup(self): - self.count = 100000 - self.s = Series(['19MAY11', '19MAY11:00:00:00'] * self.count) + self.s = Series(['19MAY11', '19MAY11:00:00:00'] * 100000) self.s2 = self.s.str.replace(':\\S+$', '') - self.s3 = Series(['02-11-2000'] * self.count) def time_exact(self): to_datetime(self.s2, format='%d%b%y') @@ -365,8 +363,6 @@ def time_exact(self): def time_no_exact(self): to_datetime(self.s, format='%d%b%y', exact=False) - def time_DDMMYYYY(self): - to_datetime(self.s3, format='%d-%m-%Y') class ToDatetimeCache: