diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 2811e31128156..01279da3b5796 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1125,6 +1125,7 @@ Bug Fixes - Bug in ``Categorical.from_codes()`` where an unhelpful error was raised when an invalid ``ordered`` parameter was passed in (:issue:`14058`) - Bug in ``Series`` construction from a tuple of integers on windows not returning default dtype (int64) (:issue:`13646`) +- Bug in ``pd.read_csv()`` where index columns that should be parsed as dates were parsed incorrectly when a ``thousands`` parameter was also specified (:issue:`14066`) - Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) - Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e765ebc36e33e..62f2ad1419d92 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1474,6 +1474,13 @@ def _set(x): else: _set(val) + elif self.parse_dates: + if isinstance(self.index_col, list): + for k in self.index_col: + _set(k) + elif self.index_col is not None: + _set(self.index_col) + def set_error_bad_lines(self, status): self._reader.set_error_bad_lines(int(status)) @@ -1856,6 +1863,14 @@ def _set(x): _set(k) else: _set(val) + + elif self.parse_dates: + if isinstance(self.index_col, list): + for k in self.index_col: + _set(k) + elif self.index_col is not None: + _set(self.index_col) + return noconvert_columns def _make_reader(self, f): diff --git a/pandas/io/tests/parser/parse_dates.py b/pandas/io/tests/parser/parse_dates.py index 01816bde66120..09f524590eca5 100644 --- a/pandas/io/tests/parser/parse_dates.py +++ b/pandas/io/tests/parser/parse_dates.py @@ -474,3 +474,35 @@ def test_parse_dates_empty_string(self): result = self.read_csv(StringIO(data), parse_dates=["Date"], na_filter=False) self.assertTrue(result['Date'].isnull()[1]) + + def 
test_parse_dates_noconvert_thousands(self): + # see gh-14066 + data = 'a\n04.15.2016' + + expected = DataFrame([datetime(2016, 4, 15)], columns=['a']) + result = self.read_csv(StringIO(data), parse_dates=['a'], + thousands='.') + tm.assert_frame_equal(result, expected) + + exp_index = DatetimeIndex(['2016-04-15'], name='a') + expected = DataFrame(index=exp_index) + result = self.read_csv(StringIO(data), index_col=0, + parse_dates=True, thousands='.') + tm.assert_frame_equal(result, expected) + + data = 'a,b\n04.15.2016,09.16.2013' + + expected = DataFrame([[datetime(2016, 4, 15), + datetime(2013, 9, 16)]], + columns=['a', 'b']) + result = self.read_csv(StringIO(data), parse_dates=['a', 'b'], + thousands='.') + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[datetime(2016, 4, 15), + datetime(2013, 9, 16)]], + columns=['a', 'b']) + expected = expected.set_index(['a', 'b']) + result = self.read_csv(StringIO(data), index_col=[0, 1], + parse_dates=True, thousands='.') + tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py index ac32c20034c66..16a19c50be960 100644 --- a/pandas/io/tests/parser/usecols.py +++ b/pandas/io/tests/parser/usecols.py @@ -5,13 +5,12 @@ for all of the parsers defined in parsers.py """ -from datetime import datetime import nose import numpy as np import pandas.util.testing as tm -from pandas import DataFrame +from pandas import DataFrame, Index from pandas.lib import Timestamp from pandas.compat import StringIO @@ -99,35 +98,31 @@ def test_usecols_index_col_False(self): def test_usecols_index_col_conflict(self): # see gh-4201: test that index_col as integer reflects usecols - data = """SecId,Time,Price,P2,P3 -10000,2013-5-11,100,10,1 -500,2013-5-12,101,11,1 -""" - expected = DataFrame({'Price': [100, 101]}, index=[ - datetime(2013, 5, 11), datetime(2013, 5, 12)]) - expected.index.name = 'Time' + data = 'a,b,c,d\nA,a,1,one\nB,b,2,two' + expected = 
DataFrame({'c': [1, 2]}, index=Index( + ['a', 'b'], name='b')) - df = self.read_csv(StringIO(data), usecols=[ - 'Time', 'Price'], parse_dates=True, index_col=0) + df = self.read_csv(StringIO(data), usecols=['b', 'c'], + index_col=0) tm.assert_frame_equal(expected, df) - df = self.read_csv(StringIO(data), usecols=[ - 'Time', 'Price'], parse_dates=True, index_col='Time') + df = self.read_csv(StringIO(data), usecols=['b', 'c'], + index_col='b') tm.assert_frame_equal(expected, df) - df = self.read_csv(StringIO(data), usecols=[ - 1, 2], parse_dates=True, index_col='Time') + df = self.read_csv(StringIO(data), usecols=[1, 2], + index_col='b') tm.assert_frame_equal(expected, df) - df = self.read_csv(StringIO(data), usecols=[ - 1, 2], parse_dates=True, index_col=0) + df = self.read_csv(StringIO(data), usecols=[1, 2], + index_col=0) tm.assert_frame_equal(expected, df) expected = DataFrame( - {'P3': [1, 1], 'Price': (100, 101), 'P2': (10, 11)}) - expected = expected.set_index(['Price', 'P2']) - df = self.read_csv(StringIO(data), usecols=[ - 'Price', 'P2', 'P3'], parse_dates=True, index_col=['Price', 'P2']) + {'b': ['a', 'b'], 'c': [1, 2], 'd': ('one', 'two')}) + expected = expected.set_index(['b', 'c']) + df = self.read_csv(StringIO(data), usecols=['b', 'c', 'd'], + index_col=['b', 'c']) tm.assert_frame_equal(expected, df) def test_usecols_implicit_index_col(self):