Skip to content

Commit bbfb95d

Browse files
committed
BUG: fix usecols handling when the file has an implicit first index column. Closes #2654
1 parent e430ac4 commit bbfb95d

File tree

3 files changed

+36
-16
lines changed

3 files changed

+36
-16
lines changed

RELEASE.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,8 @@ pandas 0.10.1
107107
- Prevent MemoryError when using counting sort in sortlevel with
108108
high-cardinality MultiIndex objects (GH2684_)
109109
- Fix Period resampling bug when all values fall into a single bin (GH2070_)
110+
- Fix buggy interaction with usecols argument in read_csv when there is an
111+
implicit first index column (GH2654_)
110112

111113
**API Changes**
112114

@@ -136,6 +138,7 @@ pandas 0.10.1
136138
.. _GH2637: https://github.com/pydata/pandas/issues/2637
137139
.. _GH2643: https://github.com/pydata/pandas/issues/2643
138140
.. _GH2649: https://github.com/pydata/pandas/issues/2649
141+
.. _GH2654: https://github.com/pydata/pandas/issues/2654
139142
.. _GH2668: https://github.com/pydata/pandas/issues/2668
140143
.. _GH2684: https://github.com/pydata/pandas/issues/2684
141144
.. _GH2689: https://github.com/pydata/pandas/issues/2689

pandas/io/tests/test_parsers.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1855,6 +1855,16 @@ def test_usecols(self):
18551855
self.assertRaises(ValueError, self.read_csv, StringIO(data),
18561856
names=['a', 'b'], usecols=[1], header=None)
18571857

1858+
def test_usecols_implicit_index_col(self):
1859+
# #2654
1860+
data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10'
1861+
1862+
result = self.read_csv(StringIO(data), usecols=['a', 'b'])
1863+
expected = DataFrame({'a': ['apple', 'orange'],
1864+
'b': ['bat', 'cow']}, index=[4, 8])
1865+
1866+
tm.assert_frame_equal(result, expected)
1867+
18581868
def test_pure_python_failover(self):
18591869
data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"
18601870

pandas/src/parser.pyx

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -604,7 +604,7 @@ cdef class TextReader:
604604
# Corner case, not enough lines in the file
605605
if self.parser.lines < data_line + 1:
606606
field_count = len(header)
607-
elif not self.has_usecols:
607+
else: # not self.has_usecols:
608608
field_count = self.parser.line_fields[data_line]
609609

610610
passed_count = len(header)
@@ -614,15 +614,18 @@ cdef class TextReader:
614614
'data has %d fields'
615615
% (passed_count, field_count))
616616

617+
if self.has_usecols:
618+
nuse = len(self.usecols)
619+
if nuse == passed_count:
620+
self.leading_cols = 0
621+
elif self.names is None and nuse < passed_count:
622+
self.leading_cols = field_count - passed_count
623+
elif passed_count != field_count:
624+
raise ValueError('Passed header names '
625+
'mismatches usecols')
617626
# oh boy, #2442
618-
if self.allow_leading_cols:
627+
elif self.allow_leading_cols:
619628
self.leading_cols = field_count - passed_count
620-
else:
621-
# TODO: some better check here
622-
# field_count = len(header)
623-
n = len(header)
624-
if n != field_count and n != len(self.usecols):
625-
raise ValueError('Passed header names mismatches usecols')
626629

627630
return header, field_count
628631

@@ -795,11 +798,14 @@ cdef class TextReader:
795798
results = {}
796799
nused = 0
797800
for i in range(self.table_width):
798-
name = self._get_column_name(i, nused)
799-
800-
if self.has_usecols and not (i in self.usecols or
801-
name in self.usecols):
802-
continue
801+
if i < self.leading_cols:
802+
# Pass through leading columns always
803+
name = i
804+
else:
805+
name = self._get_column_name(i, nused)
806+
if self.has_usecols and not (i in self.usecols or
807+
name in self.usecols):
808+
continue
803809

804810
conv = self._get_converter(i, name)
805811

@@ -837,8 +843,9 @@ cdef class TextReader:
837843

838844
results[i] = col_res
839845

840-
# number of used columns
841-
nused += 1
846+
# number of used column names
847+
if i > self.leading_cols:
848+
nused += 1
842849

843850
self.parser_start += end - start
844851

@@ -1013,7 +1020,7 @@ cdef class TextReader:
10131020
if len(self.names) == len(self.usecols):
10141021
return self.names[nused]
10151022
else:
1016-
return self.names[i]
1023+
return self.names[i - self.leading_cols]
10171024
else:
10181025
if self.header is not None:
10191026
j = i - self.leading_cols

0 commit comments

Comments
 (0)