BUG: usecols bug with implicit first index column. close #2654

wesm · wesm · commit bbfb95d5d14e · 2013-01-21T14:07:41.000-05:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -107,6 +107,8 @@ pandas 0.10.1
   - Prevent MemoryError when using counting sort in sortlevel with
     high-cardinality MultiIndex objects (GH2684_)
   - Fix Period resampling bug when all values fall into a single bin (GH2070_)
+  - Fix buggy interaction with usecols argument in read_csv when there is an
+    implicit first index column (GH2654_)
 
 **API Changes**
 
@@ -136,6 +138,7 @@ pandas 0.10.1
 .. _GH2637: https://github.com/pydata/pandas/issues/2637
 .. _GH2643: https://github.com/pydata/pandas/issues/2643
 .. _GH2649: https://github.com/pydata/pandas/issues/2649
+.. _GH2654: https://github.com/pydata/pandas/issues/2654
 .. _GH2668: https://github.com/pydata/pandas/issues/2668
 .. _GH2684: https://github.com/pydata/pandas/issues/2684
 .. _GH2689: https://github.com/pydata/pandas/issues/2689
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -1855,6 +1855,16 @@ def test_usecols(self):
         self.assertRaises(ValueError, self.read_csv, StringIO(data),
                           names=['a', 'b'], usecols=[1], header=None)
 
+    def test_usecols_implicit_index_col(self):
+        # GH #2654: rows have one more field than the header -> implicit index
+        data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10'
+
+        result = self.read_csv(StringIO(data), usecols=['a', 'b'])
+        expected = DataFrame({'a': ['apple', 'orange'],
+                              'b': ['bat', 'cow']}, index=[4, 8])
+
+        tm.assert_frame_equal(result, expected)
+
     def test_pure_python_failover(self):
         data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"
 
diff --git a/pandas/src/parser.pyx b/pandas/src/parser.pyx
@@ -604,7 +604,7 @@ cdef class TextReader:
         # Corner case, not enough lines in the file
         if self.parser.lines < data_line + 1:
             field_count = len(header)
-        elif not self.has_usecols:
+        else: # not self.has_usecols:
             field_count = self.parser.line_fields[data_line]
 
             passed_count = len(header)
@@ -614,15 +614,18 @@ cdef class TextReader:
                                    'data has %d fields'
                                    % (passed_count, field_count))
 
+            if self.has_usecols:
+                nuse = len(self.usecols)
+                if nuse == passed_count:
+                    self.leading_cols = 0
+                elif self.names is None and nuse < passed_count:
+                    self.leading_cols = field_count - passed_count
+                elif passed_count != field_count:
+                    raise ValueError('Passed header names '
+                                     'mismatches usecols')
             # oh boy, #2442
-            if self.allow_leading_cols:
+            elif self.allow_leading_cols:
                 self.leading_cols = field_count - passed_count
-        else:
-            # TODO: some better check here
-            # field_count = len(header)
-            n = len(header)
-            if n != field_count and n != len(self.usecols):
-                raise ValueError('Passed header names mismatches usecols')
 
         return header, field_count
 
@@ -795,11 +798,14 @@ cdef class TextReader:
         results = {}
         nused = 0
         for i in range(self.table_width):
-            name = self._get_column_name(i, nused)
-
-            if self.has_usecols and not (i in self.usecols or
-                                         name in self.usecols):
-                continue
+            if i < self.leading_cols:
+                # Pass through leading columns always
+                name = i
+            else:
+                name = self._get_column_name(i, nused)
+                if self.has_usecols and not (i in self.usecols or
+                                             name in self.usecols):
+                    continue
 
             conv = self._get_converter(i, name)
 
@@ -837,8 +843,9 @@ cdef class TextReader:
 
             results[i] = col_res
 
-            # number of used columns
-            nused += 1
+            # number of used named columns (i == leading_cols is the first)
+            if i >= self.leading_cols:
+                nused += 1
 
         self.parser_start += end - start
 
@@ -1013,7 +1020,7 @@ cdef class TextReader:
             if len(self.names) == len(self.usecols):
                 return self.names[nused]
             else:
-                return self.names[i]
+                return self.names[i - self.leading_cols]
         else:
             if self.header is not None:
                 j = i - self.leading_cols