diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 2545407ce43c9..d9983759083ca 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -55,6 +55,12 @@ API changes +- ``CParserError`` is now a ``ValueError`` instead of just an ``Exception`` (:issue:`12551`) + + + + + @@ -95,6 +101,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- ``usecols`` parameter in ``pd.read_csv`` is now respected even when the lines of a CSV file contain differing numbers of fields (:issue:`12203`) - Bug in ``Period`` and ``PeriodIndex`` creation raises ``KeyError`` if ``freq="Minute"`` is specified. Note that "Minute" freq is deprecated in v0.17.0, and recommended to use ``freq="T"`` instead (:issue:`11854`) - Bug in printing data which contains ``Period`` with different ``freq`` raises ``ValueError`` (:issue:`12615`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index fa9a5cf12570d..36a9abdfbca60 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1984,7 +1984,9 @@ def _rows_to_cols(self, content): raise ValueError('skip footer cannot be negative') # Loop through rows to verify lengths are correct. 
- if col_len != zip_len and self.index_col is not False: + if (col_len != zip_len and + self.index_col is not False and + self.usecols is None): i = 0 for (i, l) in enumerate(content): if len(l) != col_len: diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 700ec3387d459..9f53fc1ded882 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -2664,6 +2664,37 @@ def test_empty_header_read(count): for count in range(1, 101): test_empty_header_read(count) + def test_uneven_lines_with_usecols(self): + # See gh-12203 + csv = r"""a,b,c + 0,1,2 + 3,4,5,6,7 + 8,9,10 + """ + + # make sure that an error is still thrown + # when the 'usecols' parameter is not provided + msg = "Expected \d+ fields in line \d+, saw \d+" + with tm.assertRaisesRegexp(ValueError, msg): + df = self.read_csv(StringIO(csv)) + + expected = DataFrame({ + 'a': [0, 3, 8], + 'b': [1, 4, 9] + }) + + usecols = [0, 1] + df = self.read_csv(StringIO(csv), usecols=usecols) + tm.assert_frame_equal(df, expected) + + usecols = ['a', 1] + df = self.read_csv(StringIO(csv), usecols=usecols) + tm.assert_frame_equal(df, expected) + + usecols = ['a', 'b'] + df = self.read_csv(StringIO(csv), usecols=usecols) + tm.assert_frame_equal(df, expected) + class TestPythonParser(ParserTests, tm.TestCase): diff --git a/pandas/parser.pyx b/pandas/parser.pyx index f9b8d921f02d1..e2ba8d9d07ae2 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -143,6 +143,8 @@ cdef extern from "parser/tokenizer.h": int allow_embedded_newline int strict # raise exception on bad CSV */ + int usecols + int expected_fields int error_bad_lines int warn_bad_lines @@ -350,6 +352,8 @@ cdef class TextReader: self.compression = compression self.memory_map = memory_map + self.parser.usecols = (usecols is not None) + self._setup_parser_source(source) parser_set_default_options(self.parser) @@ -1208,7 +1212,7 @@ cdef class TextReader: else: return None -class CParserError(Exception): +class 
CParserError(ValueError): pass diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index dae15215929b7..a75ce2bde80e6 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -494,7 +494,8 @@ static int end_line(parser_t *self) { /* printf("Line: %d, Fields: %d, Ex-fields: %d\n", self->lines, fields, ex_fields); */ if (!(self->lines <= self->header_end + 1) - && (self->expected_fields < 0 && fields > ex_fields)) { + && (self->expected_fields < 0 && fields > ex_fields) + && !(self->usecols)) { // increment file line count self->file_lines++; diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index a2d7925df08e2..2d1b7fae58714 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -184,6 +184,8 @@ typedef struct parser_t { int allow_embedded_newline; int strict; /* raise exception on bad CSV */ + int usecols; // Boolean: 1: usecols provided, 0: none provided + int expected_fields; int error_bad_lines; int warn_bad_lines;