diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 3a04b306fefdb..d2550f9aaf5ca 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -657,6 +657,7 @@ I/O - Column headers are dropped when constructing a :class:`DataFrame` from a sqlalchemy's ``Row`` object (:issue:`40682`) - Bug in unpickling a :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`) - Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raising uncontrolled ``IndexError`` (:issue:`43102`) +- Bug in :func:`read_csv` raising ``ParserError`` when reading file in chunks and some chunk blocks have fewer columns than header for ``engine="c"`` (:issue:`21211`) - Bug in :func:`read_csv`, changed exception class when expecting a file path name or file-like object from ``OSError`` to ``TypeError`` (:issue:`43366`) - Bug in :func:`read_csv` and :func:`read_fwf` ignoring all ``skiprows`` except first when ``nrows`` is specified for ``engine='python'`` (:issue:`44021`, :issue:`10261`) - Bug in :func:`read_json` not handling non-numpy dtypes correctly (especially ``category``) (:issue:`21892`, :issue:`33205`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index d2975f83b97d7..5a6ad2194f99f 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -926,12 +926,19 @@ cdef class TextReader: self.parser.line_fields[i] + \ (num_cols >= self.parser.line_fields[i]) * num_cols - if self.table_width - self.leading_cols > num_cols: - raise ParserError(f"Too many columns specified: expected " - f"{self.table_width - self.leading_cols} " - f"and found {num_cols}") + usecols_not_callable_and_exists = not callable(self.usecols) and self.usecols + names_larger_num_cols = (self.names and + len(self.names) - self.leading_cols > num_cols) - if (self.usecols is not None and not callable(self.usecols) and + if self.table_width - self.leading_cols > num_cols: + if 
(usecols_not_callable_and_exists + and self.table_width - self.leading_cols < len(self.usecols) + or names_larger_num_cols): + raise ParserError(f"Too many columns specified: expected " + f"{self.table_width - self.leading_cols} " + f"and found {num_cols}") + + if (usecols_not_callable_and_exists and all(isinstance(u, int) for u in self.usecols)): missing_usecols = [col for col in self.usecols if col >= num_cols] if missing_usecols: diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index e8a8769bc6291..4c26047d98acc 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -248,3 +248,48 @@ def test_read_csv_memory_growth_chunksize(all_parsers): with parser.read_csv(path, chunksize=20) as result: for _ in result: pass + + +def test_chunksize_with_usecols_second_block_shorter(all_parsers): + # GH#21211 + parser = all_parsers + data = """1,2,3,4 +5,6,7,8 +9,10,11 +""" + + result_chunks = parser.read_csv( + StringIO(data), + names=["a", "b"], + chunksize=2, + usecols=[0, 1], + header=None, + ) + + expected_frames = [ + DataFrame({"a": [1, 5], "b": [2, 6]}), + DataFrame({"a": [9], "b": [10]}, index=[2]), + ] + + for i, result in enumerate(result_chunks): + tm.assert_frame_equal(result, expected_frames[i]) + + +def test_chunksize_second_block_shorter(all_parsers): + # GH#21211 + parser = all_parsers + data = """a,b,c,d +1,2,3,4 +5,6,7,8 +9,10,11 +""" + + result_chunks = parser.read_csv(StringIO(data), chunksize=2) + + expected_frames = [ + DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), + DataFrame({"a": [9], "b": [10], "c": [11], "d": [np.nan]}, index=[2]), + ] + + for i, result in enumerate(result_chunks): + tm.assert_frame_equal(result, expected_frames[i])