Skip to content

Commit 2d3f3df

Browse files
authored
BUG: read_csv raising ParserError when some chunks have less columns than header (#44644)
1 parent 6b53d07 commit 2d3f3df

File tree

3 files changed

+58
-5
lines changed

3 files changed

+58
-5
lines changed

doc/source/whatsnew/v1.4.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -657,6 +657,7 @@ I/O
657657
- Column headers are dropped when constructing a :class:`DataFrame` from a sqlalchemy ``Row`` object (:issue:`40682`)
658658
- Bug in unpickling a :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`)
659659
- Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raising uncontrolled ``IndexError`` (:issue:`43102`)
660+
- Bug in :func:`read_csv` raising ``ParserError`` when reading a file in chunks and some chunk blocks have fewer columns than the header for ``engine="c"`` (:issue:`21211`)
660661
- Bug in :func:`read_csv`, changed exception class when expecting a file path name or file-like object from ``OSError`` to ``TypeError`` (:issue:`43366`)
661662
- Bug in :func:`read_csv` and :func:`read_fwf` ignoring all ``skiprows`` except first when ``nrows`` is specified for ``engine='python'`` (:issue:`44021`, :issue:`10261`)
662663
- Bug in :func:`read_csv` keeping the original column in object format when ``keep_date_col=True`` is set (:issue:`13378`)

pandas/_libs/parsers.pyx

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -926,12 +926,19 @@ cdef class TextReader:
926926
self.parser.line_fields[i] + \
927927
(num_cols >= self.parser.line_fields[i]) * num_cols
928928

929-
if self.table_width - self.leading_cols > num_cols:
930-
raise ParserError(f"Too many columns specified: expected "
931-
f"{self.table_width - self.leading_cols} "
932-
f"and found {num_cols}")
929+
usecols_not_callable_and_exists = not callable(self.usecols) and self.usecols
930+
names_larger_num_cols = (self.names and
931+
len(self.names) - self.leading_cols > num_cols)
933932

934-
if (self.usecols is not None and not callable(self.usecols) and
933+
if self.table_width - self.leading_cols > num_cols:
934+
if (usecols_not_callable_and_exists
935+
and self.table_width - self.leading_cols < len(self.usecols)
936+
or names_larger_num_cols):
937+
raise ParserError(f"Too many columns specified: expected "
938+
f"{self.table_width - self.leading_cols} "
939+
f"and found {num_cols}")
940+
941+
if (usecols_not_callable_and_exists and
935942
all(isinstance(u, int) for u in self.usecols)):
936943
missing_usecols = [col for col in self.usecols if col >= num_cols]
937944
if missing_usecols:

pandas/tests/io/parser/common/test_chunksize.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,3 +248,48 @@ def test_read_csv_memory_growth_chunksize(all_parsers):
248248
with parser.read_csv(path, chunksize=20) as result:
249249
for _ in result:
250250
pass
251+
252+
253+
def test_chunksize_with_usecols_second_block_shorter(all_parsers):
254+
# GH#21211
255+
parser = all_parsers
256+
data = """1,2,3,4
257+
5,6,7,8
258+
9,10,11
259+
"""
260+
261+
result_chunks = parser.read_csv(
262+
StringIO(data),
263+
names=["a", "b"],
264+
chunksize=2,
265+
usecols=[0, 1],
266+
header=None,
267+
)
268+
269+
expected_frames = [
270+
DataFrame({"a": [1, 5], "b": [2, 6]}),
271+
DataFrame({"a": [9], "b": [10]}, index=[2]),
272+
]
273+
274+
for i, result in enumerate(result_chunks):
275+
tm.assert_frame_equal(result, expected_frames[i])
276+
277+
278+
def test_chunksize_second_block_shorter(all_parsers):
279+
# GH#21211
280+
parser = all_parsers
281+
data = """a,b,c,d
282+
1,2,3,4
283+
5,6,7,8
284+
9,10,11
285+
"""
286+
287+
result_chunks = parser.read_csv(StringIO(data), chunksize=2)
288+
289+
expected_frames = [
290+
DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
291+
DataFrame({"a": [9], "b": [10], "c": [11], "d": [np.nan]}, index=[2]),
292+
]
293+
294+
for i, result in enumerate(result_chunks):
295+
tm.assert_frame_equal(result, expected_frames[i])

0 commit comments

Comments
 (0)