diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 58c07a14dec39..cff65967c4229 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -284,7 +284,7 @@ I/O - Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`) - Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`) - Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`) - +- Bug in :func:`read_csv` which would not raise a ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`) Plotting ^^^^^^^^ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5400d9bc60218..456ccf9944dbf 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1901,6 +1901,12 @@ def __init__(self, src, **kwds): not set(usecols).issubset(self.orig_names)): _validate_usecols_names(usecols, self.orig_names) + # GH 25623 + # validate that column indices in usecols are not out of bounds + elif self.usecols_dtype == 'integer': + indices = lrange(self._reader.table_width) + _validate_usecols_names(usecols, indices) + if len(self.names) > len(usecols): self.names = [n for i, n in enumerate(self.names) if (i in usecols or n in usecols)] @@ -2204,7 +2210,8 @@ def __init__(self, f, **kwds): self.skipinitialspace = kwds['skipinitialspace'] self.lineterminator = kwds['lineterminator'] self.quoting = kwds['quoting'] - self.usecols, _ = _validate_usecols_arg(kwds['usecols']) + self.usecols, self.usecols_dtype = _validate_usecols_arg( + kwds['usecols']) self.skip_blank_lines = kwds['skip_blank_lines'] self.warn_bad_lines = kwds['warn_bad_lines'] @@ -2597,6 +2604,13 @@ def _infer_columns(self): if clear_buffer: self._clear_buffer() + # GH 25623 + # validate that column indices in usecols are 
not out of bounds + if self.usecols_dtype == 'integer': + for col in columns: + indices = lrange(len(col)) + _validate_usecols_names(self.usecols, indices) + if names is not None: if ((self.usecols is not None and len(names) != len(self.usecols)) or @@ -2632,6 +2646,11 @@ def _infer_columns(self): ncols = len(line) num_original_columns = ncols + # GH 25623 + # validate that column indices in usecols are not out of bounds + if self.usecols_dtype == 'integer': + _validate_usecols_names(self.usecols, lrange(ncols)) + if not names: if self.prefix: columns = [['{prefix}{idx}'.format( diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index 652f78d198ee8..e513f2d755d07 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -21,6 +21,22 @@ "expected but not found: {0}") +@pytest.mark.parametrize("names,usecols,missing", [ + (None, [0, 3], r"\[3\]"), + (["a", "b", "c"], [0, -1, 2], r"\[-1\]"), + (None, [3], r"\[3\]"), + (["a"], [3], r"\[3\]") +]) +def test_usecols_out_of_bounds(all_parsers, names, usecols, missing): + # See gh-25623 + data = "a,b,c\n1,2,3\n4,5,6" + parser = all_parsers + + mssg = _msg_validate_usecols_names.format(missing) + with pytest.raises(ValueError, match=mssg): + parser.read_csv(StringIO(data), usecols=usecols, names=names) + + def test_raise_on_mixed_dtype_usecols(all_parsers): # See gh-12678 data = """a,b,c