Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,8 @@ Performance Improvements

Bug Fixes
~~~~~~~~~

- Bug in :func:`to_datetime` which would raise an (incorrect) ``ValueError`` when called with a date far into the future and the ``format`` argument specified instead of raising ``OutOfBoundsDatetime`` (:issue:`23830`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove these; these have already been moved

- Bug in an error message in :meth:`DataFrame.plot`. Improved the error message if non-numerics are passed to :meth:`DataFrame.plot` (:issue:`25481`)

Categorical
^^^^^^^^^^^
Expand Down Expand Up @@ -284,7 +285,7 @@ I/O
- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`)
- Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`)
- Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`)

- Bug in :func:`read_csv` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`)

Plotting
^^^^^^^^
Expand Down
18 changes: 17 additions & 1 deletion pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1901,6 +1901,11 @@ def __init__(self, src, **kwds):
not set(usecols).issubset(self.orig_names)):
_validate_usecols_names(usecols, self.orig_names)

# GH 25623
elif self.usecols_dtype == 'integer':
indices = lrange(self._reader.table_width)
_validate_usecols_names(usecols, indices)

if len(self.names) > len(usecols):
self.names = [n for i, n in enumerate(self.names)
if (i in usecols or n in usecols)]
Expand Down Expand Up @@ -2204,7 +2209,8 @@ def __init__(self, f, **kwds):
self.skipinitialspace = kwds['skipinitialspace']
self.lineterminator = kwds['lineterminator']
self.quoting = kwds['quoting']
self.usecols, _ = _validate_usecols_arg(kwds['usecols'])
self.usecols, self.usecols_dtype = _validate_usecols_arg(
kwds['usecols'])
self.skip_blank_lines = kwds['skip_blank_lines']

self.warn_bad_lines = kwds['warn_bad_lines']
Expand Down Expand Up @@ -2597,6 +2603,12 @@ def _infer_columns(self):
if clear_buffer:
self._clear_buffer()

# GH 25623
if self.usecols_dtype == 'integer':
for col in columns:
indices = lrange(len(col))
_validate_usecols_names(self.usecols, indices)

if names is not None:
if ((self.usecols is not None and
len(names) != len(self.usecols)) or
Expand Down Expand Up @@ -2632,6 +2644,10 @@ def _infer_columns(self):
ncols = len(line)
num_original_columns = ncols

# GH25623
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

here as well

if self.usecols_dtype == 'integer':
_validate_usecols_names(self.usecols, lrange(ncols))

if not names:
if self.prefix:
columns = [['{prefix}{idx}'.format(
Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/io/parser/test_usecols.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,21 @@
"expected but not found: {0}")


@pytest.mark.parametrize("names,usecols,missing", [
    (None, [0, 3], r"\[3\]"),
    (["a", "b", "c"], [0, -1, 2], r"\[-1\]"),
    (None, [3], r"\[3\]"),
    (["a"], [3], r"\[3\]")
])
def test_usecols_out_of_bounds(all_parsers, names, usecols, missing):
    """Check that out-of-range integer ``usecols`` entries raise ValueError.

    Each case reads the same three-column CSV with a ``usecols`` list that
    contains an index not present in the data (with and without explicit
    ``names``), and expects the error message to list the offending index.
    """
    # See gh-25623
    csv_text = "a,b,c\n1,2,3\n4,5,6"

    # ``missing`` is already regex-escaped, so the formatted message can be
    # handed to ``match`` directly.
    expected_msg = _msg_validate_usecols_names.format(missing)

    with pytest.raises(ValueError, match=expected_msg):
        all_parsers.read_csv(StringIO(csv_text), names=names, usecols=usecols)


def test_raise_on_mixed_dtype_usecols(all_parsers):
# See gh-12678
data = """a,b,c
Expand Down