diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 77b689569d57f..467cb5a40213c 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -699,6 +699,7 @@ I/O - Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`) - Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`) - Bug in :meth:`DataFrame.to_html` where header numbers would ignore display options when rounding (:issue:`17280`) +- Bug in :func:`read_hdf` where reading a table from an HDF5 file written directly with PyTables fails with a ``ValueError`` when using a sub-selection via the ``start`` or ``stop`` arguments (:issue:`11188`) - Bug in :func:`read_hdf` not properly closing store after a ``KeyError`` is raised (:issue:`25766`) - Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`) - Improved the explanation for the failure when value labels are repeated in Stata dta files and suggested work-arounds (:issue:`25772`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 79d6d8563a162..17d580bae5cf1 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1624,7 +1624,8 @@ def infer(self, handler): new_self.read_metadata(handler) return new_self - def convert(self, values, nan_rep, encoding, errors): + def convert(self, values, nan_rep, encoding, errors, start=None, + stop=None): """ set the values from this selection: take = take ownership """ # values is a recarray @@ -1813,10 +1814,29 @@ class GenericIndexCol(IndexCol): def is_indexed(self): return False - def convert(self, values, nan_rep, encoding, errors): - """ set the values from this selection: take = take ownership """ + def convert(self, values, nan_rep, encoding, errors, start=None, + stop=None): + """ set the values from this 
selection: take = take ownership + + Parameters + ---------- + + values : np.ndarray + nan_rep : str + encoding : str + errors : str + start : int, optional + Table row number: the start of the sub-selection. + stop : int, optional + Table row number: the exclusive end of the sub-selection. Values + larger than the underlying table's row count are clamped to it. + """ + + start = start if start is not None else 0 + stop = (min(stop, self.table.nrows) + if stop is not None else self.table.nrows) + self.values = Int64Index(np.arange(stop - start)) - self.values = Int64Index(np.arange(self.table.nrows)) return self def get_attr(self): @@ -2159,7 +2179,8 @@ def validate_attr(self, append): raise ValueError("appended items dtype do not match existing " "items dtype in table!") - def convert(self, values, nan_rep, encoding, errors): + def convert(self, values, nan_rep, encoding, errors, start=None, + stop=None): """set the data from this selection (and convert to the correct dtype if we can) """ @@ -3431,8 +3452,11 @@ def read_axes(self, where, **kwargs): # convert the data for a in self.axes: a.set_info(self.info) + # `kwargs` may contain `start` and `stop` arguments if passed to + # `store.select()`. If set they determine the index size. 
a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding, - errors=self.errors) + errors=self.errors, start=kwargs.get('start'), + stop=kwargs.get('stop')) return True diff --git a/pandas/tests/io/pytables/test_compat.py b/pandas/tests/io/pytables/test_compat.py new file mode 100644 index 0000000000000..d74e1218ebdb0 --- /dev/null +++ b/pandas/tests/io/pytables/test_compat.py @@ -0,0 +1,76 @@ +import pytest + +import pandas as pd +from pandas.tests.io.test_pytables import ensure_clean_path +from pandas.util.testing import assert_frame_equal + +tables = pytest.importorskip('tables') + + +@pytest.fixture +def pytables_hdf5_file(): + """Use PyTables to create a simple HDF5 file.""" + + table_schema = { + 'c0': tables.Time64Col(pos=0), + 'c1': tables.StringCol(5, pos=1), + 'c2': tables.Int64Col(pos=2), + } + + t0 = 1561105000.0 + + testsamples = [ + {'c0': t0, 'c1': 'aaaaa', 'c2': 1}, + {'c0': t0 + 1, 'c1': 'bbbbb', 'c2': 2}, + {'c0': t0 + 2, 'c1': 'ccccc', 'c2': 10**5}, + {'c0': t0 + 3, 'c1': 'ddddd', 'c2': 4294967295}, + ] + + objname = 'pandas_test_timeseries' + + with ensure_clean_path('written_with_pytables.h5') as path: + # The `ensure_clean_path` context mgr removes the temp file upon exit. + with tables.open_file(path, mode='w') as f: + t = f.create_table('/', name=objname, description=table_schema) + for sample in testsamples: + for key, value in sample.items(): + t.row[key] = value + t.row.append() + + yield path, objname, pd.DataFrame(testsamples) + + +class TestReadPyTablesHDF5: + """ + A group of tests which covers reading HDF5 files written by plain PyTables + (not written by pandas). + + Was introduced for regression-testing issue 11188. 
+ """ + + def test_read_complete(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + result = pd.read_hdf(path, key=objname) + expected = df + assert_frame_equal(result, expected) + + def test_read_with_start(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + # This is a regression test for pandas-dev/pandas/issues/11188 + result = pd.read_hdf(path, key=objname, start=1) + expected = df[1:].reset_index(drop=True) + assert_frame_equal(result, expected) + + def test_read_with_stop(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + # This is a regression test for pandas-dev/pandas/issues/11188 + result = pd.read_hdf(path, key=objname, stop=1) + expected = df[:1].reset_index(drop=True) + assert_frame_equal(result, expected) + + def test_read_with_startstop(self, pytables_hdf5_file): + path, objname, df = pytables_hdf5_file + # This is a regression test for pandas-dev/pandas/issues/11188 + result = pd.read_hdf(path, key=objname, start=1, stop=2) + expected = df[1:2].reset_index(drop=True) + assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 299c0feb502be..ef9dbc63d873d 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -105,7 +105,7 @@ def ensure_clean_store(path, mode='a', complevel=None, complib=None, def ensure_clean_path(path): """ return essentially a named temporary file that is not opened - and deleted on existing; if path is a list, then create and + and deleted on exiting; if path is a list, then create and return list of filenames """ try: