From a4612085e40dfded55cf00a05b01f789a8bcf784 Mon Sep 17 00:00:00 2001
From: Shawn Heide
Date: Mon, 25 Jul 2016 16:13:16 -0700
Subject: [PATCH] BUG: fix categories in HDFStore not filtering correctly (#13322)

---
Review notes (ignored by git am, not part of the commit message):
  * pandas/computation/pytables.py: the membership check no longer depends
    on searchsorted returning 0, so a value that is missing from the
    categories and sorts between two of them is also mapped to -1.
  * pandas/io/tests/test_pytables.py: adds a where value that sorts between
    the two stored obsids.
  * The blob ids on the "index" lines predate these revisions.

 doc/source/whatsnew/v0.19.0.txt  |  1 +
 pandas/computation/pytables.py   |  6 ++++++
 pandas/io/tests/test_pytables.py | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 43 insertions(+)

diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index 11d2fab464d1f..24c38b5c660a9 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -788,3 +788,4 @@ Bug Fixes
 - Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`)
 
 - Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`)
+- Bug in ``pd.read_hdf()`` where an incorrect result is returned when the ``HDFStore`` contains a ``DataFrame`` with a categorical column and the query doesn't match any values (:issue:`13792`)
diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py
index e375716b0d606..a4dd03a0fa7ee 100644
--- a/pandas/computation/pytables.py
+++ b/pandas/computation/pytables.py
@@ -198,6 +198,12 @@ def stringify(value):
         elif meta == u('category'):
             metadata = com._values_from_object(self.metadata)
             result = metadata.searchsorted(v, side='left')
+
+            # searchsorted returns an insertion point even when v is not one
+            # of the categories, and that position is the code of a different
+            # category; use -1, which is not the position of any category
+            if v not in metadata:
+                result = -1
             return TermValue(result, result, u('integer'))
         elif kind == u('integer'):
             v = int(float(v))
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index f95e764ad4da3..e214ea5237f30 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -4733,6 +4733,42 @@ def test_categorical(self):
             self.assertRaises(
                 KeyError, lambda: store.select('df3/meta/s/meta'))
 
+    def test_categorical_conversion(self):
+
+        # GH13322
+        # Check that read_hdf with categorical columns doesn't return rows if
+        # where criteria isn't met.
+        obsids = ['ESP_012345_6789', 'ESP_987654_3210']
+        imgids = ['APF00006np', 'APF0001imm']
+        data = [4.3, 9.8]
+
+        # 'B' sorts before both obsids and 'ESP_5' sorts between them, so the
+        # lookup is exercised at the first position and at an interior one
+        wheres = ['obsids=B', 'obsids="ESP_5"']
+
+        # Test without categories
+        df = DataFrame(dict(obsids=obsids, imgids=imgids, data=data))
+
+        # We are expecting an empty DataFrame matching types of df
+        expected = df.iloc[[], :]
+        with ensure_clean_path(self.path) as path:
+            df.to_hdf(path, 'df', format='table', data_columns=True)
+            for where in wheres:
+                result = read_hdf(path, 'df', where=where)
+                tm.assert_frame_equal(result, expected)
+
+        # Test with categories
+        df.obsids = df.obsids.astype('category')
+        df.imgids = df.imgids.astype('category')
+
+        # We are expecting an empty DataFrame matching types of df
+        expected = df.iloc[[], :]
+        with ensure_clean_path(self.path) as path:
+            df.to_hdf(path, 'df', format='table', data_columns=True)
+            for where in wheres:
+                result = read_hdf(path, 'df', where=where)
+                tm.assert_frame_equal(result, expected)
+
     def test_duplicate_column_name(self):
         df = DataFrame(columns=["a", "a"], data=[[0, 0]])
 