pandas-dev · shawnheide · Jul 25, 2016
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -788,3 +788,4 @@ Bug Fixes
 - Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`)
 
 - Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`)
+- Bug in ``pd.read_hdf()`` where an incorrect result was returned when the HDF store contains a DataFrame with a categorical column and the query doesn't match any values (:issue:`13792`)
diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py
@@ -198,6 +198,11 @@ def stringify(value):
         elif meta == u('category'):
             metadata = com._values_from_object(self.metadata)
             result = metadata.searchsorted(v, side='left')
+
+            # searchsorted returns 0 both when v is the first category and when
+            # v sorts before every category, so confirm that metadata contains v
+            if not result and v not in metadata:
+                result = -1
             return TermValue(result, result, u('integer'))
         elif kind == u('integer'):
             v = int(float(v))

diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
@@ -4733,6 +4733,36 @@ def test_categorical(self):
             self.assertRaises(
                 KeyError, lambda: store.select('df3/meta/s/meta'))
 
+    def test_categorical_conversion(self):
+
+        # GH13322
+        # Check that read_hdf with categorical columns doesn't return rows if
+        # where criteria isn't met.
+        obsids = ['ESP_012345_6789', 'ESP_987654_3210']
+        imgids = ['APF00006np', 'APF0001imm']
+        data = [4.3, 9.8]
+
+        # Test without categories
+        df = DataFrame(dict(obsids=obsids, imgids=imgids, data=data))
+
+        # We are expecting an empty DataFrame matching types of df
+        expected = df.iloc[[], :]
+        with ensure_clean_path(self.path) as path:
+            df.to_hdf(path, 'df', format='table', data_columns=True)
+            result = read_hdf(path, 'df', where='obsids=B')
+            tm.assert_frame_equal(result, expected)
+
+        # Test with categories
+        df.obsids = df.obsids.astype('category')
+        df.imgids = df.imgids.astype('category')
+
+        # We are expecting an empty DataFrame matching types of df
+        expected = df.iloc[[], :]
+        with ensure_clean_path(self.path) as path:
+            df.to_hdf(path, 'df', format='table', data_columns=True)
+            result = read_hdf(path, 'df', where='obsids=B')
+            tm.assert_frame_equal(result, expected)
+
     def test_duplicate_column_name(self):
         df = DataFrame(columns=["a", "a"], data=[[0, 0]])
Original file line number	Diff line number	Diff line change
Expand Up		@@ -788,3 +788,4 @@ Bug Fixes
		- Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`)

		- Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`)
		- Bug in ``pd.read_hdf()`` where an incorrect result was returned when the HDF store contains a DataFrame with a categorical column and the query doesn't match any values (:issue:`13792`)