diff --git a/doc/source/io.rst b/doc/source/io.rst index ded314229225c..21a5f13d65d73 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2677,6 +2677,19 @@ everything in the sub-store and BELOW, so be *careful*. store.remove('food') store +You can walk through the group hierarchy using the ``walk`` method, which +will yield a tuple for each group key along with the relative keys of its contents. + +.. ipython:: python + + for (path, subgroups, subkeys) in store.walk(): + for subgroup in subgroups: + print('GROUP: {}/{}'.format(path, subgroup)) + for subkey in subkeys: + key = '/'.join([path, subkey]) + print('KEY: {}'.format(key)) + print(store.get(key)) + .. _io.hdf5-types: Storing Mixed Types in a Table diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 31b6bb0d5575d..30a268dd20f5b 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -255,6 +255,8 @@ Other enhancements pd.concat([foo, bar, baz], 1) +- New method ``HDFStore.walk`` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`) + .. _whatsnew_0170.api: .. _whatsnew_0170.api_breaking: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index ea0a59ce2ab31..8d2e3c7ab1e25 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1038,6 +1038,38 @@ def groups(self): g._v_name != u('table'))) ] + def walk(self): + """ Walk the pytables group hierarchy yielding the group name and pandas object names + for each group. Any non-pandas PyTables objects that are not a group will be ignored. 
+ + Returns + ------- + A generator yielding tuples (`path`, `groups`, `leaves`) where: + + - `path` is the full path to a group, + - `groups` is a list of group names contained in `path` + - `leaves` is a list of pandas object names contained in `path` + + """ + _tables() + self._check_if_open() + for g in self._handle.walk_groups(): + if getattr(g._v_attrs, 'pandas_type', None) is not None: + continue + + groups = [] + leaves = [] + for child in g._v_children.values(): + pandas_type = getattr(child._v_attrs, 'pandas_type', None) + if pandas_type is None: + if isinstance(child, _table_mod.group.Group): + groups.append(child._v_name) + else: + leaves.append(child._v_name) + + yield (g._v_pathname.rstrip('/'), groups, leaves) + + def get_node(self, key): """ return the node with the key or None if it does not exist """ self._check_if_open() diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index b4f1e6a429198..74821eadf13a6 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4813,6 +4813,45 @@ def test_read_nokey(self): df.to_hdf(path, 'df2', mode='a') self.assertRaises(ValueError, read_hdf, path) + # GH10143 + def test_walk(self): + + objs = { + 'df1': pd.DataFrame([1,2,3]), + 'df2': pd.DataFrame([4,5,6]), + 'df3': pd.DataFrame([6,7,8]), + 'df4': pd.DataFrame([9,10,11]), + 's1': pd.Series([10,9,8]), + 'a1': np.array([[1,2,3], [4,5,6]]) + } + + with ensure_clean_store('walk_groups.hdf', mode='w') as store: + store.put('/first_group/df1', objs['df1']) + store.put('/first_group/df2', objs['df2']) + store.put('/second_group/df3', objs['df3']) + store.put('/second_group/s1', objs['s1']) + store.put('/second_group/third_group/df4', objs['df4']) + g1 = store._handle.get_node('/first_group') + store._handle.create_array(g1, 'a1', objs['a1']) + + expect = { + '': (set(['first_group', 'second_group']), set()), + '/first_group': (set(), set(['df1', 'df2'])), + '/second_group': (set(['third_group']), 
set(['df3', 's1'])), + '/second_group/third_group': (set(), set(['df4'])), + } + + for path, groups, leaves in store.walk(): + self.assertIn(path, expect) + expect_groups, expect_frames = expect[path] + + self.assertEqual(expect_groups, set(groups)) + self.assertEqual(expect_frames, set(leaves)) + for leaf in leaves: + frame_path = '/'.join([path, leaf]) + df = store.get(frame_path) + self.assert_(df.equals(objs[leaf])) + class TestHDFComplexValues(Base): # GH10447