
Commit 6ffed43

Merge pull request #4698 from adgaudio/master
HDFStore.append_to_multiple doesn't write rows that are all np.nan
2 parents ec3fb68 + e317087

4 files changed (+59 -12 lines)


doc/source/io.rst

Lines changed: 19 additions & 11 deletions
@@ -2170,37 +2170,45 @@ multiple tables at once. The idea is to have one table (call it the
 selector table) that you index most/all of the columns, and perform your
 queries. The other table(s) are data tables with an index matching the
 selector table's index. You can then perform a very fast query
-on the selector table, yet get lots of data back. This method works similar to
-having a very wide table, but is more efficient in terms of queries.
+on the selector table, yet get lots of data back. This method is similar to
+having a very wide table, but enables more efficient queries.
 
-Note, **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**. This
-means, append to the tables in the same order; ``append_to_multiple``
-splits a single object to multiple tables, given a specification (as a
-dictionary). This dictionary is a mapping of the table names to the
-'columns' you want included in that table. Pass a `None` for a single
-table (optional) to let it have the remaining columns. The argument
-``selector`` defines which table is the selector table.
+The ``append_to_multiple`` method splits a given single DataFrame
+into multiple tables according to ``d``, a dictionary that maps the
+table names to a list of 'columns' you want in that table. If `None`
+is used in place of a list, that table will have the remaining
+unspecified columns of the given DataFrame. The argument ``selector``
+defines which table is the selector table (which you can make queries from).
+The argument ``dropna`` will drop rows from the input DataFrame to ensure
+tables are synchronized. This means that if a row for one of the tables
+being written to is entirely ``np.NaN``, that row will be dropped from all tables.
+
+If ``dropna`` is False, **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**.
+Remember that entirely ``np.NaN`` rows are not written to the HDFStore, so if
+you choose to pass ``dropna=False``, some tables may have more rows than others,
+and therefore ``select_as_multiple`` may not work or it may return unexpected
+results.
 
 .. ipython:: python
 
    df_mt = DataFrame(randn(8, 6), index=date_range('1/1/2000', periods=8),
                      columns=['A', 'B', 'C', 'D', 'E', 'F'])
   df_mt['foo'] = 'bar'
+  df_mt.ix[1, ('A', 'B')] = np.nan
 
   # you can also create the tables individually
   store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None },
                            df_mt, selector='df1_mt')
   store
 
-  # indiviual tables were created
+  # individual tables were created
   store.select('df1_mt')
   store.select('df2_mt')
 
   # as a multiple
   store.select_as_multiple(['df1_mt', 'df2_mt'], where=['A>0', 'B>0'],
                            selector = 'df1_mt')
 
-.. _io.hdf5-delete:
 
 Delete from a Table
 ~~~~~~~~~~~~~~~~~~~
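
As a quick end-to-end illustration of the documented behaviour (not part of the commit): a minimal sketch in which the file name 'example_multi.h5' is made up, and ``.loc`` is used in place of the ``.ix`` indexer shown in the docs, which later pandas versions removed.

    import numpy as np
    import pandas as pd

    df_mt = pd.DataFrame(np.random.randn(8, 6),
                         index=pd.date_range('1/1/2000', periods=8),
                         columns=list('ABCDEF'))
    df_mt['foo'] = 'bar'
    # make row 1 entirely NaN for the df1_mt sub-table (its columns are A and B)
    df_mt.loc[df_mt.index[1], ['A', 'B']] = np.nan

    with pd.HDFStore('example_multi.h5', mode='w') as store:
        # dropna=True drops that row from *all* tables, keeping them in sync
        store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None},
                                 df_mt, selector='df1_mt', dropna=True)
        assert store.select('df1_mt').index.equals(store.select('df2_mt').index)

        # query the selector table, pull the matching rows back from both tables
        result = store.select_as_multiple(['df1_mt', 'df2_mt'],
                                          where='A>0 & B>0',
                                          selector='df1_mt')
        print(result)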

doc/source/release.rst

Lines changed: 2 additions & 0 deletions
@@ -95,6 +95,8 @@ pandas 0.13
 
 - ``HDFStore``
 
+  - ``append_to_multiple`` automatically synchronizes writing rows to multiple
+    tables and adds a ``dropna`` kwarg (:issue:`4698`)
   - handle a passed ``Series`` in table format (:issue:`4330`)
   - added an ``is_open`` property to indicate if the underlying file handle is_open;
     a closed store will now report 'CLOSED' when viewing the store (rather than raising an error)

pandas/io/pytables.py

Lines changed: 12 additions & 1 deletion
@@ -786,7 +786,7 @@ def append(self, key, value, format=None, append=True, columns=None, dropna=None
         kwargs = self._validate_format(format, kwargs)
         self._write_to_group(key, value, append=append, dropna=dropna, **kwargs)
 
-    def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, **kwargs):
+    def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, dropna=True, **kwargs):
         """
         Append to multiple tables
 
@@ -798,6 +798,9 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, *
         selector : a string that designates the indexable table; all of its columns will
                    be designed as data_columns, unless data_columns is passed, in which
                    case these are used
+        data_columns : list of columns to create as data columns, or True to use all columns
+        dropna : if evaluates to True, drop rows from all tables if any single
+                 row in each table has all NaN
 
         Notes
         -----
@@ -840,6 +843,14 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, *
         if data_columns is None:
             data_columns = d[selector]
 
+        # ensure rows are synchronized across the tables
+        if dropna:
+            idxs = (value[cols].dropna(how='all').index for cols in d.values())
+            valid_index = next(idxs)
+            for index in idxs:
+                valid_index = valid_index.intersection(index)
+            value = value.ix[valid_index]
+
         # append
         for k, v in d.items():
             dc = data_columns if k == selector else None
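
The hunk above computes, before anything is written, the set of rows that survive in every sub-table. A standalone sketch of that synchronization step, with a made-up frame and table specification ``d``, and ``.loc`` in place of the old ``.ix``:

    import numpy as np
    import pandas as pd

    value = pd.DataFrame(np.random.randn(5, 4), columns=list('ABCD'))
    value.loc[1, ['A', 'B']] = np.nan          # row 1 is all-NaN for table 't1'
    d = {'t1': ['A', 'B'], 't2': ['C', 'D']}   # table name -> column list

    # for each table, keep the index of rows that are not entirely NaN ...
    idxs = (value[cols].dropna(how='all').index for cols in d.values())
    # ... then intersect those indexes and restrict the input frame to the result
    valid_index = next(idxs)
    for index in idxs:
        valid_index = valid_index.intersection(index)
    value = value.loc[valid_index]

    print(value.index)   # row 1 has been dropped for every table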

pandas/io/tests/test_pytables.py

Lines changed: 26 additions & 0 deletions
@@ -2902,6 +2902,32 @@ def test_append_to_multiple(self):
             expected = df[(df.A > 0) & (df.B > 0)]
             tm.assert_frame_equal(result, expected)
 
+    def test_append_to_multiple_dropna(self):
+        df1 = tm.makeTimeDataFrame()
+        df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
+        df1.ix[1, ['A', 'B']] = np.nan
+        df = concat([df1, df2], axis=1)
+
+        with ensure_clean(self.path) as store:
+            # dropna=True should guarantee rows are synchronized
+            store.append_to_multiple(
+                {'df1': ['A', 'B'], 'df2': None}, df, selector='df1',
+                dropna=True)
+            result = store.select_as_multiple(['df1', 'df2'])
+            expected = df.dropna()
+            tm.assert_frame_equal(result, expected)
+            tm.assert_index_equal(store.select('df1').index,
+                                  store.select('df2').index)
+
+            # dropna=False shouldn't synchronize row indexes
+            store.append_to_multiple(
+                {'df1': ['A', 'B'], 'df2': None}, df, selector='df1',
+                dropna=False)
+            self.assertRaises(
+                ValueError, store.select_as_multiple, ['df1', 'df2'])
+            assert not store.select('df1').index.equals(
+                store.select('df2').index)
+
     def test_select_as_multiple(self):
 
         df1 = tm.makeTimeDataFrame()
