
Commit 6ffed43

Merge pull request #4698 from adgaudio/master
HDFStore.append_to_multiple doesn't write rows that are all np.nan
2 parents ec3fb68 + e317087

4 files changed (+59 -12 lines)


doc/source/io.rst

Lines changed: 19 additions & 11 deletions
@@ -2170,37 +2170,45 @@ multiple tables at once. The idea is to have one table (call it the
 selector table) that you index most/all of the columns, and perform your
 queries. The other table(s) are data tables with an index matching the
 selector table's index. You can then perform a very fast query
-on the selector table, yet get lots of data back. This method works similar to
-having a very wide table, but is more efficient in terms of queries.
+on the selector table, yet get lots of data back. This method is similar to
+having a very wide table, but enables more efficient queries.
 
-Note, **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**. This
-means, append to the tables in the same order; ``append_to_multiple``
-splits a single object to multiple tables, given a specification (as a
-dictionary). This dictionary is a mapping of the table names to the
-'columns' you want included in that table. Pass a `None` for a single
-table (optional) to let it have the remaining columns. The argument
-``selector`` defines which table is the selector table.
+The ``append_to_multiple`` method splits a given single DataFrame
+into multiple tables according to ``d``, a dictionary that maps the
+table names to a list of 'columns' you want in that table. If `None`
+is used in place of a list, that table will have the remaining
+unspecified columns of the given DataFrame. The argument ``selector``
+defines which table is the selector table (which you can make queries from).
+The argument ``dropna`` will drop rows from the input DataFrame to ensure
+tables are synchronized. This means that if a row for one of the tables
+being written to is entirely ``np.NaN``, that row will be dropped from all tables.
+
+If ``dropna`` is False, **THE USER IS RESPONSIBLE FOR SYNCHRONIZING THE TABLES**.
+Remember that entirely ``np.NaN`` rows are not written to the HDFStore, so if
+you choose to pass ``dropna=False``, some tables may have more rows than others,
+and therefore ``select_as_multiple`` may not work or it may return unexpected
+results.
 
 .. ipython:: python
 
    df_mt = DataFrame(randn(8, 6), index=date_range('1/1/2000', periods=8),
                      columns=['A', 'B', 'C', 'D', 'E', 'F'])
   df_mt['foo'] = 'bar'
+  df_mt.ix[1, ('A', 'B')] = np.nan
 
   # you can also create the tables individually
   store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None },
                            df_mt, selector='df1_mt')
   store
 
-  # indiviual tables were created
+  # individual tables were created
   store.select('df1_mt')
   store.select('df2_mt')
 
   # as a multiple
   store.select_as_multiple(['df1_mt', 'df2_mt'], where=['A>0', 'B>0'],
                            selector = 'df1_mt')
 
-.. _io.hdf5-delete:
 
 Delete from a Table
 ~~~~~~~~~~~~~~~~~~~
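
As a quick end-to-end illustration of the documented behaviour (not part of the commit): a minimal sketch in which the file name 'example_multi.h5' is made up, and ``.loc`` is used in place of the ``.ix`` indexer shown in the docs, which later pandas versions removed.

    import numpy as np
    import pandas as pd

    df_mt = pd.DataFrame(np.random.randn(8, 6),
                         index=pd.date_range('1/1/2000', periods=8),
                         columns=list('ABCDEF'))
    df_mt['foo'] = 'bar'
    # make row 1 entirely NaN for the df1_mt sub-table (its columns are A and B)
    df_mt.loc[df_mt.index[1], ['A', 'B']] = np.nan

    with pd.HDFStore('example_multi.h5', mode='w') as store:
        # dropna=True drops that row from *all* tables, keeping them in sync
        store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None},
                                 df_mt, selector='df1_mt', dropna=True)
        assert store.select('df1_mt').index.equals(store.select('df2_mt').index)

        # query the selector table, pull the matching rows back from both tables
        result = store.select_as_multiple(['df1_mt', 'df2_mt'],
                                          where='A>0 & B>0',
                                          selector='df1_mt')
        print(result)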

doc/source/release.rst

Lines changed: 2 additions & 0 deletions
@@ -95,6 +95,8 @@ pandas 0.13
 
 - ``HDFStore``
 
+  - ``append_to_multiple`` automatically synchronizes writing rows to multiple
+    tables and adds a ``dropna`` kwarg (:issue:`4698`)
   - handle a passed ``Series`` in table format (:issue:`4330`)
   - added an ``is_open`` property to indicate if the underlying file handle is_open;
     a closed store will now report 'CLOSED' when viewing the store (rather than raising an error)

pandas/io/pytables.py

Lines changed: 12 additions & 1 deletion
@@ -786,7 +786,7 @@ def append(self, key, value, format=None, append=True, columns=None, dropna=None
         kwargs = self._validate_format(format, kwargs)
         self._write_to_group(key, value, append=append, dropna=dropna, **kwargs)
 
-    def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, **kwargs):
+    def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, dropna=True, **kwargs):
         """
         Append to multiple tables
 
@@ -798,6 +798,9 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, *
         selector : a string that designates the indexable table; all of its columns will
                    be designed as data_columns, unless data_columns is passed, in which
                    case these are used
+        data_columns : list of columns to create as data columns, or True to use all columns
+        dropna : if evaluates to True, drop rows from all tables if any single
+                 row in each table has all NaN
 
         Notes
         -----
@@ -840,6 +843,14 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, *
         if data_columns is None:
             data_columns = d[selector]
 
+        # ensure rows are synchronized across the tables
+        if dropna:
+            idxs = (value[cols].dropna(how='all').index for cols in d.values())
+            valid_index = next(idxs)
+            for index in idxs:
+                valid_index = valid_index.intersection(index)
+            value = value.ix[valid_index]
+
         # append
         for k, v in d.items():
             dc = data_columns if k == selector else None
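
The hunk above computes, before anything is written, the set of rows that survive in every sub-table. A standalone sketch of that synchronization step, with a made-up frame and table specification ``d``, and ``.loc`` in place of the old ``.ix``:

    import numpy as np
    import pandas as pd

    value = pd.DataFrame(np.random.randn(5, 4), columns=list('ABCD'))
    value.loc[1, ['A', 'B']] = np.nan          # row 1 is all-NaN for table 't1'
    d = {'t1': ['A', 'B'], 't2': ['C', 'D']}   # table name -> column list

    # for each table, keep the index of rows that are not entirely NaN ...
    idxs = (value[cols].dropna(how='all').index for cols in d.values())
    # ... then intersect those indexes and restrict the input frame to the result
    valid_index = next(idxs)
    for index in idxs:
        valid_index = valid_index.intersection(index)
    value = value.loc[valid_index]

    print(value.index)   # row 1 has been dropped for every table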

pandas/io/tests/test_pytables.py

Lines changed: 26 additions & 0 deletions
@@ -2902,6 +2902,32 @@ def test_append_to_multiple(self):
             expected = df[(df.A > 0) & (df.B > 0)]
             tm.assert_frame_equal(result, expected)
 
+    def test_append_to_multiple_dropna(self):
+        df1 = tm.makeTimeDataFrame()
+        df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x)
+        df1.ix[1, ['A', 'B']] = np.nan
+        df = concat([df1, df2], axis=1)
+
+        with ensure_clean(self.path) as store:
+            # dropna=True should guarantee rows are synchronized
+            store.append_to_multiple(
+                {'df1': ['A', 'B'], 'df2': None}, df, selector='df1',
+                dropna=True)
+            result = store.select_as_multiple(['df1', 'df2'])
+            expected = df.dropna()
+            tm.assert_frame_equal(result, expected)
+            tm.assert_index_equal(store.select('df1').index,
+                                  store.select('df2').index)
+
+            # dropna=False shouldn't synchronize row indexes
+            store.append_to_multiple(
+                {'df1': ['A', 'B'], 'df2': None}, df, selector='df1',
+                dropna=False)
+            self.assertRaises(
+                ValueError, store.select_as_multiple, ['df1', 'df2'])
+            assert not store.select('df1').index.equals(
+                store.select('df2').index)
+
     def test_select_as_multiple(self):
 
         df1 = tm.makeTimeDataFrame()
