BUG/ENH: fix performance regression in DataFrame constructor from nested dict with integer indexes, add vbench for it, speed up _stack_dict in internals, GH #621

wesm · wesm · commit 79cc4e049fe2 · 2012-01-12T16:48:08.000-05:00
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3850,6 +3850,8 @@ def _homogenize(data, index, columns, dtype=None):
     if dtype is not None:
         dtype = np.dtype(dtype)
 
+    oindex = None
+
     for k in columns:
         if k not in data:
             # no obvious "empty" int column
@@ -3870,7 +3872,9 @@ def _homogenize(data, index, columns, dtype=None):
                 v = v.reindex(index, copy=False)
         else:
             if isinstance(v, dict):
-                v = lib.fast_multiget(v, index, default=np.nan)
+                if oindex is None:
+                    oindex = index.astype('O')
+                v = lib.fast_multiget(v, oindex, default=np.nan)
 
             v = _sanitize_array(v, index, dtype=dtype, copy=False,
                                 raise_cast_failure=False)
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -953,14 +953,14 @@ def form_blocks(data, axes):
     return blocks
 
 def _simple_blockify(dct, ref_items, dtype):
-    block_items, values = _stack_dict(dct, ref_items)
+    block_items, values = _stack_dict(dct, ref_items, dtype)
     # CHECK DTYPE?
     if values.dtype != dtype: # pragma: no cover
         values = values.astype(dtype)
 
     return make_block(values, block_items, ref_items, do_integrity_check=True)
 
-def _stack_dict(dct, ref_items):
+def _stack_dict(dct, ref_items, dtype):
     from pandas.core.series import Series
 
     # fml
@@ -971,8 +971,23 @@ def _asarray_compat(x):
         else:
             return np.asarray(x)
 
+    def _shape_compat(x):
+        # sparseseries
+        if isinstance(x, Series):
+            return len(x),
+        else:
+            return x.shape
+
     items = [x for x in ref_items if x in dct]
-    stacked = np.vstack([_asarray_compat(dct[k]) for k in items])
+
+    first = dct[items[0]]
+    shape = (len(dct),) + _shape_compat(first)
+
+    stacked = np.empty(shape, dtype=dtype)
+    for i, item in enumerate(items):
+        stacked[i] = _asarray_compat(dct[item])
+
+    # stacked = np.vstack([_asarray_compat(dct[k]) for k in items])
     return items, stacked
 
 def _blocks_to_series_dict(blocks, index=None):
diff --git a/pandas/core/panel.py b/pandas/core/panel.py
@@ -258,7 +258,7 @@ def _init_dict(self, data, axes, dtype=None):
         axes = [items, major, minor]
         reshaped_data = data.copy() # shallow
 
-        item_shape = (1, len(major), len(minor))
+        item_shape = len(major), len(minor)
         for item in items:
             v = values = data.get(item)
             if v is None:
@@ -270,8 +270,8 @@ def _init_dict(self, data, axes, dtype=None):
                     v = v.astype(dtype)
                 values = v.values
 
-            if values.ndim == 2:
-                values = values[None, :, :]
+            # if values.ndim == 2:
+            #     values = values[None, :, :]
 
             reshaped_data[item] = values
 
diff --git a/vb_suite/frame_ctor.py b/vb_suite/frame_ctor.py
@@ -25,3 +25,10 @@
                                     start_date=datetime(2011, 12, 20))
 
 series_ctor_from_dict = Benchmark("Series(some_dict)", setup)
+
+# nested dict, integer indexes, regression described in #621
+
+setup = common_setup + """
+data = dict((i,dict((j,float(j)) for j in xrange(100))) for i in xrange(2000))
+"""
+frame_ctor_nested_dict_int64 = Benchmark("DataFrame(data)", setup)
diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py
@@ -0,0 +1,25 @@
+from vbench.benchmark import Benchmark
+from datetime import datetime
+
+common_setup = """from pandas_vb_common import *
+"""
+
+#----------------------------------------------------------------------
+# lookup
+
+setup = common_setup + """
+df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh'))
+df['foo'] = 'bar'
+
+row_labels = list(df.index[::10])[:900]
+col_labels = list(df.columns) * 100
+row_labels_all = list(df.index) * len(df.columns)
+col_labels_all = list(df.columns) * len(df.index)
+"""
+
+frame_fancy_lookup = Benchmark('df.lookup(row_labels, col_labels)', setup,
+                               start_date=datetime(2012, 1, 12))
+
+frame_fancy_lookup_all = Benchmark('df.lookup(row_labels_all, col_labels_all)',
+                                   setup,
+                                   start_date=datetime(2012, 1, 12))
diff --git a/vb_suite/suite.py b/vb_suite/suite.py
@@ -5,7 +5,8 @@
 
 modules = ['groupby', 'indexing', 'reindex',
            'sparse', 'index_object', 'miscellaneous',
-           'stat_ops', 'join_merge', 'panel_ctor', 'frame_ctor']
+           'stat_ops', 'join_merge', 'panel_ctor', 'frame_ctor',
+           'frame_methods']
 
 by_module = {}
 benchmarks = []