Skip to content

Commit 79cc4e0

Browse files
committed
BUG/ENH: fix performance regression in DataFrame constructor from nested dict with integer indexes, add vbench for it, speed up _stack_dict in internals, GH #621
1 parent 3ed22d7 commit 79cc4e0

File tree

6 files changed

+60
-8
lines changed

6 files changed

+60
-8
lines changed

pandas/core/frame.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3850,6 +3850,8 @@ def _homogenize(data, index, columns, dtype=None):
38503850
if dtype is not None:
38513851
dtype = np.dtype(dtype)
38523852

3853+
oindex = None
3854+
38533855
for k in columns:
38543856
if k not in data:
38553857
# no obvious "empty" int column
@@ -3870,7 +3872,9 @@ def _homogenize(data, index, columns, dtype=None):
38703872
v = v.reindex(index, copy=False)
38713873
else:
38723874
if isinstance(v, dict):
3873-
v = lib.fast_multiget(v, index, default=np.nan)
3875+
if oindex is None:
3876+
oindex = index.astype('O')
3877+
v = lib.fast_multiget(v, oindex, default=np.nan)
38743878

38753879
v = _sanitize_array(v, index, dtype=dtype, copy=False,
38763880
raise_cast_failure=False)

pandas/core/internals.py

Lines changed: 18 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -953,14 +953,14 @@ def form_blocks(data, axes):
953953
return blocks
954954

955955
def _simple_blockify(dct, ref_items, dtype):
956-
block_items, values = _stack_dict(dct, ref_items)
956+
block_items, values = _stack_dict(dct, ref_items, dtype)
957957
# CHECK DTYPE?
958958
if values.dtype != dtype: # pragma: no cover
959959
values = values.astype(dtype)
960960

961961
return make_block(values, block_items, ref_items, do_integrity_check=True)
962962

963-
def _stack_dict(dct, ref_items):
963+
def _stack_dict(dct, ref_items, dtype):
964964
from pandas.core.series import Series
965965

966966
# fml
@@ -971,8 +971,23 @@ def _asarray_compat(x):
971971
else:
972972
return np.asarray(x)
973973

974+
def _shape_compat(x):
975+
# sparseseries
976+
if isinstance(x, Series):
977+
return len(x),
978+
else:
979+
return x.shape
980+
974981
items = [x for x in ref_items if x in dct]
975-
stacked = np.vstack([_asarray_compat(dct[k]) for k in items])
982+
983+
first = dct[items[0]]
984+
shape = (len(dct),) + _shape_compat(first)
985+
986+
stacked = np.empty(shape, dtype=dtype)
987+
for i, item in enumerate(items):
988+
stacked[i] = _asarray_compat(dct[item])
989+
990+
# stacked = np.vstack([_asarray_compat(dct[k]) for k in items])
976991
return items, stacked
977992

978993
def _blocks_to_series_dict(blocks, index=None):

pandas/core/panel.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -258,7 +258,7 @@ def _init_dict(self, data, axes, dtype=None):
258258
axes = [items, major, minor]
259259
reshaped_data = data.copy() # shallow
260260

261-
item_shape = (1, len(major), len(minor))
261+
item_shape = len(major), len(minor)
262262
for item in items:
263263
v = values = data.get(item)
264264
if v is None:
@@ -270,8 +270,8 @@ def _init_dict(self, data, axes, dtype=None):
270270
v = v.astype(dtype)
271271
values = v.values
272272

273-
if values.ndim == 2:
274-
values = values[None, :, :]
273+
# if values.ndim == 2:
274+
# values = values[None, :, :]
275275

276276
reshaped_data[item] = values
277277

vb_suite/frame_ctor.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,3 +25,10 @@
2525
start_date=datetime(2011, 12, 20))
2626

2727
series_ctor_from_dict = Benchmark("Series(some_dict)", setup)
28+
29+
# nested dict, integer indexes, regression described in #621
30+
31+
setup = common_setup + """
32+
data = dict((i,dict((j,float(j)) for j in xrange(100))) for i in xrange(2000))
33+
"""
34+
frame_ctor_nested_dict_int64 = Benchmark("DataFrame(data)", setup)

vb_suite/frame_methods.py

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,25 @@
1+
from vbench.benchmark import Benchmark
2+
from datetime import datetime
3+
4+
common_setup = """from pandas_vb_common import *
5+
"""
6+
7+
#----------------------------------------------------------------------
8+
# lookup
9+
10+
setup = common_setup + """
11+
df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh'))
12+
df['foo'] = 'bar'
13+
14+
row_labels = list(df.index[::10])[:900]
15+
col_labels = list(df.columns) * 100
16+
row_labels_all = list(df.index) * len(df.columns)
17+
col_labels_all = list(df.columns) * len(df.index)
18+
"""
19+
20+
frame_fancy_lookup = Benchmark('df.lookup(row_labels, col_labels)', setup,
21+
start_date=datetime(2012, 1, 12))
22+
23+
frame_fancy_lookup_all = Benchmark('df.lookup(row_labels_all, col_labels_all)',
24+
setup,
25+
start_date=datetime(2012, 1, 12))

vb_suite/suite.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,8 @@
55

66
modules = ['groupby', 'indexing', 'reindex',
77
'sparse', 'index_object', 'miscellaneous',
8-
'stat_ops', 'join_merge', 'panel_ctor', 'frame_ctor']
8+
'stat_ops', 'join_merge', 'panel_ctor', 'frame_ctor',
9+
'frame_methods']
910

1011
by_module = {}
1112
benchmarks = []

0 commit comments

Comments
 (0)