From 120c4c513feb5318eacbcf1133c8cdadf4dd4bac Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 28 Jan 2014 01:09:47 +0200 Subject: [PATCH 1/6] CLN: add comments in indexing code --- pandas/core/indexing.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 780ad57ed8f13..27dfea5a6b613 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -61,8 +61,7 @@ def _get_label(self, label, axis=0): return self.obj[label] elif (isinstance(label, tuple) and isinstance(label[axis], slice)): - - raise IndexingError('no slices here') + raise IndexingError('no slices here, handle elsewhere') try: return self.obj._xs(label, axis=axis, copy=False) @@ -677,13 +676,14 @@ def _getitem_lowerdim(self, tup): # a bit kludgy if isinstance(ax0, MultiIndex): try: + # fast path for series or for tup devoid of slices return self._get_label(tup, axis=0) except TypeError: # slices are unhashable pass except Exception as e1: if isinstance(tup[0], (slice, Index)): - raise IndexingError + raise IndexingError("Handle elsewhere") # raise the error if we are not sorted if not ax0.is_lexsorted_for_tuple(tup): @@ -694,7 +694,7 @@ def _getitem_lowerdim(self, tup): raise e1 if len(tup) > self.obj.ndim: - raise IndexingError + raise IndexingError("Too many indexers. 
handle elsewhere") # to avoid wasted computation # df.ix[d1:d2, 0] -> columns first (True) @@ -707,9 +707,9 @@ def _getitem_lowerdim(self, tup): if not _is_list_like(section): return section - # might have been a MultiIndex elif section.ndim == self.ndim: - + # we're in the middle of slicing through a MultiIndex + # revise the key wrt `section` by inserting an _NS new_key = tup[:i] + (_NS,) + tup[i + 1:] else: @@ -725,6 +725,7 @@ def _getitem_lowerdim(self, tup): if len(new_key) == 1: new_key, = new_key + # This is an elided recursive call to iloc/loc/etc. return getattr(section, self.name)[new_key] raise IndexingError('not applicable') From 6923ec14486384fbe77312bbc10b826d14c9ef01 Mon Sep 17 00:00:00 2001 From: y-p Date: Mon, 27 Jan 2014 19:07:22 +0200 Subject: [PATCH 2/6] CLN: comment out possibly stale kludge fix and wait for explosion --- pandas/core/indexing.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 27dfea5a6b613..f2d94f11552a3 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -688,10 +688,17 @@ def _getitem_lowerdim(self, tup): # raise the error if we are not sorted if not ax0.is_lexsorted_for_tuple(tup): raise e1 - try: - loc = ax0.get_loc(tup[0]) - except KeyError: - raise e1 + + # GH911 introduced this clause, but the regression test + # added for it now passes even without it. Let's rock the boat. + # 2014/01/27 + + # # should we abort, or keep going? + # try: + # loc = ax0.get_loc(tup[0]) + # except KeyError: + # raise e1 + if len(tup) > self.obj.ndim: raise IndexingError("Too many indexers. 
handle elsewhere") From 7070dd1c062bef64637d645922ddf91c6a45592a Mon Sep 17 00:00:00 2001 From: y-p Date: Mon, 27 Jan 2014 20:59:01 +0200 Subject: [PATCH 3/6] CLN: Mark if clause for handling of per-axis tuple indexing with loc --- pandas/core/indexing.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index f2d94f11552a3..ec40cca3eb88c 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1156,6 +1156,11 @@ def _getitem_axis(self, key, axis=0): raise ValueError('Cannot index with multidimensional key') return self._getitem_iterable(key, axis=axis) + elif isinstance(key, tuple) and isinstance(labels, MultiIndex) and \ + any([isinstance(x,slice) for x in key]): + # here we handle a tuple for indexing from an axis + # if it wasn't handled previously, it must have slices + raise NotImplementedError('Not yet implemented axis indexing with tuple containing slices') else: self._has_valid_type(key, axis) return self._get_label(key, axis=axis) From e37568491c04d8ab4584c0a7f28a703932c7b96b Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 28 Jan 2014 00:49:48 +0200 Subject: [PATCH 4/6] ENH: support per-axis, per-level indexing with loc[] --- pandas/core/indexing.py | 164 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 161 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ec40cca3eb88c..272fcedf16bb0 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1158,9 +1158,12 @@ def _getitem_axis(self, key, axis=0): return self._getitem_iterable(key, axis=axis) elif isinstance(key, tuple) and isinstance(labels, MultiIndex) and \ any([isinstance(x,slice) for x in key]): - # here we handle a tuple for indexing from an axis - # if it wasn't handled previously, it must have slices - raise NotImplementedError('Not yet implemented axis indexing with tuple containing slices') + # handle per-axis tuple containing label criteria for + # each level 
(or a prefix of levels), may contain + # (None) slices, list of labels or labels + specs = _tuple_to_mi_locs(labels,key) + g = _spec_to_array_indices(labels, specs) + return self.obj.iloc[list(g)] else: self._has_valid_type(key, axis) return self._get_label(key, axis=axis) @@ -1524,3 +1527,158 @@ def _maybe_droplevels(index, key): pass return index + +def _tuple_to_mi_locs(ix,tup): + """Convert a tuple of slices/label lists/labels to a level-wise spec + + Parameters + ---------- + ix: a sufficiently lexsorted, unique/non-dupe MultiIndex. + tup: a tuple of slices, labels or lists of labels. + slice(None) is acceptable, and the case of len(tup) smaller than + ix.nlevels means all labels of the omitted trailing levels are included. + + Returns + ------- + a list with one entry per level of ix; each entry is either a + (start, stop) 2-tuple or a list of label positions within that level. + + Example (This is *not* a doctest): + >>> mi = pd.MultiIndex.from_product([['A0', 'A1', 'A2'],['B0', 'B1']]) + >>> for x in mi.get_values(): print(x) + ('A0', 'B0') + ('A0', 'B1') + ('A1', 'B0') + ('A1', 'B1') + ('A2', 'B0') + ('A2', 'B1') + >>> _tuple_to_mi_locs(mi,(slice('A0','A2'),['B0', 'B1'])) + [(0, 2), [0, 1]] + + read as: + - All labels in position [0,2) in first level + - for each of those, all labels at positions 0 or 1. + + The same effective result can be achieved by specifying the None Slice, + or omitting it completely. Note the tuple (0,2) has replaced the list [0, 1], + but the outcome is the same. 
+ + >>> _tuple_to_mi_locs(mi,(slice('A0','A2'),slice(None))) + [(0, 2), (0,2)] + + >>> _tuple_to_mi_locs(mi,(slice('A0','A2'),)) + [(0, 2), (0,2)] + + """ + + + ranges = [] + + # ix must be lexsorted to at least as many levels + # as there are elements in `tup` + assert ix.is_lexsorted_for_tuple(tup) + assert ix.is_unique + assert isinstance(ix,MultiIndex) + + for i,k in enumerate(tup): + level = ix.levels[i] + + if _is_list_like(k): + # a collection of labels to include from this level + ranges.append([level.get_loc(x) for x in k]) + continue + if k == slice(None): + start = 0 + stop = len(level) + elif isinstance(k,slice): + start = level.get_loc(k.start) + stop = len(level) + if k.stop: + stop = level.get_loc(k.stop) + else: + # a single label + start = level.get_loc(k) + stop = start + + ranges.append((start,stop)) + + for i in range(i+1,len(ix.levels)): + # omitting trailing dims + # means include all values + level = ix.levels[i] + start = 0 + stop = len(level) + ranges.append((start,stop)) + + return ranges + +def _spec_to_array_indices(ix, specs): + """Convert a level-wise spec to row positions relative to the index + + Parameters + ---------- + ix: a sufficiently lexsorted, unique/non-dupe MultiIndex. + specs: a list of 2-tuples/list of label positions. Specifically, the + output of _tuple_to_mi_locs. + len(specs) must match ix.nlevels. + + Returns + ------- + a generator of row positions relative to ix, corresponding to specs. + Suitable for usage with `iloc`. 
+ + Example (This is *not* a doctest): + >>> mi = pd.MultiIndex.from_product([['A0', 'A1', 'A2'],['B0', 'B1']]) + >>> for x in mi.get_values(): print(x) + ('A0', 'B0') + ('A0', 'B1') + ('A1', 'B0') + ('A1', 'B1') + ('A2', 'B0') + ('A2', 'B1') + + >>> specs = _tuple_to_mi_locs(mi,(slice('A0','A2'),['B0', 'B1'])) + >>> list(_spec_to_array_indices(mi, specs)) + [0, 1, 2, 3] + + Which are all the labels having 'A0' to 'A2' (non-inclusive) at level=0 + and 'B0' or 'B1' at level=1 + + """ + assert ix.is_lexsorted_for_tuple(specs) + assert len(specs) == ix.nlevels + assert ix.is_unique + assert isinstance(ix,MultiIndex) + + # step size/increment for iteration at each level + giant_steps = np.cumprod(ix.levshape[::-1])[::-1] + giant_steps[:-1] = giant_steps[1:] + giant_steps[-1] = 1 + + def _iter(specs, i=0): + step_size = giant_steps[i] + spec=specs[i] + if isinstance(spec,tuple): + # tuples are 2-tuples of (start,stop) label indices to include + valrange = compat.range(*spec) + elif isinstance(spec,list): + # lists are discrete label indices to include + valrange = spec + + if len(specs)-1 == i: + # base case + for v in valrange: + yield v + else: + for base in valrange: + base *= step_size + for v in _iter(specs,i+1): + yield base + v + # validate + + return _iter(specs) From 6635b26d4d4187cf855f909ccd3e2a4977c993c5 Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 28 Jan 2014 03:46:19 +0200 Subject: [PATCH 5/6] PERF: vectorize _spec_to_array_indices, for 3-4x speedup --- pandas/core/indexing.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 272fcedf16bb0..04b2f88086054 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1660,7 +1660,26 @@ def _spec_to_array_indices(ix, specs): giant_steps[:-1] = giant_steps[1:] giant_steps[-1] = 1 - def _iter(specs, i=0): + def _iter_vectorize(specs, i=0): + step_size = giant_steps[i] + spec=specs[i] + if 
isinstance(spec,tuple): + # tuples are 2-tuples of (start,stop) label indices to include + valrange = compat.range(*spec) + elif isinstance(spec,list): + # lists are discrete label indices to include + valrange = spec + + if len(specs)-1 == i: + return np.array(valrange) + else: + tmpl = np.array([v for v in _iter_vectorize(specs,i+1)]) + res=np.tile(tmpl,(len(valrange),1)) + steps=(np.array(valrange)*step_size).reshape((len(valrange),1)) + return (res+steps).flatten() + + + def _iter_generator(specs, i=0): step_size = giant_steps[i] spec=specs[i] if isinstance(spec,tuple): @@ -1677,8 +1696,8 @@ def _iter(specs, i=0): else: for base in valrange: base *= step_size - for v in _iter(specs,i+1): + for v in _iter_generator(specs,i+1): yield base + v # validate - return _iter(specs) + return _iter_vectorize(specs) From 6bee17a7db4455dfe7f6ac7f889f001f8a29ea3d Mon Sep 17 00:00:00 2001 From: y-p Date: Tue, 28 Jan 2014 04:22:31 +0200 Subject: [PATCH 6/6] PERF: remove no longer needed list conversion. 1.4x speedup --- pandas/core/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 04b2f88086054..4daf8031e79ea 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1163,7 +1163,7 @@ def _getitem_axis(self, key, axis=0): # (None) slices, list of labels or labels specs = _tuple_to_mi_locs(labels,key) g = _spec_to_array_indices(labels, specs) - return self.obj.iloc[list(g)] + return self.obj.iloc[g] else: self._has_valid_type(key, axis) return self._get_label(key, axis=axis)