From 3b680bca9dc833293c5a8ef0ebc2dfa78679bdc3 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Tue, 22 May 2018 22:42:03 +0200 Subject: [PATCH 1/4] REF: deduplicate _NDFrameIndexer._multi_take code --- pandas/core/frame.py | 3 +- pandas/core/indexing.py | 209 +++++++++++++++++++++------------------- 2 files changed, 113 insertions(+), 99 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 02c86d2f4dcc8..383f129a713ed 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2723,7 +2723,8 @@ def _getitem_array(self, key): indexer = key.nonzero()[0] return self._take(indexer, axis=0) else: - indexer = self.loc._convert_to_indexer(key, axis=1) + indexer = self.loc._convert_to_indexer(key, axis=1, + raise_missing=True) return self._take(indexer, axis=1) def _getitem_multilevel(self, key): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 0e4f040253560..aa8de9d2baad8 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -925,33 +925,10 @@ def _multi_take(self, tup): """ create the reindex map for our objects, raise the _exception if we can't create the indexer """ - try: - o = self.obj - d = {} - for key, axis in zip(tup, o._AXIS_ORDERS): - ax = o._get_axis(axis) - # Have the index compute an indexer or return None - # if it cannot handle: - indexer, keyarr = ax._convert_listlike_indexer(key, - kind=self.name) - # We only act on all found values: - if indexer is not None and (indexer != -1).all(): - self._validate_read_indexer(key, indexer, axis) - d[axis] = (ax[indexer], indexer) - continue - - # If we are trying to get actual keys from empty Series, we - # patiently wait for a KeyError later on - otherwise, convert - if len(ax) or not len(key): - key = self._convert_for_reindex(key, axis) - indexer = ax.get_indexer_for(key) - keyarr = ax.reindex(keyarr)[0] - self._validate_read_indexer(keyarr, indexer, - o._get_axis_number(axis)) - d[axis] = (keyarr, indexer) - return o._reindex_with_indexers(d, 
copy=True, allow_dups=True) - except (KeyError, IndexingError) as detail: - raise self._exception(detail) + o = self.obj + d = {axis: self._get_listlike_indexer(key, axis) + for (key, axis) in zip(tup, o._AXIS_ORDERS)} + return o._reindex_with_indexers(d, copy=True, allow_dups=True) def _convert_for_reindex(self, key, axis=None): return key @@ -1124,63 +1101,110 @@ def _getitem_axis(self, key, axis=None): return self._get_label(key, axis=axis) - def _getitem_iterable(self, key, axis=None): - if axis is None: - axis = self.axis or 0 + def _get_listlike_indexer(self, key, axis, raise_missing=False): + """ + Transform a list-like of keys into a new index and an indexer. - self._validate_key(key, axis) + Parameters + ---------- + key : list-like + Target labels + axis: int + Dimension on which the indexing is being made + raise_missing: bool + Whether to raise a KeyError if some labels are not found. Will be + removed in the future, and then this method will always behave as + if raise_missing=True. - labels = self.obj._get_axis(axis) + Raises + ------ + KeyError + If at least one key was requested but none was found, and + raise_missing=True. 
- if com.is_bool_indexer(key): - key = check_bool_indexer(labels, key) - inds, = key.nonzero() - return self.obj._take(inds, axis=axis) - else: + Returns + ------- + keyarr: Index + New index (coinciding with 'key' if the axis is unique) + values : array-like + An indexer for the return object; -1 denotes keys not found + """ + o = self.obj + ax = o._get_axis(axis) + try: # Have the index compute an indexer or return None - # if it cannot handle; we only act on all found values - indexer, keyarr = labels._convert_listlike_indexer( - key, kind=self.name) + # if it cannot handle: + indexer, keyarr = ax._convert_listlike_indexer(key, + kind=self.name) + # We only act on all found values: if indexer is not None and (indexer != -1).all(): - self._validate_read_indexer(key, indexer, axis) - return self.obj.take(indexer, axis=axis) + self._validate_read_indexer(key, indexer, axis, + raise_missing=raise_missing) + return ax[indexer], indexer - ax = self.obj._get_axis(axis) - # existing labels are unique and indexer are unique - if labels.is_unique and Index(keyarr).is_unique: + if ax.is_unique: + # If we are trying to get actual keys from empty Series, we + # patiently wait for a KeyError later on - otherwise, convert + if len(ax) or not len(key): + key = self._convert_for_reindex(key, axis) indexer = ax.get_indexer_for(key) - self._validate_read_indexer(key, indexer, axis) + keyarr = ax.reindex(keyarr)[0] + else: + keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr) - d = {axis: [ax.reindex(keyarr)[0], indexer]} - return self.obj._reindex_with_indexers(d, copy=True, - allow_dups=True) + self._validate_read_indexer(keyarr, indexer, + o._get_axis_number(axis), + raise_missing=raise_missing) + return keyarr, indexer + except (KeyError, IndexingError) as detail: + raise self._exception(detail) - # existing labels are non-unique - else: + def _getitem_iterable(self, key, axis=None): + """ + Index current object with an iterable key (which can be a boolean + 
indexer, or a collection of keys). - # reindex with the specified axis - if axis + 1 > self.obj.ndim: - raise AssertionError("invalid indexing error with " - "non-unique index") + Parameters + ---------- + key : iterable + Target labels, or boolean indexer + axis: int, default None + Dimension on which the indexing is being made - new_target, indexer, new_indexer = labels._reindex_non_unique( - keyarr) + Raises + ------ + KeyError + If no key was found. Will change in the future to raise if not all + keys were found. + IndexingError + If the boolean indexer is unalignable with the object being + indexed. - if new_indexer is not None: - result = self.obj._take(indexer[indexer != -1], axis=axis) + Returns + ------- + scalar, DataFrame, or Series: indexed value(s), + """ - self._validate_read_indexer(key, new_indexer, axis) - result = result._reindex_with_indexers( - {axis: [new_target, new_indexer]}, - copy=True, allow_dups=True) + if axis is None: + axis = self.axis or 0 - else: - self._validate_read_indexer(key, indexer, axis) - result = self.obj._take(indexer, axis=axis) + self._validate_key(key, axis) - return result + labels = self.obj._get_axis(axis) + + if com.is_bool_indexer(key): + # A boolean indexer + key = check_bool_indexer(labels, key) + inds, = key.nonzero() + return self.obj._take(inds, axis=axis) + else: + # A collection of keys + keyarr, indexer = self._get_listlike_indexer(key, axis, + raise_missing=False) + return self.obj._reindex_with_indexers({axis: [keyarr, indexer]}, + copy=True, allow_dups=True) - def _validate_read_indexer(self, key, indexer, axis): + def _validate_read_indexer(self, key, indexer, axis, raise_missing=False): """ Check that indexer can be used to return a result (e.g. at least one element was found, unless the list of keys was actually empty). 
@@ -1193,11 +1217,16 @@ def _validate_read_indexer(self, key, indexer, axis): Indices corresponding to the key (with -1 indicating not found) axis: int Dimension on which the indexing is being made + raise_missing: bool + Whether to raise a KeyError if some labels are not found. Will be + removed in the future, and then this method will always behave as + if raise_missing=True. Raises ------ KeyError - If at least one key was requested none was found. + If at least one key was requested but none was found, and + raise_missing=True. """ ax = self.obj._get_axis(axis) @@ -1214,6 +1243,12 @@ def _validate_read_indexer(self, key, indexer, axis): u"None of [{key}] are in the [{axis}]".format( key=key, axis=self.obj._get_axis_name(axis))) + # We (temporarily) allow for some missing keys with .loc, except in + # some cases (e.g. setting) in which "raise_missing" will be True + if not(self.name == 'loc' and not raise_missing): + not_found = list(set(key) - set(ax)) + raise KeyError("{} not in index".format(not_found)) + # we skip the warning on Categorical/Interval # as this check is actually done (check for # non-missing values), but a bit later in the @@ -1229,9 +1264,10 @@ def _validate_read_indexer(self, key, indexer, axis): if not (ax.is_categorical() or ax.is_interval()): warnings.warn(_missing_key_warning, - FutureWarning, stacklevel=5) + FutureWarning, stacklevel=6) - def _convert_to_indexer(self, obj, axis=None, is_setter=False): + def _convert_to_indexer(self, obj, axis=None, is_setter=False, + raise_missing=False): """ Convert indexing key into something we can use to do actual fancy indexing on an ndarray @@ -1310,33 +1346,10 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False): inds, = obj.nonzero() return inds else: - - # Have the index compute an indexer or return None - # if it cannot handle - indexer, objarr = labels._convert_listlike_indexer( - obj, kind=self.name) - if indexer is not None: - return indexer - - # unique index - if 
labels.is_unique: - indexer = check = labels.get_indexer(objarr) - - # non-unique (dups) - else: - (indexer, - missing) = labels.get_indexer_non_unique(objarr) - # 'indexer' has dupes, create 'check' using 'missing' - check = np.zeros(len(objarr), dtype=np.intp) - check[missing] = -1 - - mask = check == -1 - if mask.any(): - raise KeyError('{mask} not in index' - .format(mask=objarr[mask])) - - return com._values_from_object(indexer) - + # When setting, missing keys are not allowed, even with .loc: + kwargs = {'raise_missing': True if is_setter else + raise_missing} + return self._get_listlike_indexer(obj, axis, **kwargs)[1] else: try: return labels.get_loc(obj) From 9bcbcfee15e92eda0855358cf0309aadd916a5ee Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Fri, 15 Jun 2018 21:45:29 +0200 Subject: [PATCH 2/4] BUG: handling of missing values in Index._reindex_non_unique with non unique target --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6a56278b0da49..ccecb6d4d0713 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3627,7 +3627,7 @@ def _reindex_non_unique(self, target): else: # need to retake to have the same size as the indexer - indexer[~check] = 0 + indexer[~check] = -1 # reset the new indexer to account for the new size new_indexer = np.arange(len(self.take(indexer))) From 2a64630762158578f28927eefdbac2622225a957 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Fri, 15 Jun 2018 21:49:49 +0200 Subject: [PATCH 3/4] CLN: transform lambda into def --- pandas/core/indexing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index aa8de9d2baad8..ad538e2b85169 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -688,7 +688,8 @@ def _align_series(self, indexer, ser, multiindex_indexer=False): if isinstance(indexer, tuple): # flatten 
np.ndarray indexers - ravel = lambda i: i.ravel() if isinstance(i, np.ndarray) else i + def ravel(i): + return i.ravel() if isinstance(i, np.ndarray) else i indexer = tuple(map(ravel, indexer)) aligners = [not com.is_null_slice(idx) for idx in indexer] From 2b07ab0f91f349875e342cbe495f583177d57aa2 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Tue, 19 Jun 2018 11:08:02 +0200 Subject: [PATCH 4/4] CLN: remove unneeded try... except --- pandas/core/indexing.py | 48 ++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ad538e2b85169..d5e81105dd323 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1132,33 +1132,31 @@ def _get_listlike_indexer(self, key, axis, raise_missing=False): """ o = self.obj ax = o._get_axis(axis) - try: - # Have the index compute an indexer or return None - # if it cannot handle: - indexer, keyarr = ax._convert_listlike_indexer(key, - kind=self.name) - # We only act on all found values: - if indexer is not None and (indexer != -1).all(): - self._validate_read_indexer(key, indexer, axis, - raise_missing=raise_missing) - return ax[indexer], indexer - - if ax.is_unique: - # If we are trying to get actual keys from empty Series, we - # patiently wait for a KeyError later on - otherwise, convert - if len(ax) or not len(key): - key = self._convert_for_reindex(key, axis) - indexer = ax.get_indexer_for(key) - keyarr = ax.reindex(keyarr)[0] - else: - keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr) - self._validate_read_indexer(keyarr, indexer, - o._get_axis_number(axis), + # Have the index compute an indexer or return None + # if it cannot handle: + indexer, keyarr = ax._convert_listlike_indexer(key, + kind=self.name) + # We only act on all found values: + if indexer is not None and (indexer != -1).all(): + self._validate_read_indexer(key, indexer, axis, raise_missing=raise_missing) - return keyarr, indexer - 
except (KeyError, IndexingError) as detail: - raise self._exception(detail) + return ax[indexer], indexer + + if ax.is_unique: + # If we are trying to get actual keys from empty Series, we + # patiently wait for a KeyError later on - otherwise, convert + if len(ax) or not len(key): + key = self._convert_for_reindex(key, axis) + indexer = ax.get_indexer_for(key) + keyarr = ax.reindex(keyarr)[0] + else: + keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr) + + self._validate_read_indexer(keyarr, indexer, + o._get_axis_number(axis), + raise_missing=raise_missing) + return keyarr, indexer def _getitem_iterable(self, key, axis=None): """