BUG: Fix a bug when indexing np.nan via loc/iloc (GH5016) #5018

Merged
merged 1 commit on Sep 28, 2013
1 change: 1 addition & 0 deletions doc/source/release.rst
@@ -497,6 +497,7 @@ Bug Fixes
- Fixed wrong index name during read_csv if using usecols. Applies to c parser only. (:issue:`4201`)
- ``Timestamp`` objects can now appear in the left hand side of a comparison
operation with a ``Series`` or ``DataFrame`` object (:issue:`4982`).
- Fix a bug when indexing with ``np.nan`` via ``iloc/loc`` (:issue:`5016`)

pandas 0.12.0
-------------
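For context, a minimal repro of GH5016 as exercised by the tests below (the labels are illustrative):

```python
import numpy as np
import pandas as pd

# a frame whose columns carry a single NaN label (GH5016)
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=[1.1, 2.2, np.nan])

# after this fix, both of these resolve to the NaN-labeled column
s1 = df.iloc[:, 2]        # positional access
s2 = df.loc[:, np.nan]    # label access with a null key

# multiple NaN labels stay ambiguous and raise:
# pd.DataFrame([[0, 1], [2, 3]], columns=[np.nan, np.nan]).loc[:, np.nan]
#   -> ValueError: cannot label index with a null key
```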
2 changes: 1 addition & 1 deletion pandas/core/index.py
@@ -424,7 +424,7 @@ def _convert_scalar_indexer(self, key, typ=None):
def to_int():
ikey = int(key)
if ikey != key:
self._convert_indexer_error(key, 'label')
return self._convert_indexer_error(key, 'label')
return ikey

if typ == 'iloc':
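A note on the `to_int` change: `_convert_indexer_error` is expected to raise, but without the added `return`, any path on which it did not raise would fall through and hand back the truncated `ikey`. The sketch below (with a hypothetical helper standing in for `_convert_indexer_error`) shows what the guard actually catches:

```python
def to_int(key):
    # hypothetical stand-in for Index._convert_indexer_error
    def convert_indexer_error(key, kind):
        raise TypeError('cannot convert %r to a %s indexer' % (key, kind))

    ikey = int(key)    # int(float('nan')) raises ValueError before the check
    if ikey != key:    # catches non-integral floats such as 3.5
        return convert_indexer_error(key, 'label')
    return ikey

print(to_int(3.0))       # -> 3
# to_int(3.5)            # -> TypeError from the helper
# to_int(float('nan'))   # -> ValueError: cannot convert float NaN to integer
```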
12 changes: 10 additions & 2 deletions pandas/core/indexing.py
@@ -1,12 +1,12 @@
# pylint: disable=W0223

from datetime import datetime
from pandas.core.common import _asarray_tuplesafe, is_list_like
from pandas.core.index import Index, MultiIndex, _ensure_index
from pandas.compat import range, zip
import pandas.compat as compat
import pandas.core.common as com
from pandas.core.common import (_is_bool_indexer, is_integer_dtype,
_asarray_tuplesafe, is_list_like, isnull,
ABCSeries, ABCDataFrame, ABCPanel)
import pandas.lib as lib

@@ -979,12 +979,20 @@ def _has_valid_type(self, key, axis):
else:

def error():
if isnull(key):
raise ValueError("cannot use label indexing with a null key")
raise KeyError("the label [%s] is not in the [%s]" % (key,self.obj._get_axis_name(axis)))

key = self._convert_scalar_indexer(key, axis)
try:
key = self._convert_scalar_indexer(key, axis)
if not key in ax:
error()
except (TypeError) as e:

# python 3 type errors should be raised
if 'unorderable' in str(e): # pragma: no cover
error()
raise
except:
error()

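With this change every failure of the containment check funnels into `error()`, which now distinguishes null keys from ordinary missing labels. A small sketch of the resulting behavior, assuming an index that does not contain NaN:

```python
import numpy as np
import pandas as pd

s = pd.Series([1, 2, 3], index=['a', 'b', 'c'])

# a missing ordinary label still raises KeyError:
#   s.loc['z']     -> KeyError: the label [z] is not in the [index]

# a null key absent from the axis now gets the dedicated error:
#   s.loc[np.nan]  -> ValueError: cannot use label indexing with a null key
```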
39 changes: 35 additions & 4 deletions pandas/core/internals.py
@@ -97,8 +97,13 @@ def ref_locs(self):
indexer = self.ref_items.get_indexer(self.items)
indexer = com._ensure_platform_int(indexer)
if (indexer == -1).any():
raise AssertionError('Some block items were not in block '
'ref_items')

# this means that we have nan's in our block
try:
indexer[indexer == -1] = np.arange(len(self.items))[isnull(self.items)]
except:
raise AssertionError('Some block items were not in block '
'ref_items')

self._ref_locs = indexer
return self._ref_locs
@@ -2500,9 +2505,18 @@ def _consolidate_inplace(self):

def get(self, item):
if self.items.is_unique:

if isnull(item):
indexer = np.arange(len(self.items))[isnull(self.items)]
return self.get_for_nan_indexer(indexer)

_, block = self._find_block(item)
return block.get(item)
else:

if isnull(item):
raise ValueError("cannot label index with a null key")

indexer = self.items.get_loc(item)
ref_locs = np.array(self._set_ref_locs())

@@ -2528,14 +2542,31 @@ def get(self, item):

def iget(self, i):
item = self.items[i]

# unique
if self.items.is_unique:
return self.get(item)
if notnull(item):
return self.get(item)
return self.get_for_nan_indexer(i)

# compute the duplicative indexer if needed
ref_locs = self._set_ref_locs()
b, loc = ref_locs[i]
return b.iget(loc)

def get_for_nan_indexer(self, indexer):

# allow a single nan location indexer
if not np.isscalar(indexer):
if len(indexer) == 1:
indexer = indexer.item()
else:
raise ValueError("cannot label index with a null key")

# take a nan indexer and return the values
ref_locs = self._set_ref_locs(do_refs='force')
b, loc = ref_locs[indexer]
return b.iget(loc)

def get_scalar(self, tup):
"""
Retrieve single item
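The `ref_locs` repair relies on the fact that the labels `get_indexer` fails to find (the `-1` entries) are exactly the null labels, whose positions an `isnull` mask recovers. A minimal numpy sketch of that idea, outside of pandas internals:

```python
import numpy as np

items = np.array([np.nan, 1.1, 2.2], dtype=object)  # block items, one NaN label
indexer = np.array([-1, 1, 2])  # get_indexer result; -1 means "not found"

# isnull for this sketch: NaN is the only value not equal to itself
null_mask = np.array([x != x for x in items])

# slot the positions of the null labels into the failed (-1) entries
indexer[indexer == -1] = np.arange(len(items))[null_mask]
print(indexer)  # [0 1 2]: every block item now has a ref location
```

`get_for_nan_indexer` then applies the same mask in the other direction: a single matching position is resolved through the (forced) ref_locs, while multiple NaN labels are rejected as ambiguous.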
2 changes: 1 addition & 1 deletion pandas/core/series.py
@@ -1053,10 +1053,10 @@ def __setitem__(self, key, value):
except TypeError as e:
if isinstance(key, tuple) and not isinstance(self.index, MultiIndex):
raise ValueError("Can only tuple-index with a MultiIndex")

# python 3 type errors should be raised
if 'unorderable' in str(e): # pragma: no cover
raise IndexError(key)
# Could not hash item

if _is_bool_indexer(key):
key = _check_bool_indexer(self.index, key)
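Both this handler and the `_has_valid_type` change above sniff the exception text for `'unorderable'`, the message Python 3 (through 3.5) produces when ordering unrelated types; later versions word it differently, e.g.:

```python
# Python 3 refuses ordering comparisons between unrelated types
try:
    'a' < 1.5
except TypeError as e:
    print(e)  # 3.0-3.5: "unorderable types: str() < float()"
              # 3.6+:   "'<' not supported between instances of 'str' and 'float'"
```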
16 changes: 16 additions & 0 deletions pandas/hashtable.pyx
@@ -643,6 +643,8 @@ cdef class Float64HashTable(HashTable):

return uniques.to_array()

na_sentinel = object

cdef class PyObjectHashTable(HashTable):
# cdef kh_pymap_t *table

@@ -660,6 +662,8 @@ cdef class PyObjectHashTable(HashTable):
def __contains__(self, object key):
cdef khiter_t k
hash(key)
if key != key or key is None:
key = na_sentinel
k = kh_get_pymap(self.table, <PyObject*>key)
return k != self.table.n_buckets

@@ -669,6 +673,8 @@

cpdef get_item(self, object val):
cdef khiter_t k
if val != val or val is None:
val = na_sentinel
k = kh_get_pymap(self.table, <PyObject*>val)
if k != self.table.n_buckets:
return self.table.vals[k]
@@ -677,6 +683,8 @@

def get_iter_test(self, object key, Py_ssize_t iterations):
cdef Py_ssize_t i, val
if key != key or key is None:
key = na_sentinel
for i in range(iterations):
k = kh_get_pymap(self.table, <PyObject*>key)
if k != self.table.n_buckets:
@@ -689,6 +697,8 @@
char* buf

hash(key)
if key != key or key is None:
key = na_sentinel
k = kh_put_pymap(self.table, <PyObject*>key, &ret)
# self.table.keys[k] = key
if kh_exist_pymap(self.table, k):
@@ -706,6 +716,9 @@
for i in range(n):
val = values[i]
hash(val)
if val != val or val is None:
val = na_sentinel

k = kh_put_pymap(self.table, <PyObject*>val, &ret)
self.table.vals[k] = i

@@ -720,6 +733,9 @@
for i in range(n):
val = values[i]
hash(val)
if val != val or val is None:
val = na_sentinel

k = kh_get_pymap(self.table, <PyObject*>val)
if k != self.table.n_buckets:
locs[i] = self.table.vals[k]
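All of the hashtable changes follow one pattern: since `nan != nan`, each NaN key would otherwise hash to its own slot and never be found again, so NaN and `None` are normalized to the shared `na_sentinel` (the PR reuses the `object` type itself as that singleton) before every khash call. A plain-Python sketch of the idea, with a dict standing in for the C hash table:

```python
import numpy as np

NA_SENTINEL = object  # mirrors na_sentinel above: the type object is a singleton

def normalize(key):
    # nan is the only value for which key != key; fold None in as well,
    # so every null key probes the same hash slot
    if key is None or key != key:
        return NA_SENTINEL
    return key

table = {}                                   # stands in for the khash pymap
table[normalize(np.nan)] = 0                 # set_item
assert table[normalize(float('nan'))] == 0   # get_item: any NaN finds the slot
```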
37 changes: 37 additions & 0 deletions pandas/tests/test_frame.py
@@ -642,6 +642,8 @@ def test_setitem_clear_caches(self):
def test_setitem_None(self):
# GH #766
self.frame[None] = self.frame['A']
assert_series_equal(self.frame.iloc[:,-1], self.frame['A'])
assert_series_equal(self.frame.loc[:,None], self.frame['A'])
assert_series_equal(self.frame[None], self.frame['A'])
repr(self.frame)

@@ -4475,6 +4477,41 @@ def test_constructor_lists_to_object_dtype(self):
self.assert_(d['a'].dtype == np.object_)
self.assert_(d['a'][1] is False)

def test_constructor_with_nas(self):
# GH 5016
# NaNs in indices

def check(df):
for i in range(len(df.columns)):
df.iloc[:,i]

# allow single nans to succeed
indexer = np.arange(len(df.columns))[isnull(df.columns)]

if len(indexer) == 1:
assert_series_equal(df.iloc[:,indexer[0]],df.loc[:,np.nan])


# multiple nans should fail
else:

def f():
df.loc[:,np.nan]
self.assertRaises(ValueError, f)


df = DataFrame([[1,2,3],[4,5,6]], index=[1,np.nan])
check(df)

df = DataFrame([[1,2,3],[4,5,6]], columns=[1.1,2.2,np.nan])
check(df)

df = DataFrame([[0,1,2,3],[4,5,6,7]], columns=[np.nan,1.1,2.2,np.nan])
check(df)

df = DataFrame([[0.0,1,2,3.0],[4,5,6,7]], columns=[np.nan,1.1,2.2,np.nan])
check(df)

def test_logical_with_nas(self):
d = DataFrame({'a': [np.nan, False], 'b': [True, True]})
