Skip to content

PERF: correctly report memory used by Index's #15237

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,43 @@ New Behavior:
In [5]: df['a']['2011-12-31 23:59:59']
Out[5]: 1

.. _whatsnew_0200.api_breaking.memory_usage:

Memory Usage for Index is more Accurate
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In previous versions, calling ``.memory_usage()`` on a pandas structure that has an index would report only the actual index values and would not include the auxiliary structures that facilitate fast indexing. The difference will generally be noticeable for ``Index`` and ``MultiIndex`` and less so for other index types. (:issue:`15237`)

Previous Behavior:

.. code-block:: ipython

In [8]: index = Index(['foo', 'bar', 'baz'])

In [9]: index.memory_usage(deep=True)
Out[9]: 180

In [10]: index.get_loc('foo')
Out[10]: 0

In [11]: index.memory_usage(deep=True)
Out[11]: 180

New Behavior:

.. code-block:: ipython

In [8]: index = Index(['foo', 'bar', 'baz'])

In [9]: index.memory_usage(deep=True)
Out[9]: 180

In [10]: index.get_loc('foo')
Out[10]: 0

In [11]: index.memory_usage(deep=True)
Out[11]: 260

.. _whatsnew_0200.api:

Other API Changes
Expand Down
1 change: 1 addition & 0 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1067,6 +1067,7 @@ def memory_usage(self, deep=False):
v = self.values.nbytes
if deep and is_object_dtype(self):
v += lib.memory_usage_of_objects(self.values)

return v

def factorize(self, sort=False, na_sentinel=-1):
Expand Down
9 changes: 9 additions & 0 deletions pandas/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,15 @@ cdef class IndexEngine:

return result

def sizeof(self, deep=False):
    """Return the bytes consumed by the engine's hash mapping.

    Reports 0 when the mapping has not been populated yet (no lookup
    has forced the hashtable to be built).
    """
    if self.is_mapping_populated:
        return self.mapping.sizeof(deep=deep)
    return 0

def __sizeof__(self):
    # support sys.getsizeof() by delegating to the shallow sizeof()
    return self.sizeof()

property is_unique:

def __get__(self):
Expand Down
8 changes: 8 additions & 0 deletions pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,6 +536,14 @@ def get_values(self):
""" return the underlying data as an ndarray """
return self.values

@Appender(IndexOpsMixin.memory_usage.__doc__)
def memory_usage(self, deep=False):
    # base usage of the underlying values, plus the engine's
    # hashtable (0 until a lookup populates it)
    total = super(Index, self).memory_usage(deep=deep)
    return total + self._engine.sizeof(deep=deep)

# ops compat
def tolist(self):
"""
Expand Down
8 changes: 7 additions & 1 deletion pandas/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,13 +446,19 @@ def _nbytes(self, deep=False):
return the number of bytes in the underlying data
deeply introspect the level data if deep=True

include the engine hashtable

*this is an internal routine*

"""
level_nbytes = sum((i.memory_usage(deep=deep) for i in self.levels))
label_nbytes = sum((i.nbytes for i in self.labels))
names_nbytes = sum((getsizeof(i) for i in self.names))
return level_nbytes + label_nbytes + names_nbytes
result = level_nbytes + label_nbytes + names_nbytes

# include our engine hashtable
result += self._engine.sizeof(deep=deep)
return result

def _format_attrs(self):
"""
Expand Down
19 changes: 19 additions & 0 deletions pandas/src/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ cdef class ObjectVector:


cdef class HashTable:
    # empty base class: gives the generated {{name}}HashTable variants
    # a common extension type to inherit from

    pass

{{py:
Expand Down Expand Up @@ -237,6 +238,12 @@ cdef class {{name}}HashTable(HashTable):
k = kh_get_{{dtype}}(self.table, key)
return k != self.table.n_buckets

def sizeof(self, deep=False):
    """ return the size of my table in bytes """
    # the table keeps per-bucket storage for keys, values and flags;
    # `deep` is accepted for API symmetry but unused, since the stored
    # entries are plain C scalars with nothing further to traverse
    # NOTE(review): counting a full uint32_t of flags per bucket may
    # overestimate if the underlying table packs flags -- confirm intent
    return self.table.n_buckets * (sizeof({{dtype}}_t) + # keys
                                   sizeof(size_t) + # vals
                                   sizeof(uint32_t)) # flags

cpdef get_item(self, {{dtype}}_t val):
cdef khiter_t k
k = kh_get_{{dtype}}(self.table, val)
Expand Down Expand Up @@ -464,6 +471,12 @@ cdef class StringHashTable(HashTable):
kh_destroy_str(self.table)
self.table = NULL

def sizeof(self, deep=False):
    """ return the size of my table in bytes """
    # per-bucket storage: a char* key, an ssize_t value and flag bits;
    # `deep` is accepted for API symmetry but unused -- the strings the
    # key pointers reference are not followed
    return self.table.n_buckets * (sizeof(char *) + # keys
                                   sizeof(ssize_t) + # vals
                                   sizeof(uint32_t)) # flags

cpdef get_item(self, object val):
cdef:
khiter_t k
Expand Down Expand Up @@ -714,6 +727,12 @@ cdef class PyObjectHashTable(HashTable):
k = kh_get_pymap(self.table, <PyObject*>key)
return k != self.table.n_buckets

def sizeof(self, deep=False):
    """ return the size of my table in bytes """
    # per-bucket storage: a PyObject* key, a size_t value and flag
    # bits; `deep` is accepted for API symmetry but unused -- the
    # Python objects the key pointers reference are not traversed
    return self.table.n_buckets * (sizeof(PyObject *) + # keys
                                   sizeof(size_t) + # vals
                                   sizeof(uint32_t)) # flags

cpdef get_item(self, object val):
cdef khiter_t k
if val != val or val is None:
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/indexes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,26 @@ def test_compat(self):
for ind in self.indices.values():
self.assertEqual(ind.tolist(), list(ind))

def test_memory_usage(self):
    # memory_usage should grow once the engine hashtable is built,
    # and deep introspection should add more for object dtypes
    for name, index in compat.iteritems(self.indices):
        baseline = index.memory_usage()

        if not len(index):
            # we report 0 for no-length
            self.assertEqual(baseline, 0)
            continue

        # a lookup populates the engine's hashtable
        index.get_loc(index[0])
        after = index.memory_usage()
        after_deep = index.memory_usage(deep=True)

        # RangeIndex doesn't use a hashtable engine
        if not isinstance(index, RangeIndex):
            self.assertTrue(after > baseline)

        if index.inferred_type == 'object':
            self.assertTrue(after_deep > after)

def test_argsort(self):
for k, ind in self.indices.items():

Expand Down
8 changes: 5 additions & 3 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1555,11 +1555,13 @@ def test_nbytes(self):

def test_memory_usage(self):
cat = pd.Categorical([1, 2, 3])
self.assertEqual(cat.nbytes, cat.memory_usage())
self.assertEqual(cat.nbytes, cat.memory_usage(deep=True))

# .categories is an index, so we include the hashtable
self.assertTrue(cat.nbytes > 0 and cat.nbytes <= cat.memory_usage())
self.assertTrue(cat.nbytes > 0 and
cat.nbytes <= cat.memory_usage(deep=True))

cat = pd.Categorical(['foo', 'foo', 'bar'])
self.assertEqual(cat.nbytes, cat.memory_usage())
self.assertTrue(cat.memory_usage(deep=True) > cat.nbytes)

# sys.getsizeof will call the .memory_usage with
Expand Down