From d77c002fa597c554a4dda848aff0258d5d458844 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 25 Jan 2017 22:01:46 -0500 Subject: [PATCH] PERF: correctly report memory used by Index's --- doc/source/whatsnew/v0.20.0.txt | 37 ++++++++++++++++++++++++ pandas/core/base.py | 1 + pandas/index.pyx | 9 ++++++ pandas/indexes/base.py | 8 +++++ pandas/indexes/multi.py | 8 ++++- pandas/src/hashtable_class_helper.pxi.in | 19 ++++++++++++ pandas/tests/indexes/common.py | 20 +++++++++++++ pandas/tests/test_categorical.py | 8 +++-- 8 files changed, 106 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 2dc15f2fe0781..15a696987d628 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -314,6 +314,43 @@ New Behavior: In [5]: df['a']['2011-12-31 23:59:59'] Out[5]: 1 +.. _whatsnew_0200.api_breaking.memory_usage: + +Memory Usage for Index is more Accurate +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions, calling ``.memory_usage()`` on a pandas structure that has an index would only include the actual index values and not the structures that facilitate fast indexing. The reported memory usage will generally differ for ``Index`` and ``MultiIndex``, and less so for other index types. (:issue:`15237`) + +Previous Behavior: + +.. code-block:: ipython + + In [8]: index = Index(['foo', 'bar', 'baz']) + + In [9]: index.memory_usage(deep=True) + Out[9]: 180 + + In [10]: index.get_loc('foo') + Out[10]: 0 + + In [11]: index.memory_usage(deep=True) + Out[11]: 180 + +New Behavior: + +.. code-block:: ipython + + In [8]: index = Index(['foo', 'bar', 'baz']) + + In [9]: index.memory_usage(deep=True) + Out[9]: 180 + + In [10]: index.get_loc('foo') + Out[10]: 0 + + In [11]: index.memory_usage(deep=True) + Out[11]: 260 + .. 
_whatsnew_0200.api: Other API Changes diff --git a/pandas/core/base.py b/pandas/core/base.py index 77272f7721b32..e7a79c3291a92 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1067,6 +1067,7 @@ def memory_usage(self, deep=False): v = self.values.nbytes if deep and is_object_dtype(self): v += lib.memory_usage_of_objects(self.values) + return v def factorize(self, sort=False, na_sentinel=-1): diff --git a/pandas/index.pyx b/pandas/index.pyx index d575defe17422..0c975d1775a03 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -203,6 +203,15 @@ cdef class IndexEngine: return result + def sizeof(self, deep=False): + """ return the sizeof our mapping """ + if not self.is_mapping_populated: + return 0 + return self.mapping.sizeof(deep=deep) + + def __sizeof__(self): + return self.sizeof() + property is_unique: def __get__(self): diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 2009a064afd73..bc2dce4e97e5b 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -536,6 +536,14 @@ def get_values(self): """ return the underlying data as an ndarray """ return self.values + @Appender(IndexOpsMixin.memory_usage.__doc__) + def memory_usage(self, deep=False): + result = super(Index, self).memory_usage(deep=deep) + + # include our engine hashtable + result += self._engine.sizeof(deep=deep) + return result + # ops compat def tolist(self): """ diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index d8991a0982db2..00ead012a916a 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -446,13 +446,19 @@ def _nbytes(self, deep=False): return the number of bytes in the underlying data deeply introspect the level data if deep=True + include the engine hashtable + *this is in internal routine* """ level_nbytes = sum((i.memory_usage(deep=deep) for i in self.levels)) label_nbytes = sum((i.nbytes for i in self.labels)) names_nbytes = sum((getsizeof(i) for i in self.names)) - return level_nbytes + label_nbytes + 
names_nbytes + result = level_nbytes + label_nbytes + names_nbytes + + # include our engine hashtable + result += self._engine.sizeof(deep=deep) + return result def _format_attrs(self): """ diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in index 1d3c4b2cb5889..360f43e18f340 100644 --- a/pandas/src/hashtable_class_helper.pxi.in +++ b/pandas/src/hashtable_class_helper.pxi.in @@ -203,6 +203,7 @@ cdef class ObjectVector: cdef class HashTable: + pass {{py: @@ -237,6 +238,12 @@ cdef class {{name}}HashTable(HashTable): k = kh_get_{{dtype}}(self.table, key) return k != self.table.n_buckets + def sizeof(self, deep=False): + """ return the size of my table in bytes """ + return self.table.n_buckets * (sizeof({{dtype}}_t) + # keys + sizeof(size_t) + # vals + sizeof(uint32_t)) # flags + cpdef get_item(self, {{dtype}}_t val): cdef khiter_t k k = kh_get_{{dtype}}(self.table, val) @@ -464,6 +471,12 @@ cdef class StringHashTable(HashTable): kh_destroy_str(self.table) self.table = NULL + def sizeof(self, deep=False): + """ return the size of my table in bytes """ + return self.table.n_buckets * (sizeof(char *) + # keys + sizeof(ssize_t) + # vals + sizeof(uint32_t)) # flags + cpdef get_item(self, object val): cdef: khiter_t k @@ -714,6 +727,12 @@ cdef class PyObjectHashTable(HashTable): k = kh_get_pymap(self.table, key) return k != self.table.n_buckets + def sizeof(self, deep=False): + """ return the size of my table in bytes """ + return self.table.n_buckets * (sizeof(PyObject *) + # keys + sizeof(size_t) + # vals + sizeof(uint32_t)) # flags + cpdef get_item(self, object val): cdef khiter_t k if val != val or val is None: diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 63e9fe580d73d..5a482acf403cd 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -366,6 +366,26 @@ def test_compat(self): for ind in self.indices.values(): self.assertEqual(ind.tolist(), list(ind)) + def 
test_memory_usage(self): + for name, index in compat.iteritems(self.indices): + result = index.memory_usage() + if len(index): + index.get_loc(index[0]) + result2 = index.memory_usage() + result3 = index.memory_usage(deep=True) + + # RangeIndex doesn't use a hashtable engine + if not isinstance(index, RangeIndex): + self.assertTrue(result2 > result) + + if index.inferred_type == 'object': + self.assertTrue(result3 > result2) + + else: + + # we report 0 for no-length + self.assertEqual(result, 0) + def test_argsort(self): for k, ind in self.indices.items(): diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 2321467d04c79..745914d3e7ef5 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1555,11 +1555,13 @@ def test_nbytes(self): def test_memory_usage(self): cat = pd.Categorical([1, 2, 3]) - self.assertEqual(cat.nbytes, cat.memory_usage()) - self.assertEqual(cat.nbytes, cat.memory_usage(deep=True)) + + # .categories is an index, so we include the hashtable + self.assertTrue(cat.nbytes > 0 and cat.nbytes <= cat.memory_usage()) + self.assertTrue(cat.nbytes > 0 and + cat.nbytes <= cat.memory_usage(deep=True)) cat = pd.Categorical(['foo', 'foo', 'bar']) - self.assertEqual(cat.nbytes, cat.memory_usage()) self.assertTrue(cat.memory_usage(deep=True) > cat.nbytes) # sys.getsizeof will call the .memory_usage with