From c2e74e810fbbfe91588ac37448704c22b4fe294a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 27 Sep 2016 18:34:34 -0400 Subject: [PATCH] PERF: unnecessary materialization of a MultiIndex.values when introspecting memory closes #14308 --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/indexes/multi.py | 19 ++++++++++++++++++- pandas/tests/frame/test_repr_info.py | 24 ++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 2f8baae416dea..f4110cba68c31 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1409,7 +1409,7 @@ Performance Improvements - Improved performance of ``factorize`` of datetime with timezone (:issue:`13750`) - Improved performance of by lazily creating indexing hashtables on larger Indexes (:issue:`14266`) - Improved performance of ``groupby.groups`` (:issue:`14293`) - +- Unnecessary materializing of a MultiIndex when introspecting for memory usage (:issue:`14308`) .. 
_whatsnew_0190.bug_fixes: diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index e6aefaeb01a15..1ab5dbb737739 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -413,10 +413,27 @@ def _shallow_copy(self, values=None, **kwargs): def dtype(self): return np.dtype('O') + @Appender(Index.memory_usage.__doc__) + def memory_usage(self, deep=False): + # we are overriding our base class to avoid + # computing .values here which could materialize + # a tuple representation unnecessarily + return self._nbytes(deep) + @cache_readonly def nbytes(self): """ return the number of bytes in the underlying data """ - level_nbytes = sum((i.nbytes for i in self.levels)) + return self._nbytes(False) + + def _nbytes(self, deep=False): + """ + return the number of bytes in the underlying data + deeply introspect the level data if deep=True + + *this is an internal routine* + + """ + level_nbytes = sum((i.memory_usage(deep=deep) for i in self.levels)) label_nbytes = sum((i.nbytes for i in self.labels)) names_nbytes = sum((getsizeof(i) for i in self.names)) return level_nbytes + label_nbytes + names_nbytes diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 66e592c013fb1..5e5e9abda1200 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -381,3 +381,27 @@ def test_info_memory_usage(self): # deep=True, and add on some GC overhead diff = df.memory_usage(deep=True).sum() - sys.getsizeof(df) self.assertTrue(abs(diff) < 100) + + def test_info_memory_usage_bug_on_multiindex(self): + # GH 14308 + # memory usage introspection should not materialize .values + + from string import ascii_uppercase as uppercase + + def memory_usage(f): + return f.memory_usage(deep=True).sum() + + N = 100 + M = len(uppercase) + index = pd.MultiIndex.from_product([list(uppercase), + pd.date_range('20160101', + periods=N)], + names=['id', 'date']) + df = DataFrame({'value': np.random.randn(N * M)}, 
index=index) + + unstacked = df.unstack('id') + self.assertEqual(df.values.nbytes, unstacked.values.nbytes) + self.assertTrue(memory_usage(df) > memory_usage(unstacked)) + + # high upper bound + self.assertTrue(memory_usage(unstacked) - memory_usage(df) < 2000)