Skip to content

index is included in memory usage by default #11867

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion doc/source/whatsnew/v0.18.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,9 @@ Other enhancements
- ``read_excel`` now supports s3 urls of the format ``s3://bucketname/filename`` (:issue:`11447`)
- A simple version of ``Panel.round()`` is now implemented (:issue:`11763`)
- For Python 3.x, ``round(DataFrame)``, ``round(Series)``, ``round(Panel)`` will work (:issue:`11763`)
- ``Dataframe`` has gained a ``_repr_latex_`` method in order to allow for automatic conversion to latex in a ipython/jupyter notebook using nbconvert. Options ``display.latex.escape`` and ``display.latex.longtable`` have been added to the configuration and are used automatically by the ``to_latex`` method.(:issue:`11778`)
- ``DataFrame`` has gained a ``_repr_latex_`` method in order to allow for automatic conversion to latex in a ipython/jupyter notebook using nbconvert. Options ``display.latex.escape`` and ``display.latex.longtable`` have been added to the configuration and are used automatically by the ``to_latex`` method.(:issue:`11778`)
- ``sys.getsizeof(obj)`` returns the memory usage of a pandas object, including the
values it contains (:issue:`11597`)

.. _whatsnew_0180.enhancements.rounding:

Expand Down Expand Up @@ -264,6 +266,7 @@ other anchored offsets like ``MonthBegin`` and ``YearBegin``.
Other API Changes
^^^^^^^^^^^^^^^^^

- ``.memory_usage`` now includes values in the index, as does memory_usage in ``.info`` (:issue:`11597`)



Expand Down
16 changes: 16 additions & 0 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,22 @@ def _reset_cache(self, key=None):
else:
self._cache.pop(key, None)

def __sizeof__(self):
    """
    Return the total memory usage of the object in bytes.

    Called by ``sys.getsizeof``. When the object exposes a
    ``memory_usage`` method (Series/DataFrame/Index), delegate to it
    with ``deep=True`` so object-dtype values are counted too;
    ``memory_usage`` may return either a scalar or a Series of
    per-column values, so sum when it is not a scalar. Objects
    without ``memory_usage`` fall back to the default sizeof.
    """
    if hasattr(self, 'memory_usage'):
        mem = self.memory_usage(deep=True)
        if not lib.isscalar(mem):
            # DataFrame.memory_usage returns one entry per column;
            # reduce to a single total
            mem = mem.sum()
        # sys.getsizeof requires an int result
        return int(mem)

    # no memory_usage attribute, so fall back to the
    # object's default 'sizeof'.
    # BUG FIX: super() takes (class, instance) in that order;
    # the original super(self, PandasObject) raised TypeError.
    return super(PandasObject, self).__sizeof__()


class NoNewAttributesMixin(object):
"""Mixin which prevents adding new attributes.

Expand Down
10 changes: 5 additions & 5 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import numpy as np
import numpy.ma as ma

from pandas.core.common import (isnull, notnull, PandasError, _try_sort, _not_none,
from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
_default_index, _maybe_upcast, is_sequence,
_infer_dtype_from_scalar, _values_from_object,
is_list_like, _maybe_box_datetimelike,
Expand All @@ -46,8 +46,7 @@
from pandas.compat import(range, map, zip, lrange, lmap, lzip, StringIO, u,
OrderedDict, raise_with_traceback)
from pandas import compat
from pandas.sparse.array import SparseArray
from pandas.util.decorators import (cache_readonly, deprecate, Appender,
from pandas.util.decorators import (deprecate, Appender,
Substitution, deprecate_kwarg)

from pandas.tseries.period import PeriodIndex
Expand Down Expand Up @@ -1720,10 +1719,11 @@ def _sizeof_fmt(num, size_qualifier):
size_qualifier = '+'
mem_usage = self.memory_usage(index=True, deep=deep).sum()
lines.append("memory usage: %s\n" %
_sizeof_fmt(mem_usage, size_qualifier))
_sizeof_fmt(mem_usage, size_qualifier)
)
_put_lines(buf, lines)

def memory_usage(self, index=False, deep=False):
def memory_usage(self, index=True, deep=False):
"""Memory usage of DataFrame columns.

Parameters
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2335,7 +2335,7 @@ def reindex_axis(self, labels, axis=0, **kwargs):
raise ValueError("cannot reindex series on non-zero axis!")
return self.reindex(index=labels, **kwargs)

def memory_usage(self, index=False, deep=False):
def memory_usage(self, index=True, deep=False):
"""Memory usage of the Series

Parameters
Expand Down
58 changes: 33 additions & 25 deletions pandas/tests/test_base.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,25 @@
# -*- coding: utf-8 -*-
from __future__ import print_function

import re
import sys
from datetime import datetime, timedelta

import numpy as np
import pandas.compat as compat

import pandas as pd
from pandas.compat import u, StringIO
from pandas.core.base import FrozenList, FrozenNDArray, PandasDelegate, NoNewAttributesMixin
import pandas.compat as compat
import pandas.core.common as com
import pandas.util.testing as tm
from pandas import Series, Index, DatetimeIndex, \
TimedeltaIndex, PeriodIndex, Timedelta
from pandas.compat import u, StringIO
from pandas.core.base import FrozenList, FrozenNDArray, \
PandasDelegate, NoNewAttributesMixin
from pandas.tseries.base import DatetimeIndexOpsMixin
from pandas.util.testing import assertRaisesRegexp, assertIsInstance
from pandas.tseries.common import is_datetimelike
from pandas import Series, Index, Int64Index, DatetimeIndex, TimedeltaIndex, PeriodIndex, Timedelta
import pandas.tslib as tslib
from pandas import _np_version_under1p9
import nose
from pandas.util.testing import assertRaisesRegexp, \
assertIsInstance

import pandas.util.testing as tm

class CheckStringMixin(object):
def test_string_methods_dont_fail(self):
Expand Down Expand Up @@ -112,7 +115,9 @@ def setUp(self):
def test_shallow_copying(self):
original = self.container.copy()
assertIsInstance(self.container.view(), FrozenNDArray)
self.assertFalse(isinstance(self.container.view(np.ndarray), FrozenNDArray))
self.assertFalse(isinstance(
self.container.view(np.ndarray), FrozenNDArray
))
self.assertIsNot(self.container.view(), self.container)
self.assert_numpy_array_equal(self.container, original)
# shallow copy should be the same too
Expand Down Expand Up @@ -881,27 +886,30 @@ def get_fill_value(obj):
# check shallow_copied
self.assertFalse(o is result)


def test_memory_usage(self):
    """
    Verify memory_usage semantics for every test object:

    - ``deep=True`` exceeds the shallow result exactly when
      object-dtype values (or, for a Series, an object-dtype index)
      are present, and equals it otherwise.
    - For a Series, usage with the index equals usage without the
      index plus the index's own usage (additivity).
    - ``sys.getsizeof`` delegates to ``memory_usage(deep=True)`` via
      ``__sizeof__``, differing only by small GC overhead.

    NOTE(review): the scraped diff interleaved the pre-change lines
    (``res2`` variants and a duplicated if/else) with the new ones;
    this is the reconstructed coherent version of the test.
    """
    for o in self.objs:
        res = o.memory_usage()
        res_deep = o.memory_usage(deep=True)

        if (com.is_object_dtype(o) or
                (isinstance(o, Series) and
                 com.is_object_dtype(o.index))):
            # if there are objects, only deep will pick them up
            self.assertTrue(res_deep > res)
        else:
            self.assertEqual(res, res_deep)

        if isinstance(o, Series):
            # index usage must be additive
            self.assertEqual(
                (o.memory_usage(index=False) +
                 o.index.memory_usage()),
                o.memory_usage(index=True)
            )

        # sys.getsizeof will call the .memory_usage with
        # deep=True, and add on some GC overhead
        diff = res_deep - sys.getsizeof(o)
        self.assertTrue(abs(diff) < 100)


class TestFloat64HashTable(tm.TestCase):
Expand Down
29 changes: 18 additions & 11 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
# -*- coding: utf-8 -*-
# pylint: disable=E1101,E1103,W0232

from datetime import datetime
from pandas.compat import range, lrange, u, PY3
import os
import pickle
import re
import sys
from datetime import datetime
from distutils.version import LooseVersion

import numpy as np
import pandas as pd

from pandas import Categorical, Index, Series, DataFrame, PeriodIndex, Timestamp, CategoricalIndex

from pandas.core.config import option_context
import pandas.core.common as com
import pandas as pd
import pandas.compat as compat
import pandas.core.common as com
import pandas.util.testing as tm
from pandas import Categorical, Index, Series, DataFrame, \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, though we usually do this with parens

PeriodIndex, Timestamp, CategoricalIndex
from pandas.compat import range, lrange, u, PY3
from pandas.core.config import option_context


class TestCategorical(tm.TestCase):
_multiprocess_can_split_ = True
Expand Down Expand Up @@ -1219,10 +1219,17 @@ def test_memory_usage(self):
self.assertEqual(cat.nbytes, cat.memory_usage())
self.assertTrue(cat.memory_usage(deep=True) > cat.nbytes)

# sys.getsizeof will call the .memory_usage with
# deep=True, and add on some GC overhead
diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
self.assertTrue(abs(diff) < 100)

def test_searchsorted(self):
# https://github.com/pydata/pandas/issues/8420
s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk' ])
s2 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ])
s1 = pd.Series(['apple', 'bread', 'bread', 'cheese',
'milk'])
s2 = pd.Series(['apple', 'bread', 'bread', 'cheese',
'milk', 'donuts'])
c1 = pd.Categorical(s1, ordered=True)
c2 = pd.Categorical(s2, ordered=True)

Expand Down
32 changes: 22 additions & 10 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7697,23 +7697,35 @@ def test_info_memory_usage(self):
df.columns = dtypes
# Ensure df size is as expected
df_size = df.memory_usage().sum()
exp_size = len(dtypes) * n * 8 # cols * rows * bytes
exp_size = (len(dtypes) + 1) * n * 8 # (cols + index) * rows * bytes
self.assertEqual(df_size, exp_size)
# Ensure number of cols in memory_usage is the same as df
size_df = np.size(df.columns.values) # index=False; default
size_df = np.size(df.columns.values) + 1 # index=True; default
self.assertEqual(size_df, np.size(df.memory_usage()))

# assert deep works only on object
self.assertEqual(df.memory_usage().sum(),df.memory_usage(deep=True).sum())
self.assertEqual(df.memory_usage().sum(),
df.memory_usage(deep=True).sum())

# test for validity
DataFrame(1,index=['a'],columns=['A']).memory_usage(index=True)
DataFrame(1,index=['a'],columns=['A']).index.nbytes
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.nbytes
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.values.nbytes
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).memory_usage(index=True)
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.nbytes
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.values.nbytes
DataFrame(1, index=['a'], columns=['A']
).memory_usage(index=True)
DataFrame(1, index=['a'], columns=['A']
).index.nbytes
df = DataFrame(
data=1,
index=pd.MultiIndex.from_product(
[['a'], range(1000)]),
columns=['A']
)
df.index.nbytes
df.memory_usage(index=True)
df.index.values.nbytes

# sys.getsizeof will call the .memory_usage with
# deep=True, and add on some GC overhead
diff = df.memory_usage(deep=True).sum() - sys.getsizeof(df)
self.assertTrue(abs(diff) < 100)

def test_dtypes(self):
self.mixed_frame['bool'] = self.mixed_frame['A'] > 0
Expand Down