diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 58d003b5c9dc7..34c58fc06ac82 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -107,7 +107,9 @@ Other enhancements - ``read_excel`` now supports s3 urls of the format ``s3://bucketname/filename`` (:issue:`11447`) - A simple version of ``Panel.round()`` is now implemented (:issue:`11763`) - For Python 3.x, ``round(DataFrame)``, ``round(Series)``, ``round(Panel)`` will work (:issue:`11763`) -- ``Dataframe`` has gained a ``_repr_latex_`` method in order to allow for automatic conversion to latex in a ipython/jupyter notebook using nbconvert. Options ``display.latex.escape`` and ``display.latex.longtable`` have been added to the configuration and are used automatically by the ``to_latex`` method.(:issue:`11778`) +- ``DataFrame`` has gained a ``_repr_latex_`` method in order to allow for automatic conversion to latex in a ipython/jupyter notebook using nbconvert. Options ``display.latex.escape`` and ``display.latex.longtable`` have been added to the configuration and are used automatically by the ``to_latex`` method.(:issue:`11778`) +- ``sys.getsizeof(obj)`` returns the memory usage of a pandas object, including the + values it contains (:issue:`11597`) .. _whatsnew_0180.enhancements.rounding: @@ -264,6 +266,7 @@ other anchored offsets like ``MonthBegin`` and ``YearBegin``. 
Other API Changes ^^^^^^^^^^^^^^^^^ +- ``.memory_usage`` now includes values in the index, as does memory_usage in ``.info`` (:issue:`11597`) diff --git a/pandas/core/base.py b/pandas/core/base.py index a1e1c20344ea4..548b922926f02 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -112,6 +112,22 @@ def _reset_cache(self, key=None): else: self._cache.pop(key, None) + def __sizeof__(self): + """ + Generates the total memory usage for an object that returns + either a value or Series of values + """ + if hasattr(self, 'memory_usage'): + mem = self.memory_usage(deep=True) + if not lib.isscalar(mem): + mem = mem.sum() + return int(mem) + + # no memory_usage attribute, so fall back to + # object's 'sizeof' + return super(PandasObject, self).__sizeof__() + + class NoNewAttributesMixin(object): """Mixin which prevents adding new attributes. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fea9318349d0b..b66c51bc4411e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -23,7 +23,7 @@ import numpy as np import numpy.ma as ma -from pandas.core.common import (isnull, notnull, PandasError, _try_sort, _not_none, +from pandas.core.common import (isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast, is_sequence, _infer_dtype_from_scalar, _values_from_object, is_list_like, _maybe_box_datetimelike, @@ -46,8 +46,7 @@ from pandas.compat import(range, map, zip, lrange, lmap, lzip, StringIO, u, OrderedDict, raise_with_traceback) from pandas import compat -from pandas.sparse.array import SparseArray -from pandas.util.decorators import (cache_readonly, deprecate, Appender, +from pandas.util.decorators import (deprecate, Appender, Substitution, deprecate_kwarg) from pandas.tseries.period import PeriodIndex @@ -1720,10 +1719,11 @@ def _sizeof_fmt(num, size_qualifier): size_qualifier = '+' mem_usage = self.memory_usage(index=True, deep=deep).sum() lines.append("memory usage: %s\n" % - _sizeof_fmt(mem_usage, size_qualifier)) + 
_sizeof_fmt(mem_usage, size_qualifier) + ) _put_lines(buf, lines) - def memory_usage(self, index=False, deep=False): + def memory_usage(self, index=True, deep=False): """Memory usage of DataFrame columns. Parameters diff --git a/pandas/core/series.py b/pandas/core/series.py index 29abd8f031206..9910966bd4d2c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2335,7 +2335,7 @@ def reindex_axis(self, labels, axis=0, **kwargs): raise ValueError("cannot reindex series on non-zero axis!") return self.reindex(index=labels, **kwargs) - def memory_usage(self, index=False, deep=False): + def memory_usage(self, index=True, deep=False): """Memory usage of the Series Parameters diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index db391dca7114c..a1c7aaa259fa8 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1,22 +1,25 @@ # -*- coding: utf-8 -*- from __future__ import print_function + import re +import sys from datetime import datetime, timedelta + import numpy as np -import pandas.compat as compat + import pandas as pd -from pandas.compat import u, StringIO -from pandas.core.base import FrozenList, FrozenNDArray, PandasDelegate, NoNewAttributesMixin +import pandas.compat as compat import pandas.core.common as com +import pandas.util.testing as tm +from pandas import Series, Index, DatetimeIndex, \ + TimedeltaIndex, PeriodIndex, Timedelta +from pandas.compat import u, StringIO +from pandas.core.base import FrozenList, FrozenNDArray, \ + PandasDelegate, NoNewAttributesMixin from pandas.tseries.base import DatetimeIndexOpsMixin -from pandas.util.testing import assertRaisesRegexp, assertIsInstance -from pandas.tseries.common import is_datetimelike -from pandas import Series, Index, Int64Index, DatetimeIndex, TimedeltaIndex, PeriodIndex, Timedelta -import pandas.tslib as tslib -from pandas import _np_version_under1p9 -import nose +from pandas.util.testing import assertRaisesRegexp, \ + assertIsInstance -import 
pandas.util.testing as tm class CheckStringMixin(object): def test_string_methods_dont_fail(self): @@ -112,7 +115,9 @@ def setUp(self): def test_shallow_copying(self): original = self.container.copy() assertIsInstance(self.container.view(), FrozenNDArray) - self.assertFalse(isinstance(self.container.view(np.ndarray), FrozenNDArray)) + self.assertFalse(isinstance( + self.container.view(np.ndarray), FrozenNDArray + )) self.assertIsNot(self.container.view(), self.container) self.assert_numpy_array_equal(self.container, original) # shallow copy should be the same too @@ -881,27 +886,30 @@ def get_fill_value(obj): # check shallow_copied self.assertFalse(o is result) - def test_memory_usage(self): for o in self.objs: res = o.memory_usage() - res2 = o.memory_usage(deep=True) + res_deep = o.memory_usage(deep=True) - if com.is_object_dtype(o): - self.assertTrue(res2 > res) + if (com.is_object_dtype(o) or + (isinstance(o, Series) and + com.is_object_dtype(o.index))): + # if there are objects, only deep will pick them up + self.assertTrue(res_deep > res) else: - self.assertEqual(res, res2) + self.assertEqual(res, res_deep) if isinstance(o, Series): - res = o.memory_usage(index=True) - res2 = o.memory_usage(index=True, deep=True) - if com.is_object_dtype(o) or com.is_object_dtype(o.index): - self.assertTrue(res2 > res) - else: - self.assertEqual(res, res2) - - self.assertEqual(o.memory_usage(index=False) + o.index.memory_usage(), - o.memory_usage(index=True)) + self.assertEqual( + (o.memory_usage(index=False) + + o.index.memory_usage()), + o.memory_usage(index=True) + ) + + # sys.getsizeof will call the .memory_usage with + # deep=True, and add on some GC overhead + diff = res_deep - sys.getsizeof(o) + self.assertTrue(abs(diff) < 100) class TestFloat64HashTable(tm.TestCase): diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 64908f96bfdd8..d6a6446eb9f47 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ 
-1,22 +1,22 @@ # -*- coding: utf-8 -*- # pylint: disable=E1101,E1103,W0232 -from datetime import datetime -from pandas.compat import range, lrange, u, PY3 import os -import pickle -import re +import sys +from datetime import datetime from distutils.version import LooseVersion import numpy as np -import pandas as pd -from pandas import Categorical, Index, Series, DataFrame, PeriodIndex, Timestamp, CategoricalIndex - -from pandas.core.config import option_context -import pandas.core.common as com +import pandas as pd import pandas.compat as compat +import pandas.core.common as com import pandas.util.testing as tm +from pandas import Categorical, Index, Series, DataFrame, \ + PeriodIndex, Timestamp, CategoricalIndex +from pandas.compat import range, lrange, u, PY3 +from pandas.core.config import option_context + class TestCategorical(tm.TestCase): _multiprocess_can_split_ = True @@ -1219,10 +1219,17 @@ def test_memory_usage(self): self.assertEqual(cat.nbytes, cat.memory_usage()) self.assertTrue(cat.memory_usage(deep=True) > cat.nbytes) + # sys.getsizeof will call the .memory_usage with + # deep=True, and add on some GC overhead + diff = cat.memory_usage(deep=True) - sys.getsizeof(cat) + self.assertTrue(abs(diff) < 100) + def test_searchsorted(self): # https://github.com/pydata/pandas/issues/8420 - s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk' ]) - s2 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ]) + s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', + 'milk']) + s2 = pd.Series(['apple', 'bread', 'bread', 'cheese', + 'milk', 'donuts']) c1 = pd.Categorical(s1, ordered=True) c2 = pd.Categorical(s2, ordered=True) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 57e75e3393b1b..a214d9ad5ded1 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -7697,23 +7697,35 @@ def test_info_memory_usage(self): df.columns = dtypes # Ensure df size is as expected df_size = df.memory_usage().sum() - 
exp_size = len(dtypes) * n * 8 # cols * rows * bytes + exp_size = (len(dtypes) + 1) * n * 8 # (cols + index) * rows * bytes self.assertEqual(df_size, exp_size) # Ensure number of cols in memory_usage is the same as df - size_df = np.size(df.columns.values) # index=False; default + size_df = np.size(df.columns.values) + 1 # index=True; default self.assertEqual(size_df, np.size(df.memory_usage())) # assert deep works only on object - self.assertEqual(df.memory_usage().sum(),df.memory_usage(deep=True).sum()) + self.assertEqual(df.memory_usage().sum(), + df.memory_usage(deep=True).sum()) # test for validity - DataFrame(1,index=['a'],columns=['A']).memory_usage(index=True) - DataFrame(1,index=['a'],columns=['A']).index.nbytes - DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.nbytes - DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.values.nbytes - DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).memory_usage(index=True) - DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.nbytes - DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.values.nbytes + DataFrame(1, index=['a'], columns=['A'] + ).memory_usage(index=True) + DataFrame(1, index=['a'], columns=['A'] + ).index.nbytes + df = DataFrame( + data=1, + index=pd.MultiIndex.from_product( + [['a'], range(1000)]), + columns=['A'] + ) + df.index.nbytes + df.memory_usage(index=True) + df.index.values.nbytes + + # sys.getsizeof will call the .memory_usage with + # deep=True, and add on some GC overhead + diff = df.memory_usage(deep=True).sum() - sys.getsizeof(df) + self.assertTrue(abs(diff) < 100) def test_dtypes(self): self.mixed_frame['bool'] = self.mixed_frame['A'] > 0