From a903f328eae45a401a2a7b86492ce200719646c3 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 8 Oct 2020 00:44:29 +0700 Subject: [PATCH 01/26] TST: add series info tests --- pandas/tests/io/formats/test_info.py | 121 ++++++++++++++++++++++++++- 1 file changed, 120 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index d98530b5435e7..7369e87accec2 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -67,8 +67,19 @@ def test_info_categorical_column(): buf = StringIO() df2.info(buf=buf) + s = Series( + np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n)) + ).astype("category") + s.isna() + buf = StringIO() + s.info(buf=buf) -def test_info(float_frame, datetime_frame): + s2 = s[s == "d"] + buf = StringIO() + s2.info(buf=buf) + + +def test_info_frame(float_frame, datetime_frame): io = StringIO() float_frame.info(buf=io) datetime_frame.info(buf=io) @@ -79,6 +90,32 @@ def test_info(float_frame, datetime_frame): frame.info(verbose=False) +@pytest.mark.parametrize("verbose", [True, False]) +def test_info_series(verbose): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + s = Series(range(len(index)), index=index, name="sth") + buf = StringIO() + s.info(verbose=verbose, buf=buf) + expected = """ +MultiIndex: 10 entries, ('foo', 'one') to ('qux', 'three') +""" + if verbose: + expected += """Series name: sth +Non-Null Count Dtype +-------------- ----- +10 non-null int64 +""" + expected += f"""dtypes: int64(1) +memory usage: {s.memory_usage()}.0+ bytes +""" + result = buf.getvalue() + assert result == expected + + def test_info_verbose(): buf = StringIO() size = 1001 @@ -180,6 +217,25 @@ def test_info_memory(): ) assert result == expected + s = Series([1, 2], dtype="i8") + buf = StringIO() + s.info(buf=buf) + result = buf.getvalue() + bytes = float(s.memory_usage()) + expected = textwrap.dedent( + f"""\ + + RangeIndex: 2 entries, 0 to 1 + Series name: None + Non-Null Count Dtype + -------------- ----- + 2 non-null int64 + dtypes: int64(1) + memory usage: {bytes} bytes + """ + ) + assert result == expected + def test_info_wide(): io = StringIO() @@ -198,6 +254,11 @@ def test_info_wide(): assert rs == xp reset_option("display.max_info_columns") + s = Series(np.random.randn(101)) + msg = "Argument `max_cols` can only be passed in DataFrame.info, not Series.info" + with pytest.raises(ValueError, match=msg): + s.info(max_cols=1) + def test_info_duplicate_columns(): io = StringIO() @@ -246,6 +307,14 @@ def test_info_shows_column_dtypes(): name = f" {i:d} {i:d} {n:d} non-null {dtype}" assert name in res + for dtype in dtypes: + s = Series(np.random.randint(2, size=n).astype(dtype)) + buf = StringIO() + s.info(buf=buf) + res = buf.getvalue() + name = f"{n:d} non-null {dtype}" + assert name in res + def test_info_max_cols(): df = DataFrame(np.random.randn(10, 5)) @@ -378,6 +447,14 @@ def test_info_memory_usage_deep_not_pypy(): df_object = DataFrame({"a": ["a"]}) assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum() + s_with_object_index = Series({"a": [1]}, index=["foo"]) + assert s_with_object_index.memory_usage( + index=True, deep=True + ) > s_with_object_index.memory_usage(index=True) + + s_object = Series({"a": ["a"]}) + assert s_object.memory_usage(deep=True) > s_object.memory_usage() + 
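A minimal standalone sketch of the deep vs. shallow memory behaviour that the assertions just above rely on (CPython only; illustrative example and variable names, not part of the diff):

    import pandas as pd

    s_object = pd.Series(["a"])
    # Shallow usage: index bytes plus one 8-byte object pointer per element.
    shallow = s_object.memory_usage()
    # deep=True additionally counts roughly sys.getsizeof() of each Python
    # string element, so the deep figure is strictly larger for object dtype
    # on CPython (on PyPy both calls return the same number, hence the skipif
    # split in the tests above).
    deep = s_object.memory_usage(deep=True)
    assert deep > shallow
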
@pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result") def test_info_memory_usage_deep_pypy(): @@ -390,6 +467,14 @@ def test_info_memory_usage_deep_pypy(): df_object = DataFrame({"a": ["a"]}) assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum() + s_with_object_index = Series({"a": [1]}, index=["foo"]) + assert s_with_object_index.memory_usage( + index=True, deep=True + ) == s_with_object_index.memory_usage(index=True) + + s_object = Series({"a": ["a"]}) + assert s_object.memory_usage(deep=True) == s_object.memory_usage() + @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design") def test_usage_via_getsizeof(): @@ -429,6 +514,26 @@ def test_info_memory_usage_qualified(): df.info(buf=buf) assert "+" in buf.getvalue() + buf = StringIO() + s = Series(1, index=[1, 2, 3]) + s.info(buf=buf) + assert "+" not in buf.getvalue() + + buf = StringIO() + s = Series(1, index=list("ABC")) + s.info(buf=buf) + assert "+" in buf.getvalue() + + buf = StringIO() + s = Series(1, index=MultiIndex.from_product([range(3), range(3)]),) + s.info(buf=buf) + assert "+" not in buf.getvalue() + + buf = StringIO() + s = Series(1, index=MultiIndex.from_product([range(3), ["foo", "bar"]]),) + s.info(buf=buf) + assert "+" in buf.getvalue() + def test_info_memory_usage_bug_on_multiindex(): # GH 14308 @@ -451,6 +556,15 @@ def memory_usage(f): # high upper bound assert memory_usage(unstacked) - memory_usage(df) < 2000 + s = Series(np.random.randn(N * M), index=index) + + unstacked = s.unstack("id") + assert s.values.nbytes == unstacked.values.nbytes + assert s.memory_usage(deep=True) > unstacked.memory_usage(deep=True).sum() + + # high upper bound + assert unstacked.memory_usage(deep=True).sum() - s.memory_usage(deep=True) < 2000 + def test_info_categorical(): # GH14298 @@ -459,3 +573,8 @@ def test_info_categorical(): buf = StringIO() df.info(buf=buf) + + s = Series(np.zeros((2)), index=idx) + + buf = StringIO() + s.info(buf=buf) From e07d6e24194877bbf1701e47f02090cb0defe03d Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 8 Oct 2020 00:46:02 +0700 Subject: [PATCH 02/26] TST: remove test that series has no info --- pandas/tests/series/test_api.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index d92edb6fe149a..f9169186e40f1 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -318,12 +318,6 @@ def test_items_strings(self, string_series): # assert is lazy (generators don't define reverse, lists do) assert not hasattr(string_series.items(), "reverse") - def test_raise_on_info(self): - s = Series(np.random.randn(10)) - msg = "'Series' object has no attribute 'info'" - with pytest.raises(AttributeError, match=msg): - s.info() - def test_copy(self): for deep in [None, False, True]: From 0990d54d8a1ebee9801093c961a789e4e5f2b25b Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 8 Oct 2020 00:48:19 +0700 Subject: [PATCH 03/26] ENH: add method Series.info --- pandas/core/series.py | 89 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/pandas/core/series.py b/pandas/core/series.py index 5cc163807fac6..751716950f9b2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -97,6 +97,7 @@ from pandas.core.tools.datetimes import to_datetime import pandas.io.formats.format as fmt +from pandas.io.formats.info import SeriesInfo import pandas.plotting if TYPE_CHECKING: @@ -4553,6 +4554,94 @@ def replace( 
method=method, ) + @Substitution( + klass="Series", + type_sub="", + max_cols_sub="", + examples_sub=( + """ + >>> int_values = [1, 2, 3, 4, 5] + >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] + >>> s = pd.Series(text_values, index=int_values) + >>> s.info() + + Int64Index: 5 entries, 1 to 5 + Series name: None + Non-Null Count Dtype + -------------- ----- + 5 non-null object + dtypes: object(1) + memory usage: 80.0+ bytes + + Prints a summary excluding information about its values: + + >>> s.info(verbose=False) + + Int64Index: 5 entries, 1 to 5 + dtypes: object(1) + memory usage: 80.0+ bytes + + Pipe output of Series.info to buffer instead of sys.stdout, get + buffer content and writes to a text file: + + >>> import io + >>> buffer = io.StringIO() + >>> s.info(buf=buffer) + >>> s = buffer.getvalue() + >>> with open("df_info.txt", "w", + ... encoding="utf-8") as f: # doctest: +SKIP + ... f.write(s) + 260 + + The `memory_usage` parameter allows deep introspection mode, specially + useful for big Series and fine-tune memory optimization: + + >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) + >>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6)) + >>> s.info() + + RangeIndex: 1000000 entries, 0 to 999999 + Series name: None + Non-Null Count Dtype + -------------- ----- + 1000000 non-null object + dtypes: object(1) + memory usage: 7.6+ MB + + >>> s.info(memory_usage='deep') + + RangeIndex: 1000000 entries, 0 to 999999 + Series name: None + Non-Null Count Dtype + -------------- ----- + 1000000 non-null object + dtypes: object(1) + memory usage: 55.3 MB""" + ), + see_also_sub=( + """ + Series.describe: Generate descriptive statistics of Series. + Series.memory_usage: Memory usage of Series.""" + ), + ) + @doc(SeriesInfo.info) + def info( + self, + verbose: Optional[bool] = None, + buf: Optional[IO[str]] = None, + max_cols: Optional[int] = None, + memory_usage: Optional[Union[bool, str]] = None, + null_counts: Optional[bool] = None, + ) -> None: + if max_cols is not None: + raise ValueError( + "Argument `max_cols` can only be passed " + "in DataFrame.info, not Series.info" + ) + return SeriesInfo( + self, verbose, buf, max_cols, memory_usage, null_counts + ).info() + @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "Series": return super().shift( From 181479527443ade4d4867fbcccf163ff3e64c931 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 8 Oct 2020 01:27:17 +0700 Subject: [PATCH 04/26] REF: split tests for frame and series --- pandas/io/formats/info.py | 4 + pandas/tests/io/formats/test_info.py | 1047 +++++++++++++------------- 2 files changed, 545 insertions(+), 506 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 970bb8c535534..69e899d89eb93 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -267,6 +267,10 @@ def info(self) -> None: fmt.buffer_put_lines(self.buf, lines) +class SeriesInfo(BaseInfo): + pass + + class DataFrameInfo(BaseInfo): def _get_mem_usage(self, deep: bool) -> int: return self.data.memory_usage(index=True, deep=deep).sum() diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 7369e87accec2..76c6351d678e6 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -51,530 +51,565 @@ def datetime_frame(): return DataFrame(tm.getTimeSeriesData()) -def test_info_categorical_column(): - - # make sure it works - n = 2500 - 
df = DataFrame({"int64": np.random.randint(100, size=n)}) - df["category"] = Series( - np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n)) - ).astype("category") - df.isna() - buf = StringIO() - df.info(buf=buf) - - df2 = df[df["category"] == "d"] - buf = StringIO() - df2.info(buf=buf) - - s = Series( - np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n)) - ).astype("category") - s.isna() - buf = StringIO() - s.info(buf=buf) - - s2 = s[s == "d"] - buf = StringIO() - s2.info(buf=buf) - - -def test_info_frame(float_frame, datetime_frame): - io = StringIO() - float_frame.info(buf=io) - datetime_frame.info(buf=io) - - frame = DataFrame(np.random.randn(5, 3)) - - frame.info() - frame.info(verbose=False) - - -@pytest.mark.parametrize("verbose", [True, False]) -def test_info_series(verbose): - index = MultiIndex( - levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=["first", "second"], - ) - s = Series(range(len(index)), index=index, name="sth") - buf = StringIO() - s.info(verbose=verbose, buf=buf) - expected = """ -MultiIndex: 10 entries, ('foo', 'one') to ('qux', 'three') -""" - if verbose: - expected += """Series name: sth -Non-Null Count Dtype --------------- ----- -10 non-null int64 -""" - expected += f"""dtypes: int64(1) -memory usage: {s.memory_usage()}.0+ bytes -""" - result = buf.getvalue() - assert result == expected - - -def test_info_verbose(): - buf = StringIO() - size = 1001 - start = 5 - frame = DataFrame(np.random.randn(3, size)) - frame.info(verbose=True, buf=buf) - - res = buf.getvalue() - header = " # Column Dtype \n--- ------ ----- " - assert header in res - - frame.info(verbose=True, buf=buf) - buf.seek(0) - lines = buf.readlines() - assert len(lines) > 0 - - for i, line in enumerate(lines): - if i >= start and i < start + size: - line_nr = f" {i - start} " - assert line.startswith(line_nr) - - -@pytest.mark.parametrize( - "size, header_exp, separator_exp, first_line_exp, last_line_exp", - [ - ( - 4, - " # Column Non-Null Count Dtype ", - "--- ------ -------------- ----- ", - " 0 0 3 non-null float64", - " 3 3 3 non-null float64", - ), - ( - 11, - " # Column Non-Null Count Dtype ", - "--- ------ -------------- ----- ", - " 0 0 3 non-null float64", - " 10 10 3 non-null float64", - ), - ( - 101, - " # Column Non-Null Count Dtype ", - "--- ------ -------------- ----- ", - " 0 0 3 non-null float64", - " 100 100 3 non-null float64", - ), - ( - 1001, - " # Column Non-Null Count Dtype ", - "--- ------ -------------- ----- ", - " 0 0 3 non-null float64", - " 1000 1000 3 non-null float64", - ), - ( - 10001, - " # Column Non-Null Count Dtype ", - "--- ------ -------------- ----- ", - " 0 0 3 non-null float64", - " 10000 10000 3 non-null float64", - ), - ], -) -def test_info_verbose_with_counts_spacing( - size, header_exp, separator_exp, first_line_exp, last_line_exp -): - """Test header column, spacer, first line and last line in verbose mode.""" - frame = DataFrame(np.random.randn(3, size)) - buf = StringIO() - frame.info(verbose=True, null_counts=True, buf=buf) - all_lines = buf.getvalue().splitlines() - # Here table would contain only header, separator and table lines - # dframe repr, index summary, memory usage and dtypes are excluded - table = all_lines[3:-2] - header, separator, first_line, *rest, last_line = table - assert header == header_exp - assert separator == separator_exp - assert first_line == first_line_exp - assert last_line == last_line_exp - - 
-def test_info_memory(): - # https://github.com/pandas-dev/pandas/issues/21056 - df = DataFrame({"a": Series([1, 2], dtype="i8")}) - buf = StringIO() - df.info(buf=buf) - result = buf.getvalue() - bytes = float(df.memory_usage().sum()) - expected = textwrap.dedent( - f"""\ - - RangeIndex: 2 entries, 0 to 1 - Data columns (total 1 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 a 2 non-null int64 - dtypes: int64(1) - memory usage: {bytes} bytes - """ - ) - assert result == expected - - s = Series([1, 2], dtype="i8") - buf = StringIO() - s.info(buf=buf) - result = buf.getvalue() - bytes = float(s.memory_usage()) - expected = textwrap.dedent( - f"""\ - - RangeIndex: 2 entries, 0 to 1 - Series name: None - Non-Null Count Dtype - -------------- ----- - 2 non-null int64 - dtypes: int64(1) - memory usage: {bytes} bytes - """ - ) - assert result == expected - - -def test_info_wide(): - io = StringIO() - df = DataFrame(np.random.randn(5, 101)) - df.info(buf=io) - - io = StringIO() - df.info(buf=io, max_cols=101) - rs = io.getvalue() - assert len(rs.splitlines()) > 100 - xp = rs - - set_option("display.max_info_columns", 101) - io = StringIO() - df.info(buf=io) - assert rs == xp - reset_option("display.max_info_columns") - - s = Series(np.random.randn(101)) - msg = "Argument `max_cols` can only be passed in DataFrame.info, not Series.info" - with pytest.raises(ValueError, match=msg): - s.info(max_cols=1) - - -def test_info_duplicate_columns(): - io = StringIO() - - # it works! - frame = DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"]) - frame.info(buf=io) - - -def test_info_duplicate_columns_shows_correct_dtypes(): - # GH11761 - io = StringIO() - - frame = DataFrame([[1, 2.0]], columns=["a", "a"]) - frame.info(buf=io) - io.seek(0) - lines = io.readlines() - assert " 0 a 1 non-null int64 \n" == lines[5] - assert " 1 a 1 non-null float64\n" == lines[6] - - -def test_info_shows_column_dtypes(): - dtypes = [ - "int64", - "float64", - "datetime64[ns]", - "timedelta64[ns]", - "complex128", - "object", - "bool", - ] - data = {} - n = 10 - for i, dtype in enumerate(dtypes): - data[i] = np.random.randint(2, size=n).astype(dtype) - df = DataFrame(data) - buf = StringIO() - df.info(buf=buf) - res = buf.getvalue() - header = ( - " # Column Non-Null Count Dtype \n" - "--- ------ -------------- ----- " - ) - assert header in res - for i, dtype in enumerate(dtypes): - name = f" {i:d} {i:d} {n:d} non-null {dtype}" - assert name in res - - for dtype in dtypes: - s = Series(np.random.randint(2, size=n).astype(dtype)) +class TestDataFrameInfo: + def test_info_categorical_column_just_works(self): + n = 2500 + df = DataFrame({"int64": np.random.randint(100, size=n)}) + df["category"] = Series( + np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n)) + ).astype("category") + df.isna() buf = StringIO() - s.info(buf=buf) - res = buf.getvalue() - name = f"{n:d} non-null {dtype}" - assert name in res - - -def test_info_max_cols(): - df = DataFrame(np.random.randn(10, 5)) - for len_, verbose in [(5, None), (5, False), (12, True)]: - # For verbose always ^ setting ^ summarize ^ full output - with option_context("max_info_columns", 4): - buf = StringIO() - df.info(buf=buf, verbose=verbose) - res = buf.getvalue() - assert len(res.strip().split("\n")) == len_ - - for len_, verbose in [(12, None), (5, False), (12, True)]: - - # max_cols not exceeded - with option_context("max_info_columns", 5): - buf = StringIO() - df.info(buf=buf, verbose=verbose) - res = 
buf.getvalue() - assert len(res.strip().split("\n")) == len_ + df.info(buf=buf) - for len_, max_cols in [(12, 5), (5, 4)]: - # setting truncates - with option_context("max_info_columns", 4): - buf = StringIO() - df.info(buf=buf, max_cols=max_cols) - res = buf.getvalue() - assert len(res.strip().split("\n")) == len_ - - # setting wouldn't truncate - with option_context("max_info_columns", 5): - buf = StringIO() - df.info(buf=buf, max_cols=max_cols) - res = buf.getvalue() - assert len(res.strip().split("\n")) == len_ - - -def test_info_memory_usage(): - # Ensure memory usage is displayed, when asserted, on the last line - dtypes = [ - "int64", - "float64", - "datetime64[ns]", - "timedelta64[ns]", - "complex128", - "object", - "bool", - ] - data = {} - n = 10 - for i, dtype in enumerate(dtypes): - data[i] = np.random.randint(2, size=n).astype(dtype) - df = DataFrame(data) - buf = StringIO() - - # display memory usage case - df.info(buf=buf, memory_usage=True) - res = buf.getvalue().splitlines() - assert "memory usage: " in res[-1] - - # do not display memory usage case - df.info(buf=buf, memory_usage=False) - res = buf.getvalue().splitlines() - assert "memory usage: " not in res[-1] - - df.info(buf=buf, memory_usage=True) - res = buf.getvalue().splitlines() - - # memory usage is a lower bound, so print it as XYZ+ MB - assert re.match(r"memory usage: [^+]+\+", res[-1]) - - df.iloc[:, :5].info(buf=buf, memory_usage=True) - res = buf.getvalue().splitlines() - - # excluded column with object dtype, so estimate is accurate - assert not re.match(r"memory usage: [^+]+\+", res[-1]) - - # Test a DataFrame with duplicate columns - dtypes = ["int64", "int64", "int64", "float64"] - data = {} - n = 100 - for i, dtype in enumerate(dtypes): - data[i] = np.random.randint(2, size=n).astype(dtype) - df = DataFrame(data) - df.columns = dtypes - - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) - df_with_object_index.info(buf=buf, memory_usage=True) - res = buf.getvalue().splitlines() - assert re.match(r"memory usage: [^+]+\+", res[-1]) - - df_with_object_index.info(buf=buf, memory_usage="deep") - res = buf.getvalue().splitlines() - assert re.match(r"memory usage: [^+]+$", res[-1]) - - # Ensure df size is as expected - # (cols * rows * bytes) + index size - df_size = df.memory_usage().sum() - exp_size = len(dtypes) * n * 8 + df.index.nbytes - assert df_size == exp_size - - # Ensure number of cols in memory_usage is the same as df - size_df = np.size(df.columns.values) + 1 # index=True; default - assert size_df == np.size(df.memory_usage()) - - # assert deep works only on object - assert df.memory_usage().sum() == df.memory_usage(deep=True).sum() - - # test for validity - DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True) - DataFrame(1, index=["a"], columns=["A"]).index.nbytes - df = DataFrame( - data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] - ) - df.index.nbytes - df.memory_usage(index=True) - df.index.values.nbytes - - mem = df.memory_usage(deep=True).sum() - assert mem > 0 - - -@pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") -def test_info_memory_usage_deep_not_pypy(): - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) - assert ( - df_with_object_index.memory_usage(index=True, deep=True).sum() - > df_with_object_index.memory_usage(index=True).sum() - ) - - df_object = DataFrame({"a": ["a"]}) - assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum() - - s_with_object_index = Series({"a": [1]}, 
index=["foo"]) - assert s_with_object_index.memory_usage( - index=True, deep=True - ) > s_with_object_index.memory_usage(index=True) + df2 = df[df["category"] == "d"] + buf = StringIO() + df2.info(buf=buf) - s_object = Series({"a": ["a"]}) - assert s_object.memory_usage(deep=True) > s_object.memory_usage() + def test_info_frame_float_frame_just_works(self, float_frame): + io = StringIO() + float_frame.info(buf=io) + def test_info_datetime_just_works(self, datetime_frame): + io = StringIO() + datetime_frame.info(buf=io) -@pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result") -def test_info_memory_usage_deep_pypy(): - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) - assert ( - df_with_object_index.memory_usage(index=True, deep=True).sum() - == df_with_object_index.memory_usage(index=True).sum() - ) + def test_info_verbose_just_works(self): + frame = DataFrame(np.random.randn(5, 3)) + frame.info() - df_object = DataFrame({"a": ["a"]}) - assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum() + def test_info_non_verbose_just_works(self): + frame = DataFrame(np.random.randn(5, 3)) + frame.info(verbose=False) - s_with_object_index = Series({"a": [1]}, index=["foo"]) - assert s_with_object_index.memory_usage( - index=True, deep=True - ) == s_with_object_index.memory_usage(index=True) - - s_object = Series({"a": ["a"]}) - assert s_object.memory_usage(deep=True) == s_object.memory_usage() + def test_info_small_frame_default_verbose(self): + frame = DataFrame(np.random.randn(5, 3)) + frame.info() == frame.info(verbose=True) + def test_info_verbose_check_header_separator_body(self): + buf = StringIO() + size = 1001 + start = 5 + frame = DataFrame(np.random.randn(3, size)) + frame.info(verbose=True, buf=buf) -@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design") -def test_usage_via_getsizeof(): - df = DataFrame( - data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] + res = buf.getvalue() + header = " # Column Dtype \n--- ------ ----- " + assert header in res + + frame.info(verbose=True, buf=buf) + buf.seek(0) + lines = buf.readlines() + assert len(lines) > 0 + + for i, line in enumerate(lines): + if i >= start and i < start + size: + line_nr = f" {i - start} " + assert line.startswith(line_nr) + + @pytest.mark.parametrize( + "size, header_exp, separator_exp, first_line_exp, last_line_exp", + [ + ( + 4, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 3 3 3 non-null float64", + ), + ( + 11, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 10 10 3 non-null float64", + ), + ( + 101, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 100 100 3 non-null float64", + ), + ( + 1001, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 1000 1000 3 non-null float64", + ), + ( + 10001, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 10000 10000 3 non-null float64", + ), + ], ) - mem = df.memory_usage(deep=True).sum() - # sys.getsizeof will call the .memory_usage with - # deep=True, and add on some GC overhead - diff = mem - sys.getsizeof(df) - assert abs(diff) < 100 - - -def test_info_memory_usage_qualified(): - - buf = StringIO() - df = DataFrame(1, columns=list("ab"), index=[1, 2, 3]) - df.info(buf=buf) - assert "+" not in 
buf.getvalue() - - buf = StringIO() - df = DataFrame(1, columns=list("ab"), index=list("ABC")) - df.info(buf=buf) - assert "+" in buf.getvalue() + def test_info_verbose_with_counts_spacing( + self, size, header_exp, separator_exp, first_line_exp, last_line_exp + ): + """Test header column, spacer, first line and last line in verbose mode.""" + frame = DataFrame(np.random.randn(3, size)) + buf = StringIO() + frame.info(verbose=True, null_counts=True, buf=buf) + all_lines = buf.getvalue().splitlines() + # Here table would contain only header, separator and table lines + # dframe repr, index summary, memory usage and dtypes are excluded + table = all_lines[3:-2] + header, separator, first_line, *rest, last_line = table + assert header == header_exp + assert separator == separator_exp + assert first_line == first_line_exp + assert last_line == last_line_exp + + def test_info_memory(self): + # https://github.com/pandas-dev/pandas/issues/21056 + df = DataFrame({"a": Series([1, 2], dtype="i8")}) + buf = StringIO() + df.info(buf=buf) + result = buf.getvalue() + bytes = float(df.memory_usage().sum()) + expected = textwrap.dedent( + f"""\ + + RangeIndex: 2 entries, 0 to 1 + Data columns (total 1 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 a 2 non-null int64 + dtypes: int64(1) + memory usage: {bytes} bytes + """ + ) + assert result == expected + + def test_info_wide(self): + io = StringIO() + df = DataFrame(np.random.randn(5, 101)) + df.info(buf=io) + + io = StringIO() + df.info(buf=io, max_cols=101) + rs = io.getvalue() + assert len(rs.splitlines()) > 100 + xp = rs + + set_option("display.max_info_columns", 101) + io = StringIO() + df.info(buf=io) + assert rs == xp + reset_option("display.max_info_columns") + + def test_info_duplicate_columns_works(self): + io = StringIO() + # it works! 
+ frame = DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"]) + frame.info(buf=io) + + def test_info_duplicate_columns_shows_correct_dtypes(self): + # GH11761 + io = StringIO() + frame = DataFrame([[1, 2.0]], columns=["a", "a"]) + frame.info(buf=io) + lines = io.getvalue().splitlines(True) + assert " 0 a 1 non-null int64 \n" == lines[5] + assert " 1 a 1 non-null float64\n" == lines[6] + + def test_info_shows_column_dtypes(self): + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] + data = {} + n = 10 + for i, dtype in enumerate(dtypes): + data[i] = np.random.randint(2, size=n).astype(dtype) + df = DataFrame(data) + buf = StringIO() + df.info(buf=buf) + res = buf.getvalue() + header = ( + " # Column Non-Null Count Dtype \n" + "--- ------ -------------- ----- " + ) + assert header in res + for i, dtype in enumerate(dtypes): + name = f" {i:d} {i:d} {n:d} non-null {dtype}" + assert name in res + + def test_info_max_cols(self): + df = DataFrame(np.random.randn(10, 5)) + for len_, verbose in [(5, None), (5, False), (12, True)]: + # For verbose always ^ setting ^ summarize ^ full output + with option_context("max_info_columns", 4): + buf = StringIO() + df.info(buf=buf, verbose=verbose) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + for len_, verbose in [(12, None), (5, False), (12, True)]: + # max_cols not exceeded + with option_context("max_info_columns", 5): + buf = StringIO() + df.info(buf=buf, verbose=verbose) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + for len_, max_cols in [(12, 5), (5, 4)]: + # setting truncates + with option_context("max_info_columns", 4): + buf = StringIO() + df.info(buf=buf, max_cols=max_cols) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + # setting wouldn't truncate + with option_context("max_info_columns", 5): + buf = StringIO() + df.info(buf=buf, max_cols=max_cols) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + def test_info_memory_usage(self): + # Ensure memory usage is displayed, when asserted, on the last line + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] + data = {} + n = 10 + for i, dtype in enumerate(dtypes): + data[i] = np.random.randint(2, size=n).astype(dtype) + df = DataFrame(data) + buf = StringIO() - buf = StringIO() - df = DataFrame( - 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]) - ) - df.info(buf=buf) - assert "+" not in buf.getvalue() + # display memory usage case + df.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + assert "memory usage: " in res[-1] + + # do not display memory usage case + df.info(buf=buf, memory_usage=False) + res = buf.getvalue().splitlines() + assert "memory usage: " not in res[-1] + + df.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + + # memory usage is a lower bound, so print it as XYZ+ MB + assert re.match(r"memory usage: [^+]+\+", res[-1]) + + df.iloc[:, :5].info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + + # excluded column with object dtype, so estimate is accurate + assert not re.match(r"memory usage: [^+]+\+", res[-1]) + + # Test a DataFrame with duplicate columns + dtypes = ["int64", "int64", "int64", "float64"] + data = {} + n = 100 + for i, dtype in enumerate(dtypes): + data[i] = np.random.randint(2, size=n).astype(dtype) + df = DataFrame(data) + df.columns = 
dtypes + + df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + assert re.match(r"memory usage: [^+]+\+", res[-1]) + + df_with_object_index.info(buf=buf, memory_usage="deep") + res = buf.getvalue().splitlines() + assert re.match(r"memory usage: [^+]+$", res[-1]) + + # Ensure df size is as expected + # (cols * rows * bytes) + index size + df_size = df.memory_usage().sum() + exp_size = len(dtypes) * n * 8 + df.index.nbytes + assert df_size == exp_size + + # Ensure number of cols in memory_usage is the same as df + size_df = np.size(df.columns.values) + 1 # index=True; default + assert size_df == np.size(df.memory_usage()) + + # assert deep works only on object + assert df.memory_usage().sum() == df.memory_usage(deep=True).sum() + + # test for validity + DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True) + DataFrame(1, index=["a"], columns=["A"]).index.nbytes + df = DataFrame( + data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] + ) + df.index.nbytes + df.memory_usage(index=True) + df.index.values.nbytes + + mem = df.memory_usage(deep=True).sum() + assert mem > 0 + + @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") + def test_info_memory_usage_deep_not_pypy(self): + df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + assert ( + df_with_object_index.memory_usage(index=True, deep=True).sum() + > df_with_object_index.memory_usage(index=True).sum() + ) + + df_object = DataFrame({"a": ["a"]}) + assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum() + + @pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result") + def test_info_memory_usage_deep_pypy(self): + df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + assert ( + df_with_object_index.memory_usage(index=True, deep=True).sum() + == df_with_object_index.memory_usage(index=True).sum() + ) + + df_object = DataFrame({"a": ["a"]}) + assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum() + + @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design") + def test_usage_via_getsizeof(self): + df = DataFrame( + data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] + ) + mem = df.memory_usage(deep=True).sum() + # sys.getsizeof will call the .memory_usage with + # deep=True, and add on some GC overhead + diff = mem - sys.getsizeof(df) + assert abs(diff) < 100 + + def test_info_memory_usage_qualified(self): + buf = StringIO() + df = DataFrame(1, columns=list("ab"), index=[1, 2, 3]) + df.info(buf=buf) + assert "+" not in buf.getvalue() - buf = StringIO() - df = DataFrame( - 1, columns=list("ab"), index=MultiIndex.from_product([range(3), ["foo", "bar"]]) - ) - df.info(buf=buf) - assert "+" in buf.getvalue() + buf = StringIO() + df = DataFrame(1, columns=list("ab"), index=list("ABC")) + df.info(buf=buf) + assert "+" in buf.getvalue() - buf = StringIO() - s = Series(1, index=[1, 2, 3]) - s.info(buf=buf) - assert "+" not in buf.getvalue() + buf = StringIO() + df = DataFrame( + 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]) + ) + df.info(buf=buf) + assert "+" not in buf.getvalue() - buf = StringIO() - s = Series(1, index=list("ABC")) - s.info(buf=buf) - assert "+" in buf.getvalue() + buf = StringIO() + df = DataFrame( + 1, + columns=list("ab"), + index=MultiIndex.from_product([range(3), ["foo", "bar"]]), + ) + df.info(buf=buf) + assert "+" in 
buf.getvalue() + + def test_info_memory_usage_bug_on_multiindex(self): + # GH 14308 + # memory usage introspection should not materialize .values + + def memory_usage(f): + return f.memory_usage(deep=True).sum() + + N = 100 + M = len(uppercase) + index = MultiIndex.from_product( + [list(uppercase), date_range("20160101", periods=N)], + names=["id", "date"], + ) + df = DataFrame({"value": np.random.randn(N * M)}, index=index) + + unstacked = df.unstack("id") + assert df.values.nbytes == unstacked.values.nbytes + assert memory_usage(df) > memory_usage(unstacked) + + # high upper bound + assert memory_usage(unstacked) - memory_usage(df) < 2000 + + def test_info_categorical(self): + # GH14298 + idx = CategoricalIndex(["a", "b"]) + df = DataFrame(np.zeros((2, 2)), index=idx, columns=idx) - buf = StringIO() - s = Series(1, index=MultiIndex.from_product([range(3), range(3)]),) - s.info(buf=buf) - assert "+" not in buf.getvalue() + buf = StringIO() + df.info(buf=buf) - buf = StringIO() - s = Series(1, index=MultiIndex.from_product([range(3), ["foo", "bar"]]),) - s.info(buf=buf) - assert "+" in buf.getvalue() +class TestSeriesInfo: + def test_info_categorical_column_just_works(self): + n = 2500 + data = np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n)) + s = Series(data).astype("category") + s.isna() + buf = StringIO() + s.info(buf=buf) -def test_info_memory_usage_bug_on_multiindex(): - # GH 14308 - # memory usage introspection should not materialize .values + s2 = s[s == "d"] + buf = StringIO() + s2.info(buf=buf) - def memory_usage(f): - return f.memory_usage(deep=True).sum() + def test_info_categorical(self): + # GH14298 + idx = CategoricalIndex(["a", "b"]) + s = Series(np.zeros((2)), index=idx) + buf = StringIO() + s.info(buf=buf) - N = 100 - M = len(uppercase) - index = MultiIndex.from_product( - [list(uppercase), date_range("20160101", periods=N)], names=["id", "date"] + @pytest.mark.parametrize("verbose", [True, False]) + def test_info_series(self, verbose): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + s = Series(range(len(index)), index=index, name="sth") + buf = StringIO() + s.info(verbose=verbose, buf=buf) + result = buf.getvalue() + + expected = textwrap.dedent( + """\ + + MultiIndex: 10 entries, ('foo', 'one') to ('qux', 'three') + """ + ) + if verbose: + expected += textwrap.dedent( + """\ + Series name: sth + Non-Null Count Dtype + -------------- ----- + 10 non-null int64 + """ + ) + expected += textwrap.dedent( + f"""\ + dtypes: int64(1) + memory usage: {s.memory_usage()}.0+ bytes + """ + ) + assert result == expected + + def test_info_memory(self): + s = Series([1, 2], dtype="i8") + buf = StringIO() + s.info(buf=buf) + result = buf.getvalue() + memory_bytes = float(s.memory_usage()) + expected = textwrap.dedent( + f"""\ + + RangeIndex: 2 entries, 0 to 1 + Series name: None + Non-Null Count Dtype + -------------- ----- + 2 non-null int64 + dtypes: int64(1) + memory usage: {memory_bytes} bytes + """ + ) + assert result == expected + + def test_info_wide(self): + s = Series(np.random.randn(101)) + msg = ( + "Argument `max_cols` can only be passed in DataFrame.info, " + "not Series.info" + ) + with pytest.raises(ValueError, match=msg): + s.info(max_cols=1) + + def test_info_shows_dtypes(self): + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] + n = 10 
+ for dtype in dtypes: + s = Series(np.random.randint(2, size=n).astype(dtype)) + buf = StringIO() + s.info(buf=buf) + res = buf.getvalue() + name = f"{n:d} non-null {dtype}" + assert name in res + + @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") + def test_info_memory_usage_deep_not_pypy(self): + s_with_object_index = Series({"a": [1]}, index=["foo"]) + assert s_with_object_index.memory_usage( + index=True, deep=True + ) > s_with_object_index.memory_usage(index=True) + + s_object = Series({"a": ["a"]}) + assert s_object.memory_usage(deep=True) > s_object.memory_usage() + + @pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result") + def test_info_memory_usage_deep_pypy(self): + s_with_object_index = Series({"a": [1]}, index=["foo"]) + assert s_with_object_index.memory_usage( + index=True, deep=True + ) == s_with_object_index.memory_usage(index=True) + + s_object = Series({"a": ["a"]}) + assert s_object.memory_usage(deep=True) == s_object.memory_usage() + + @pytest.mark.parametrize( + "series, plus", + [ + (Series(1, index=[1, 2, 3]), False), + (Series(1, index=list("ABC")), True), + (Series(1, index=MultiIndex.from_product([range(3), range(3)])), False), + ( + Series(1, index=MultiIndex.from_product([range(3), ["foo", "bar"]])), + True, + ), + ], ) - df = DataFrame({"value": np.random.randn(N * M)}, index=index) - - unstacked = df.unstack("id") - assert df.values.nbytes == unstacked.values.nbytes - assert memory_usage(df) > memory_usage(unstacked) - - # high upper bound - assert memory_usage(unstacked) - memory_usage(df) < 2000 - - s = Series(np.random.randn(N * M), index=index) - - unstacked = s.unstack("id") - assert s.values.nbytes == unstacked.values.nbytes - assert s.memory_usage(deep=True) > unstacked.memory_usage(deep=True).sum() - - # high upper bound - assert unstacked.memory_usage(deep=True).sum() - s.memory_usage(deep=True) < 2000 - - -def test_info_categorical(): - # GH14298 - idx = CategoricalIndex(["a", "b"]) - df = DataFrame(np.zeros((2, 2)), index=idx, columns=idx) - - buf = StringIO() - df.info(buf=buf) - - s = Series(np.zeros((2)), index=idx) - - buf = StringIO() - s.info(buf=buf) + def test_info_memory_usage_qualified(self, series, plus): + buf = StringIO() + series.info(buf=buf) + if plus: + assert "+" in buf.getvalue() + else: + assert "+" not in buf.getvalue() + + def test_info_memory_usage_bug_on_multiindex(self): + # GH 14308 + # memory usage introspection should not materialize .values + N = 100 + M = len(uppercase) + index = MultiIndex.from_product( + [list(uppercase), date_range("20160101", periods=N)], + names=["id", "date"], + ) + s = Series(np.random.randn(N * M), index=index) + + unstacked = s.unstack("id") + assert s.values.nbytes == unstacked.values.nbytes + assert s.memory_usage(deep=True) > unstacked.memory_usage(deep=True).sum() + + # high upper bound + diff = unstacked.memory_usage(deep=True).sum() - s.memory_usage(deep=True) + assert diff < 2000 From 4c390a87371c115578a9d9b88892b2de6a04ef49 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 8 Oct 2020 01:31:59 +0700 Subject: [PATCH 05/26] REF: param test on frame memory_usage_qualified --- pandas/tests/io/formats/test_info.py | 54 +++++++++++++++------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 76c6351d678e6..da0e990b0ec99 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -394,32 +394,36 @@ def 
test_usage_via_getsizeof(self): diff = mem - sys.getsizeof(df) assert abs(diff) < 100 - def test_info_memory_usage_qualified(self): - buf = StringIO() - df = DataFrame(1, columns=list("ab"), index=[1, 2, 3]) - df.info(buf=buf) - assert "+" not in buf.getvalue() - - buf = StringIO() - df = DataFrame(1, columns=list("ab"), index=list("ABC")) - df.info(buf=buf) - assert "+" in buf.getvalue() - - buf = StringIO() - df = DataFrame( - 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]) - ) - df.info(buf=buf) - assert "+" not in buf.getvalue() - + @pytest.mark.parametrize( + "frame, plus", + [ + (DataFrame(1, columns=list("ab"), index=[1, 2, 3]), False), + (DataFrame(1, columns=list("ab"), index=list("ABC")), True), + ( + DataFrame( + 1, + columns=list("ab"), + index=MultiIndex.from_product([range(3), range(3)]), + ), + False, + ), + ( + DataFrame( + 1, + columns=list("ab"), + index=MultiIndex.from_product([range(3), ["foo", "bar"]]), + ), + True, + ), + ], + ) + def test_info_memory_usage_qualified(self, frame, plus): buf = StringIO() - df = DataFrame( - 1, - columns=list("ab"), - index=MultiIndex.from_product([range(3), ["foo", "bar"]]), - ) - df.info(buf=buf) - assert "+" in buf.getvalue() + frame.info(buf=buf) + if plus: + assert "+" in buf.getvalue() + else: + assert "+" not in buf.getvalue() def test_info_memory_usage_bug_on_multiindex(self): # GH 14308 From 824d8d64b201cefef11f1cd10d86baebfb47e05d Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 21 Oct 2020 23:35:38 +0700 Subject: [PATCH 06/26] ENH: enable series info --- pandas/core/series.py | 10 +- pandas/io/formats/info.py | 601 ++++++++++++++++++++++++++------------ 2 files changed, 428 insertions(+), 183 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 0f2f59cc37b83..61956e10830d9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4635,7 +4635,7 @@ def replace( Series.memory_usage: Memory usage of Series.""" ), ) - @doc(SeriesInfo.info) + @doc(SeriesInfo.to_buffer) def info( self, verbose: Optional[bool] = None, @@ -4649,9 +4649,11 @@ def info( "Argument `max_cols` can only be passed " "in DataFrame.info, not Series.info" ) - return SeriesInfo( - self, verbose, buf, max_cols, memory_usage, null_counts - ).info() + return SeriesInfo(self, memory_usage).to_buffer( + buf=buf, + verbose=verbose, + show_counts=null_counts, + ) @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "Series": diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 5fb09a084b13b..36170e6c70ba2 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,10 +1,20 @@ from abc import ABC, abstractmethod import sys -from typing import IO, TYPE_CHECKING, Iterator, List, Mapping, Optional, Sequence, Union +from typing import ( + IO, + TYPE_CHECKING, + Iterable, + Iterator, + List, + Mapping, + Optional, + Sequence, + Union, +) from pandas._config import get_option -from pandas._typing import Dtype, FrameOrSeries +from pandas._typing import Dtype, FrameOrSeriesUnion from pandas.core.indexes.api import Index @@ -87,7 +97,7 @@ class BaseInfo(ABC): Parameters ---------- - data : FrameOrSeries + data : FrameOrSeriesUnion Either dataframe or series. memory_usage : bool or str, optional If "deep", introspect the data deeply by interrogating object dtypes @@ -95,18 +105,18 @@ class BaseInfo(ABC): values. 
""" - def __init__( - self, - data: FrameOrSeries, - memory_usage: Optional[Union[bool, str]] = None, - ): - self.data = data - self.memory_usage = _initialize_memory_usage(memory_usage) + data: FrameOrSeriesUnion + memory_usage: Union[bool, str] @property - @abstractmethod - def ids(self) -> Index: - """Column names or index names.""" + def dtypes(self) -> Iterable[Dtype]: + """Dtypes. + + Returns + ------- + dtypes : sequence + Dtype of each of the DataFrame's columns (or one series column). + """ @property @abstractmethod @@ -120,17 +130,6 @@ def non_null_counts(self) -> Sequence[int]: @property @abstractmethod - def dtypes(self) -> "Series": - """Dtypes. - - Returns - ------- - dtypes : Series - Dtype of each of the DataFrame's columns. - """ - return self.data.dtypes - - @property def memory_usage_bytes(self) -> int: """Memory usage in bytes. @@ -139,11 +138,6 @@ def memory_usage_bytes(self) -> int: memory_usage_bytes : int Object's total memory usage in bytes. """ - if self.memory_usage == "deep": - deep = True - else: - deep = False - return self.data.memory_usage(index=True, deep=deep).sum() @property def memory_usage_string(self) -> str: @@ -167,45 +161,61 @@ def size_qualifier(self) -> str: class DataFrameInfo(BaseInfo): - """Class storing dataframe-specific info.""" + """ + Class storing dataframe-specific info. + """ + + def __init__( + self, + data: "DataFrame", + memory_usage: Optional[Union[bool, str]] = None, + ): + self.data: "DataFrame" = data + self.memory_usage = _initialize_memory_usage(memory_usage) @property - def ids(self) -> Index: - """Column names. + def dtype_counts(self) -> Mapping[str, int]: + return _get_dataframe_dtype_counts(self.data) + + @property + def dtypes(self) -> Iterable[Dtype]: + """Dtypes. Returns ------- - ids : Index - DataFrame's column names. + dtypes + Dtype of each of the DataFrame's columns. """ - return self.data.columns + return self.data.dtypes @property - def dtypes(self) -> "Series": - """Dtypes. + def ids(self) -> Index: + """Column names. Returns ------- - dtypes : Series - Dtype of each of the DataFrame's columns. + ids : Index + DataFrame's column names. """ - return self.data.dtypes + return self.data.columns @property - def dtype_counts(self) -> Mapping[str, int]: - """Mapping dtype - number of counts.""" - # groupby dtype.name to collect e.g. Categorical columns - return self.dtypes.value_counts().groupby(lambda x: x.name).sum() + def col_count(self) -> int: + """Number of columns to be summarized.""" + return len(self.ids) @property def non_null_counts(self) -> Sequence[int]: - """Sequence of non-null counts for all columns.""" + """Sequence of non-null counts for all columns or column (if series).""" return self.data.count() @property - def col_count(self) -> int: - """Number of columns to be summarized.""" - return len(self.ids) + def memory_usage_bytes(self) -> int: + if self.memory_usage == "deep": + deep = True + else: + deep = False + return self.data.memory_usage(index=True, deep=deep).sum() def to_buffer( self, @@ -266,7 +276,7 @@ def to_buffer( -------- %(examples_sub)s """ - printer = InfoPrinter( + printer = DataFrameInfoPrinter( info=self, max_cols=max_cols, verbose=verbose, @@ -276,11 +286,83 @@ def to_buffer( class SeriesInfo(BaseInfo): - pass + """ + Class storing series-specific info. 
+ """ + + def __init__( + self, + data: "Series", + memory_usage: Optional[Union[bool, str]] = None, + ): + self.data: "Series" = data + self.memory_usage = _initialize_memory_usage(memory_usage) + + def to_buffer( + self, + *, + buf: Optional[IO[str]], + verbose: Optional[bool], + show_counts: Optional[bool], + ) -> None: + printer = SeriesInfoPrinter( + info=self, + verbose=verbose, + show_counts=show_counts, + ) + printer.to_buffer(buf) + + @property + def non_null_counts(self) -> Sequence[int]: + return [self.data.count()] + @property + def dtypes(self) -> Iterable[Dtype]: + return [self.data.dtypes] + + @property + def dtype_counts(self): + from pandas.core.frame import DataFrame -class InfoPrinter: - """Class for printing dataframe or series info. + return _get_dataframe_dtype_counts(DataFrame(self.data)) + + @property + def memory_usage_bytes(self) -> int: + """Memory usage in bytes. + + Returns + ------- + memory_usage_bytes : int + Object's total memory usage in bytes. + """ + if self.memory_usage == "deep": + deep = True + else: + deep = False + return self.data.memory_usage(index=True, deep=deep) + + +class InfoPrinterAbstract: + """ + Class for printing dataframe or series info. + """ + + def to_buffer(self, buf: Optional[IO[str]] = None) -> None: + """Save dataframe info into buffer.""" + table_builder = self._create_table_builder() + lines = table_builder.get_lines() + if buf is None: # pragma: no cover + buf = sys.stdout + fmt.buffer_put_lines(buf, lines) + + @abstractmethod + def _create_table_builder(self) -> "TableBuilderAbstract": + """Create instance of table builder.""" + + +class DataFrameInfoPrinter(InfoPrinterAbstract): + """ + Class for printing dataframe info. Parameters ---------- @@ -338,14 +420,6 @@ def _initialize_show_counts(self, show_counts: Optional[bool]) -> bool: else: return show_counts - def to_buffer(self, buf: Optional[IO[str]] = None) -> None: - """Save dataframe info into buffer.""" - table_builder = self._create_table_builder() - lines = table_builder.get_lines() - if buf is None: # pragma: no cover - buf = sys.stdout - fmt.buffer_put_lines(buf, lines) - def _create_table_builder(self) -> "DataFrameTableBuilder": """ Create instance of table builder based on verbosity and display settings. @@ -367,72 +441,79 @@ def _create_table_builder(self) -> "DataFrameTableBuilder": ) -class TableBuilderAbstract(ABC): - """Abstract builder for info table. +class SeriesInfoPrinter(InfoPrinterAbstract): + """Class for printing series info. Parameters ---------- - info : BaseInfo - Instance of DataFrameInfo or SeriesInfo. + info : SeriesInfo + Instance of SeriesInfo. + verbose : bool, optional + Whether to print the full summary. + show_counts : bool, optional + Whether to show the non-null counts. """ - _lines: List[str] - - def __init__(self, *, info): + def __init__( + self, + info: SeriesInfo, + verbose: Optional[bool] = None, + show_counts: Optional[bool] = None, + ): self.info = info + self.data = info.data + self.verbose = verbose + self.show_counts = self._initialize_show_counts(show_counts) - @abstractmethod - def get_lines(self) -> List[str]: - """Product in a form of list of lines (strings).""" + def _create_table_builder(self) -> "SeriesTableBuilder": + """ + Create instance of table builder based on verbosity. 
+ """ + if self.verbose or self.verbose is None: + return SeriesTableBuilderVerbose( + info=self.info, + with_counts=self.show_counts, + ) + else: + return SeriesTableBuilderNonVerbose(info=self.info) + def _initialize_show_counts(self, show_counts: Optional[bool]) -> bool: + if show_counts is None: + return True + else: + return show_counts -class DataFrameTableBuilder(TableBuilderAbstract): - """Abstract builder for dataframe info table.""" - def get_lines(self) -> List[str]: - self._lines = [] - if self.col_count == 0: - self._fill_empty_info() - else: - self._fill_non_empty_info() - return self._lines +class TableBuilderAbstract(ABC): + """ + Abstract builder for info table. + """ - def _fill_empty_info(self) -> None: - """Add lines to the info table, pertaining to empty dataframe.""" - self.add_object_type_line() - self.add_index_range_line() - self._lines.append(f"Empty {type(self.data).__name__}") + _lines: List[str] + info: BaseInfo - def _fill_non_empty_info(self) -> None: - """Add lines to the info table, pertaining to non-empty dataframe.""" - self.add_object_type_line() - self.add_index_range_line() - self.add_columns_summary_line() - self.add_header_line() - self.add_separator_line() - self.add_body_lines() - self.add_dtypes_line() - if self.display_memory_usage: - self.add_memory_usage_line() + @abstractmethod + def get_lines(self) -> List[str]: + """Product in a form of list of lines (strings).""" @property - def data(self) -> "DataFrame": - """DataFrame.""" + def data(self) -> FrameOrSeriesUnion: return self.info.data + @property + def dtypes(self) -> Iterable[Dtype]: + """Dtypes of each of the DataFrame's columns.""" + return self.info.dtypes + @property def dtype_counts(self) -> Mapping[str, int]: """Mapping dtype - number of counts.""" return self.info.dtype_counts - @property - def non_null_counts(self) -> Sequence[int]: - return self.info.non_null_counts - @property def display_memory_usage(self) -> bool: """Whether to display memory usage.""" - return self.info.memory_usage + return bool(self.info.memory_usage) @property def memory_usage_string(self) -> str: @@ -440,19 +521,8 @@ def memory_usage_string(self) -> str: return self.info.memory_usage_string @property - def ids(self) -> Index: - """Dataframe columns.""" - return self.info.ids - - @property - def dtypes(self) -> "Series": - """Dtypes of each of the DataFrame's columns.""" - return self.info.dtypes - - @property - def col_count(self) -> int: - """Number of dataframe columns to be summarized.""" - return self.info.col_count + def non_null_counts(self) -> Sequence[int]: + return self.info.non_null_counts def add_object_type_line(self) -> None: """Add line with string representation of dataframe to the table.""" @@ -462,22 +532,6 @@ def add_index_range_line(self) -> None: """Add line with range of indices to the table.""" self._lines.append(self.data.index._summary()) - @abstractmethod - def add_columns_summary_line(self) -> None: - """Add line with columns summary to the table.""" - - @abstractmethod - def add_header_line(self) -> None: - """Add header line to the table.""" - - @abstractmethod - def add_separator_line(self) -> None: - """Add separator line between header and body of the table.""" - - @abstractmethod - def add_body_lines(self) -> None: - """Add content of the table body.""" - def add_dtypes_line(self) -> None: """Add summary line with dtypes present in dataframe.""" collected_dtypes = [ @@ -485,62 +539,90 @@ def add_dtypes_line(self) -> None: ] self._lines.append(f"dtypes: {', 
'.join(collected_dtypes)}") + +class DataFrameTableBuilder(TableBuilderAbstract): + """ + Abstract builder for dataframe info table. + + Parameters + ---------- + info : DataFrameInfo. + Instance of DataFrameInfo. + """ + + def __init__(self, *, info: DataFrameInfo): + self.info: DataFrameInfo = info + + def get_lines(self) -> List[str]: + self._lines = [] + if self.col_count == 0: + self._fill_empty_info() + else: + self._fill_non_empty_info() + return self._lines + + def _fill_empty_info(self) -> None: + """Add lines to the info table, pertaining to empty dataframe.""" + self.add_object_type_line() + self.add_index_range_line() + self._lines.append(f"Empty {type(self.data).__name__}") + + @abstractmethod + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty dataframe.""" + + @property + def data(self) -> "DataFrame": + """DataFrame.""" + return self.info.data + + @property + def ids(self) -> Index: + """Dataframe columns.""" + return self.info.ids + + @property + def col_count(self) -> int: + """Number of dataframe columns to be summarized.""" + return self.info.col_count + def add_memory_usage_line(self) -> None: """Add line containing memory usage.""" self._lines.append(f"memory usage: {self.memory_usage_string}") class DataFrameTableBuilderNonVerbose(DataFrameTableBuilder): - """Info table builder for non-verbose output.""" + """ + Dataframe info table builder for non-verbose output. + """ + + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty dataframe.""" + self.add_object_type_line() + self.add_index_range_line() + self.add_columns_summary_line() + self.add_dtypes_line() + if self.display_memory_usage: + self.add_memory_usage_line() def add_columns_summary_line(self) -> None: self._lines.append(self.ids._summary(name="Columns")) - def add_header_line(self) -> None: - """No header in non-verbose output.""" - - def add_separator_line(self) -> None: - """No separator in non-verbose output.""" - - def add_body_lines(self) -> None: - """No body in non-verbose output.""" - -class DataFrameTableBuilderVerbose(DataFrameTableBuilder): - """Info table builder for verbose output.""" +class TableBuilderVerboseMixin(TableBuilderAbstract): + """ + Mixin for verbose info output. + """ - SPACING = " " * 2 - - def __init__( - self, - *, - info: DataFrameInfo, - with_counts: bool, - ): - super().__init__(info=info) - self.with_counts = with_counts - self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) - self.gross_column_widths: Sequence[int] = self._get_gross_column_widths() + SPACING: str = " " * 2 + strrows: Sequence[Sequence[str]] + gross_column_widths: Sequence[int] + with_counts: bool @property + @abstractmethod def headers(self) -> Sequence[str]: """Headers names of the columns in verbose table.""" - if self.with_counts: - return [" # ", "Column", "Non-Null Count", "Dtype"] - return [" # ", "Column", "Dtype"] - - def _gen_rows(self) -> Iterator[Sequence[str]]: - """Generator function yielding rows content. - - Each element represents a row comprising a sequence of strings. 
- """ - if self.with_counts: - return self._gen_rows_with_counts() - else: - return self._gen_rows_without_counts() - - def add_columns_summary_line(self) -> None: - self._lines.append(f"Data columns (total {self.col_count} columns):") @property def header_column_widths(self) -> Sequence[int]: @@ -560,6 +642,24 @@ def _get_body_column_widths(self) -> Sequence[int]: strcols: Sequence[Sequence[str]] = list(zip(*self.strrows)) return [max(len(x) for x in col) for col in strcols] + def _gen_rows(self) -> Iterator[Sequence[str]]: + """Generator function yielding rows content. + + Each element represents a row comprising a sequence of strings. + """ + if self.with_counts: + return self._gen_rows_with_counts() + else: + return self._gen_rows_without_counts() + + @abstractmethod + def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data with counts.""" + + @abstractmethod + def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data without counts.""" + def add_header_line(self) -> None: header_line = self.SPACING.join( [ @@ -590,6 +690,55 @@ def add_body_lines(self) -> None: ) self._lines.append(body_line) + def _gen_non_null_counts(self) -> Iterator[str]: + """Iterator with string representation of non-null counts.""" + for count in self.non_null_counts: + yield f"{count} non-null" + + def _gen_dtypes(self) -> Iterator[str]: + """Iterator with string representation of column dtypes.""" + for dtype in self.dtypes: + yield pprint_thing(dtype) + + +class DataFrameTableBuilderVerbose(DataFrameTableBuilder, TableBuilderVerboseMixin): + """ + Dataframe info table builder for verbose output. + """ + + def __init__( + self, + *, + info: DataFrameInfo, + with_counts: bool, + ): + self.info = info + self.with_counts = with_counts + self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) + self.gross_column_widths: Sequence[int] = self._get_gross_column_widths() + + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty dataframe.""" + self.add_object_type_line() + self.add_index_range_line() + self.add_columns_summary_line() + self.add_header_line() + self.add_separator_line() + self.add_body_lines() + self.add_dtypes_line() + if self.display_memory_usage: + self.add_memory_usage_line() + + @property + def headers(self) -> Sequence[str]: + """Headers names of the columns in verbose table.""" + if self.with_counts: + return [" # ", "Column", "Non-Null Count", "Dtype"] + return [" # ", "Column", "Dtype"] + + def add_columns_summary_line(self) -> None: + self._lines.append(f"Data columns (total {self.col_count} columns):") + def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: """Iterator with string representation of body data without counts.""" yield from zip( @@ -617,12 +766,106 @@ def _gen_columns(self) -> Iterator[str]: for col in self.ids: yield pprint_thing(col) - def _gen_dtypes(self) -> Iterator[str]: - """Iterator with string representation of column dtypes.""" - for dtype in self.dtypes: - yield pprint_thing(dtype) - def _gen_non_null_counts(self) -> Iterator[str]: - """Iterator with string representation of non-null counts.""" - for count in self.non_null_counts: - yield f"{count} non-null" +class SeriesTableBuilder(TableBuilderAbstract): + """ + Abstract builder for series info table. + + Parameters + ---------- + info : SeriesInfo. + Instance of SeriesInfo. 
+ """ + + def __init__(self, *, info: SeriesInfo): + self.info: SeriesInfo = info + + def get_lines(self) -> List[str]: + self._lines = [] + self._fill_non_empty_info() + return self._lines + + @property + def data(self) -> "Series": + """Series.""" + return self.info.data + + def add_memory_usage_line(self) -> None: + """Add line containing memory usage.""" + self._lines.append(f"memory usage: {self.memory_usage_string}") + + @abstractmethod + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty series.""" + + +class SeriesTableBuilderNonVerbose(SeriesTableBuilder): + """ + Series info table builder for non-verbose output. + """ + + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty series.""" + self.add_object_type_line() + self.add_index_range_line() + self.add_dtypes_line() + if self.display_memory_usage: + self.add_memory_usage_line() + + +class SeriesTableBuilderVerbose(SeriesTableBuilder, TableBuilderVerboseMixin): + """ + Series info table builder for verbose output. + """ + + def __init__( + self, + *, + info: SeriesInfo, + with_counts: bool, + ): + self.info = info + self.with_counts = with_counts + self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) + self.gross_column_widths: Sequence[int] = self._get_gross_column_widths() + + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty series.""" + self.add_object_type_line() + self.add_index_range_line() + self.add_series_name_line() + self.add_header_line() + self.add_separator_line() + self.add_body_lines() + self.add_dtypes_line() + if self.display_memory_usage: + self.add_memory_usage_line() + + def add_series_name_line(self): + self._lines.append(f"Series name: {self.data.name}") + + @property + def headers(self) -> Sequence[str]: + """Headers names of the columns in verbose table.""" + if self.with_counts: + return ["Non-Null Count", "Dtype"] + return ["Dtype"] + + def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data without counts.""" + yield from self._gen_dtypes() + + def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data with counts.""" + yield from zip( + self._gen_non_null_counts(), + self._gen_dtypes(), + ) + + +def _get_dataframe_dtype_counts(df: "DataFrame") -> Mapping[str, int]: + """ + Create mapping between datatypes and their number of occurences. + """ + # groupby dtype.name to collect e.g. 
Categorical columns + return df.dtypes.value_counts().groupby(lambda x: x.name).sum() From ce68e94e9dc1072a3d9ad45c4d9e71cd798a363a Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 22 Oct 2020 02:34:26 +0700 Subject: [PATCH 07/26] CLN: remove extra parens --- pandas/tests/io/formats/test_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index eb3f3c0019175..d43edb7049a6d 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -493,7 +493,7 @@ def test_info_categorical_column_just_works(self): def test_info_categorical(self): # GH14298 idx = CategoricalIndex(["a", "b"]) - s = Series(np.zeros((2)), index=idx) + s = Series(np.zeros(2), index=idx) buf = StringIO() s.info(buf=buf) From ede6dc47d4f9d521cc9778008c7425b129a7c576 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 23 Oct 2020 21:33:23 +0700 Subject: [PATCH 08/26] REF: split series-related tests --- pandas/tests/io/formats/test_info.py | 1015 ++++++++----------- pandas/tests/io/formats/test_series_info.py | 178 ++++ 2 files changed, 614 insertions(+), 579 deletions(-) create mode 100644 pandas/tests/io/formats/test_series_info.py diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index d43edb7049a6d..33a3c798072e3 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -51,590 +51,447 @@ def datetime_frame(): return DataFrame(tm.getTimeSeriesData()) -class TestDataFrameInfo: - def test_info_categorical_column_just_works(self): - n = 2500 - df = DataFrame({"int64": np.random.randint(100, size=n)}) - df["category"] = Series( - np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n)) - ).astype("category") - df.isna() - buf = StringIO() - df.info(buf=buf) - - df2 = df[df["category"] == "d"] - buf = StringIO() - df2.info(buf=buf) - - def test_info_frame_float_frame_just_works(self, float_frame): - io = StringIO() - float_frame.info(buf=io) - - def test_info_datetime_just_works(self, datetime_frame): - io = StringIO() - datetime_frame.info(buf=io) - - def test_info_verbose_just_works(self): - frame = DataFrame(np.random.randn(5, 3)) - frame.info() - - def test_info_non_verbose_just_works(self): - frame = DataFrame(np.random.randn(5, 3)) - frame.info(verbose=False) - - def test_info_small_frame_default_verbose(self): - frame = DataFrame(np.random.randn(5, 3)) - frame.info() == frame.info(verbose=True) - - def test_info_verbose_check_header_separator_body(self): - buf = StringIO() - size = 1001 - start = 5 - frame = DataFrame(np.random.randn(3, size)) - frame.info(verbose=True, buf=buf) - - res = buf.getvalue() - header = " # Column Dtype \n--- ------ ----- " - assert header in res - - frame.info(verbose=True, buf=buf) - buf.seek(0) - lines = buf.readlines() - assert len(lines) > 0 - - for i, line in enumerate(lines): - if i >= start and i < start + size: - line_nr = f" {i - start} " - assert line.startswith(line_nr) - - @pytest.mark.parametrize( - "size, header_exp, separator_exp, first_line_exp, last_line_exp", - [ - ( - 4, - " # Column Non-Null Count Dtype ", - "--- ------ -------------- ----- ", - " 0 0 3 non-null float64", - " 3 3 3 non-null float64", - ), - ( - 11, - " # Column Non-Null Count Dtype ", - "--- ------ -------------- ----- ", - " 0 0 3 non-null float64", - " 10 10 3 non-null float64", - ), - ( - 101, - " # Column Non-Null Count Dtype ", - "--- ------ -------------- ----- ", - " 0 0 3 
non-null float64", - " 100 100 3 non-null float64", - ), - ( - 1001, - " # Column Non-Null Count Dtype ", - "--- ------ -------------- ----- ", - " 0 0 3 non-null float64", - " 1000 1000 3 non-null float64", +def test_info_categorical_column_just_works(): + n = 2500 + df = DataFrame({"int64": np.random.randint(100, size=n)}) + df["category"] = Series( + np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n)) + ).astype("category") + df.isna() + buf = StringIO() + df.info(buf=buf) + + df2 = df[df["category"] == "d"] + buf = StringIO() + df2.info(buf=buf) + + +def test_info_frame_float_frame_just_works(float_frame): + io = StringIO() + float_frame.info(buf=io) + + +def test_info_datetime_just_works(datetime_frame): + io = StringIO() + datetime_frame.info(buf=io) + + +def test_info_verbose_just_works(): + frame = DataFrame(np.random.randn(5, 3)) + frame.info() + + +def test_info_non_verbose_just_works(): + frame = DataFrame(np.random.randn(5, 3)) + frame.info(verbose=False) + + +def test_info_small_frame_default_verbose(): + frame = DataFrame(np.random.randn(5, 3)) + frame.info() == frame.info(verbose=True) + + +def test_info_verbose_check_header_separator_body(): + buf = StringIO() + size = 1001 + start = 5 + frame = DataFrame(np.random.randn(3, size)) + frame.info(verbose=True, buf=buf) + + res = buf.getvalue() + header = " # Column Dtype \n--- ------ ----- " + assert header in res + + frame.info(verbose=True, buf=buf) + buf.seek(0) + lines = buf.readlines() + assert len(lines) > 0 + + for i, line in enumerate(lines): + if i >= start and i < start + size: + line_nr = f" {i - start} " + assert line.startswith(line_nr) + + +@pytest.mark.parametrize( + "size, header_exp, separator_exp, first_line_exp, last_line_exp", + [ + ( + 4, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 3 3 3 non-null float64", + ), + ( + 11, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 10 10 3 non-null float64", + ), + ( + 101, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 100 100 3 non-null float64", + ), + ( + 1001, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 1000 1000 3 non-null float64", + ), + ( + 10001, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 10000 10000 3 non-null float64", + ), + ], +) +def test_info_verbose_with_counts_spacing( + size, header_exp, separator_exp, first_line_exp, last_line_exp +): + """Test header column, spacer, first line and last line in verbose mode.""" + frame = DataFrame(np.random.randn(3, size)) + buf = StringIO() + frame.info(verbose=True, null_counts=True, buf=buf) + all_lines = buf.getvalue().splitlines() + # Here table would contain only header, separator and table lines + # dframe repr, index summary, memory usage and dtypes are excluded + table = all_lines[3:-2] + header, separator, first_line, *rest, last_line = table + assert header == header_exp + assert separator == separator_exp + assert first_line == first_line_exp + assert last_line == last_line_exp + + +def test_info_memory(): + # https://github.com/pandas-dev/pandas/issues/21056 + df = DataFrame({"a": Series([1, 2], dtype="i8")}) + buf = StringIO() + df.info(buf=buf) + result = buf.getvalue() + bytes = float(df.memory_usage().sum()) + expected = textwrap.dedent( + f"""\ + + RangeIndex: 2 entries, 
0 to 1 + Data columns (total 1 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 a 2 non-null int64 + dtypes: int64(1) + memory usage: {bytes} bytes + """ + ) + assert result == expected + + +def test_info_wide(): + io = StringIO() + df = DataFrame(np.random.randn(5, 101)) + df.info(buf=io) + + io = StringIO() + df.info(buf=io, max_cols=101) + rs = io.getvalue() + assert len(rs.splitlines()) > 100 + xp = rs + + set_option("display.max_info_columns", 101) + io = StringIO() + df.info(buf=io) + assert rs == xp + reset_option("display.max_info_columns") + + +def test_info_duplicate_columns_works(): + io = StringIO() + # it works! + frame = DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"]) + frame.info(buf=io) + + +def test_info_duplicate_columns_shows_correct_dtypes(): + # GH11761 + io = StringIO() + frame = DataFrame([[1, 2.0]], columns=["a", "a"]) + frame.info(buf=io) + lines = io.getvalue().splitlines(True) + assert " 0 a 1 non-null int64 \n" == lines[5] + assert " 1 a 1 non-null float64\n" == lines[6] + + +def test_info_shows_column_dtypes(): + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] + data = {} + n = 10 + for i, dtype in enumerate(dtypes): + data[i] = np.random.randint(2, size=n).astype(dtype) + df = DataFrame(data) + buf = StringIO() + df.info(buf=buf) + res = buf.getvalue() + header = ( + " # Column Non-Null Count Dtype \n" + "--- ------ -------------- ----- " + ) + assert header in res + for i, dtype in enumerate(dtypes): + name = f" {i:d} {i:d} {n:d} non-null {dtype}" + assert name in res + + +def test_info_max_cols(): + df = DataFrame(np.random.randn(10, 5)) + for len_, verbose in [(5, None), (5, False), (12, True)]: + # For verbose always ^ setting ^ summarize ^ full output + with option_context("max_info_columns", 4): + buf = StringIO() + df.info(buf=buf, verbose=verbose) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + for len_, verbose in [(12, None), (5, False), (12, True)]: + # max_cols not exceeded + with option_context("max_info_columns", 5): + buf = StringIO() + df.info(buf=buf, verbose=verbose) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + for len_, max_cols in [(12, 5), (5, 4)]: + # setting truncates + with option_context("max_info_columns", 4): + buf = StringIO() + df.info(buf=buf, max_cols=max_cols) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + # setting wouldn't truncate + with option_context("max_info_columns", 5): + buf = StringIO() + df.info(buf=buf, max_cols=max_cols) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + +def test_info_memory_usage(): + # Ensure memory usage is displayed, when asserted, on the last line + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] + data = {} + n = 10 + for i, dtype in enumerate(dtypes): + data[i] = np.random.randint(2, size=n).astype(dtype) + df = DataFrame(data) + buf = StringIO() + + # display memory usage case + df.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + assert "memory usage: " in res[-1] + + # do not display memory usage case + df.info(buf=buf, memory_usage=False) + res = buf.getvalue().splitlines() + assert "memory usage: " not in res[-1] + + df.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + + # memory usage is a lower bound, so print it as XYZ+ MB + assert re.match(r"memory usage: 
[^+]+\+", res[-1]) + + df.iloc[:, :5].info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + + # excluded column with object dtype, so estimate is accurate + assert not re.match(r"memory usage: [^+]+\+", res[-1]) + + # Test a DataFrame with duplicate columns + dtypes = ["int64", "int64", "int64", "float64"] + data = {} + n = 100 + for i, dtype in enumerate(dtypes): + data[i] = np.random.randint(2, size=n).astype(dtype) + df = DataFrame(data) + df.columns = dtypes + + df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + assert re.match(r"memory usage: [^+]+\+", res[-1]) + + df_with_object_index.info(buf=buf, memory_usage="deep") + res = buf.getvalue().splitlines() + assert re.match(r"memory usage: [^+]+$", res[-1]) + + # Ensure df size is as expected + # (cols * rows * bytes) + index size + df_size = df.memory_usage().sum() + exp_size = len(dtypes) * n * 8 + df.index.nbytes + assert df_size == exp_size + + # Ensure number of cols in memory_usage is the same as df + size_df = np.size(df.columns.values) + 1 # index=True; default + assert size_df == np.size(df.memory_usage()) + + # assert deep works only on object + assert df.memory_usage().sum() == df.memory_usage(deep=True).sum() + + # test for validity + DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True) + DataFrame(1, index=["a"], columns=["A"]).index.nbytes + df = DataFrame( + data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] + ) + df.index.nbytes + df.memory_usage(index=True) + df.index.values.nbytes + + mem = df.memory_usage(deep=True).sum() + assert mem > 0 + + +@pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") +def test_info_memory_usage_deep_not_pypy(): + df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + assert ( + df_with_object_index.memory_usage(index=True, deep=True).sum() + > df_with_object_index.memory_usage(index=True).sum() + ) + + df_object = DataFrame({"a": ["a"]}) + assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum() + + +@pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result") +def test_info_memory_usage_deep_pypy(): + df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + assert ( + df_with_object_index.memory_usage(index=True, deep=True).sum() + == df_with_object_index.memory_usage(index=True).sum() + ) + + df_object = DataFrame({"a": ["a"]}) + assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum() + + +@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design") +def test_usage_via_getsizeof(): + df = DataFrame( + data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] + ) + mem = df.memory_usage(deep=True).sum() + # sys.getsizeof will call the .memory_usage with + # deep=True, and add on some GC overhead + diff = mem - sys.getsizeof(df) + assert abs(diff) < 100 + + +@pytest.mark.parametrize( + "frame, plus", + [ + (DataFrame(1, columns=list("ab"), index=[1, 2, 3]), False), + (DataFrame(1, columns=list("ab"), index=list("ABC")), True), + ( + DataFrame( + 1, + columns=list("ab"), + index=MultiIndex.from_product([range(3), range(3)]), ), - ( - 10001, - " # Column Non-Null Count Dtype ", - "--- ------ -------------- ----- ", - " 0 0 3 non-null float64", - " 10000 10000 3 non-null float64", + False, + ), + ( + DataFrame( + 1, + columns=list("ab"), + index=MultiIndex.from_product([range(3), ["foo", "bar"]]), ), - ], + 
True, + ), + ], +) +def test_info_memory_usage_qualified(frame, plus): + buf = StringIO() + frame.info(buf=buf) + if plus: + assert "+" in buf.getvalue() + else: + assert "+" not in buf.getvalue() + + +def test_info_memory_usage_bug_on_multiindex(): + # GH 14308 + # memory usage introspection should not materialize .values + + def memory_usage(f): + return f.memory_usage(deep=True).sum() + + N = 100 + M = len(uppercase) + index = MultiIndex.from_product( + [list(uppercase), date_range("20160101", periods=N)], + names=["id", "date"], ) - def test_info_verbose_with_counts_spacing( - self, size, header_exp, separator_exp, first_line_exp, last_line_exp - ): - """Test header column, spacer, first line and last line in verbose mode.""" - frame = DataFrame(np.random.randn(3, size)) - buf = StringIO() - frame.info(verbose=True, null_counts=True, buf=buf) - all_lines = buf.getvalue().splitlines() - # Here table would contain only header, separator and table lines - # dframe repr, index summary, memory usage and dtypes are excluded - table = all_lines[3:-2] - header, separator, first_line, *rest, last_line = table - assert header == header_exp - assert separator == separator_exp - assert first_line == first_line_exp - assert last_line == last_line_exp - - def test_info_memory(self): - # https://github.com/pandas-dev/pandas/issues/21056 - df = DataFrame({"a": Series([1, 2], dtype="i8")}) - buf = StringIO() - df.info(buf=buf) - result = buf.getvalue() - bytes = float(df.memory_usage().sum()) - expected = textwrap.dedent( - f"""\ + df = DataFrame({"value": np.random.randn(N * M)}, index=index) + + unstacked = df.unstack("id") + assert df.values.nbytes == unstacked.values.nbytes + assert memory_usage(df) > memory_usage(unstacked) + + # high upper bound + assert memory_usage(unstacked) - memory_usage(df) < 2000 + + +def test_info_categorical(): + # GH14298 + idx = CategoricalIndex(["a", "b"]) + df = DataFrame(np.zeros((2, 2)), index=idx, columns=idx) + + buf = StringIO() + df.info(buf=buf) + + +def test_info_int_columns(): + # GH#37245 + df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"]) + buf = StringIO() + df.info(null_counts=True, buf=buf) + result = buf.getvalue() + expected = textwrap.dedent( + """\ - RangeIndex: 2 entries, 0 to 1 - Data columns (total 1 columns): + Index: 2 entries, A to B + Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- - 0 a 2 non-null int64 - dtypes: int64(1) - memory usage: {bytes} bytes - """ - ) - assert result == expected - - def test_info_wide(self): - io = StringIO() - df = DataFrame(np.random.randn(5, 101)) - df.info(buf=io) - - io = StringIO() - df.info(buf=io, max_cols=101) - rs = io.getvalue() - assert len(rs.splitlines()) > 100 - xp = rs - - set_option("display.max_info_columns", 101) - io = StringIO() - df.info(buf=io) - assert rs == xp - reset_option("display.max_info_columns") - - def test_info_duplicate_columns_works(self): - io = StringIO() - # it works! 
- frame = DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"]) - frame.info(buf=io) - - def test_info_duplicate_columns_shows_correct_dtypes(self): - # GH11761 - io = StringIO() - frame = DataFrame([[1, 2.0]], columns=["a", "a"]) - frame.info(buf=io) - lines = io.getvalue().splitlines(True) - assert " 0 a 1 non-null int64 \n" == lines[5] - assert " 1 a 1 non-null float64\n" == lines[6] - - def test_info_shows_column_dtypes(self): - dtypes = [ - "int64", - "float64", - "datetime64[ns]", - "timedelta64[ns]", - "complex128", - "object", - "bool", - ] - data = {} - n = 10 - for i, dtype in enumerate(dtypes): - data[i] = np.random.randint(2, size=n).astype(dtype) - df = DataFrame(data) - buf = StringIO() - df.info(buf=buf) - res = buf.getvalue() - header = ( - " # Column Non-Null Count Dtype \n" - "--- ------ -------------- ----- " - ) - assert header in res - for i, dtype in enumerate(dtypes): - name = f" {i:d} {i:d} {n:d} non-null {dtype}" - assert name in res - - def test_info_max_cols(self): - df = DataFrame(np.random.randn(10, 5)) - for len_, verbose in [(5, None), (5, False), (12, True)]: - # For verbose always ^ setting ^ summarize ^ full output - with option_context("max_info_columns", 4): - buf = StringIO() - df.info(buf=buf, verbose=verbose) - res = buf.getvalue() - assert len(res.strip().split("\n")) == len_ - - for len_, verbose in [(12, None), (5, False), (12, True)]: - # max_cols not exceeded - with option_context("max_info_columns", 5): - buf = StringIO() - df.info(buf=buf, verbose=verbose) - res = buf.getvalue() - assert len(res.strip().split("\n")) == len_ - - for len_, max_cols in [(12, 5), (5, 4)]: - # setting truncates - with option_context("max_info_columns", 4): - buf = StringIO() - df.info(buf=buf, max_cols=max_cols) - res = buf.getvalue() - assert len(res.strip().split("\n")) == len_ - - # setting wouldn't truncate - with option_context("max_info_columns", 5): - buf = StringIO() - df.info(buf=buf, max_cols=max_cols) - res = buf.getvalue() - assert len(res.strip().split("\n")) == len_ - - def test_info_memory_usage(self): - # Ensure memory usage is displayed, when asserted, on the last line - dtypes = [ - "int64", - "float64", - "datetime64[ns]", - "timedelta64[ns]", - "complex128", - "object", - "bool", - ] - data = {} - n = 10 - for i, dtype in enumerate(dtypes): - data[i] = np.random.randint(2, size=n).astype(dtype) - df = DataFrame(data) - buf = StringIO() - - # display memory usage case - df.info(buf=buf, memory_usage=True) - res = buf.getvalue().splitlines() - assert "memory usage: " in res[-1] - - # do not display memory usage case - df.info(buf=buf, memory_usage=False) - res = buf.getvalue().splitlines() - assert "memory usage: " not in res[-1] - - df.info(buf=buf, memory_usage=True) - res = buf.getvalue().splitlines() - - # memory usage is a lower bound, so print it as XYZ+ MB - assert re.match(r"memory usage: [^+]+\+", res[-1]) - - df.iloc[:, :5].info(buf=buf, memory_usage=True) - res = buf.getvalue().splitlines() - - # excluded column with object dtype, so estimate is accurate - assert not re.match(r"memory usage: [^+]+\+", res[-1]) - - # Test a DataFrame with duplicate columns - dtypes = ["int64", "int64", "int64", "float64"] - data = {} - n = 100 - for i, dtype in enumerate(dtypes): - data[i] = np.random.randint(2, size=n).astype(dtype) - df = DataFrame(data) - df.columns = dtypes - - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) - df_with_object_index.info(buf=buf, memory_usage=True) - res = buf.getvalue().splitlines() - assert 
re.match(r"memory usage: [^+]+\+", res[-1]) - - df_with_object_index.info(buf=buf, memory_usage="deep") - res = buf.getvalue().splitlines() - assert re.match(r"memory usage: [^+]+$", res[-1]) - - # Ensure df size is as expected - # (cols * rows * bytes) + index size - df_size = df.memory_usage().sum() - exp_size = len(dtypes) * n * 8 + df.index.nbytes - assert df_size == exp_size - - # Ensure number of cols in memory_usage is the same as df - size_df = np.size(df.columns.values) + 1 # index=True; default - assert size_df == np.size(df.memory_usage()) - - # assert deep works only on object - assert df.memory_usage().sum() == df.memory_usage(deep=True).sum() - - # test for validity - DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True) - DataFrame(1, index=["a"], columns=["A"]).index.nbytes - df = DataFrame( - data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] - ) - df.index.nbytes - df.memory_usage(index=True) - df.index.values.nbytes - - mem = df.memory_usage(deep=True).sum() - assert mem > 0 - - @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") - def test_info_memory_usage_deep_not_pypy(self): - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) - assert ( - df_with_object_index.memory_usage(index=True, deep=True).sum() - > df_with_object_index.memory_usage(index=True).sum() - ) - - df_object = DataFrame({"a": ["a"]}) - assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum() - - @pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result") - def test_info_memory_usage_deep_pypy(self): - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) - assert ( - df_with_object_index.memory_usage(index=True, deep=True).sum() - == df_with_object_index.memory_usage(index=True).sum() - ) - - df_object = DataFrame({"a": ["a"]}) - assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum() - - @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design") - def test_usage_via_getsizeof(self): - df = DataFrame( - data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] - ) - mem = df.memory_usage(deep=True).sum() - # sys.getsizeof will call the .memory_usage with - # deep=True, and add on some GC overhead - diff = mem - sys.getsizeof(df) - assert abs(diff) < 100 - - @pytest.mark.parametrize( - "frame, plus", - [ - (DataFrame(1, columns=list("ab"), index=[1, 2, 3]), False), - (DataFrame(1, columns=list("ab"), index=list("ABC")), True), - ( - DataFrame( - 1, - columns=list("ab"), - index=MultiIndex.from_product([range(3), range(3)]), - ), - False, - ), - ( - DataFrame( - 1, - columns=list("ab"), - index=MultiIndex.from_product([range(3), ["foo", "bar"]]), - ), - True, - ), - ], - ) - def test_info_memory_usage_qualified(self, frame, plus): - buf = StringIO() - frame.info(buf=buf) - if plus: - assert "+" in buf.getvalue() - else: - assert "+" not in buf.getvalue() - - def test_info_memory_usage_bug_on_multiindex(self): - # GH 14308 - # memory usage introspection should not materialize .values - - def memory_usage(f): - return f.memory_usage(deep=True).sum() - - N = 100 - M = len(uppercase) - index = MultiIndex.from_product( - [list(uppercase), date_range("20160101", periods=N)], - names=["id", "date"], - ) - df = DataFrame({"value": np.random.randn(N * M)}, index=index) - - unstacked = df.unstack("id") - assert df.values.nbytes == unstacked.values.nbytes - assert memory_usage(df) > memory_usage(unstacked) - - # high upper bound - assert 
memory_usage(unstacked) - memory_usage(df) < 2000 - - def test_info_categorical(self): - # GH14298 - idx = CategoricalIndex(["a", "b"]) - df = DataFrame(np.zeros((2, 2)), index=idx, columns=idx) - - buf = StringIO() - df.info(buf=buf) - - def test_info_int_columns(self): - # GH#37245 - df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"]) - buf = StringIO() - df.info(null_counts=True, buf=buf) - result = buf.getvalue() - expected = textwrap.dedent( - """\ - - Index: 2 entries, A to B - Data columns (total 2 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 1 2 non-null int64 - 1 2 2 non-null int64 - dtypes: int64(2) - memory usage: 48.0+ bytes - """ - ) - assert result == expected - - -class TestSeriesInfo: - def test_info_categorical_column_just_works(self): - n = 2500 - data = np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n)) - s = Series(data).astype("category") - s.isna() - buf = StringIO() - s.info(buf=buf) - - s2 = s[s == "d"] - buf = StringIO() - s2.info(buf=buf) - - def test_info_categorical(self): - # GH14298 - idx = CategoricalIndex(["a", "b"]) - s = Series(np.zeros(2), index=idx) - buf = StringIO() - s.info(buf=buf) - - @pytest.mark.parametrize("verbose", [True, False]) - def test_info_series(self, verbose): - index = MultiIndex( - levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=["first", "second"], - ) - s = Series(range(len(index)), index=index, name="sth") - buf = StringIO() - s.info(verbose=verbose, buf=buf) - result = buf.getvalue() - - expected = textwrap.dedent( - """\ - - MultiIndex: 10 entries, ('foo', 'one') to ('qux', 'three') - """ - ) - if verbose: - expected += textwrap.dedent( - """\ - Series name: sth - Non-Null Count Dtype - -------------- ----- - 10 non-null int64 - """ - ) - expected += textwrap.dedent( - f"""\ - dtypes: int64(1) - memory usage: {s.memory_usage()}.0+ bytes - """ - ) - assert result == expected - - def test_info_memory(self): - s = Series([1, 2], dtype="i8") - buf = StringIO() - s.info(buf=buf) - result = buf.getvalue() - memory_bytes = float(s.memory_usage()) - expected = textwrap.dedent( - f"""\ - - RangeIndex: 2 entries, 0 to 1 - Series name: None - Non-Null Count Dtype - -------------- ----- - 2 non-null int64 - dtypes: int64(1) - memory usage: {memory_bytes} bytes + 0 1 2 non-null int64 + 1 2 2 non-null int64 + dtypes: int64(2) + memory usage: 48.0+ bytes """ - ) - assert result == expected - - def test_info_wide(self): - s = Series(np.random.randn(101)) - msg = ( - "Argument `max_cols` can only be passed in DataFrame.info, " - "not Series.info" - ) - with pytest.raises(ValueError, match=msg): - s.info(max_cols=1) - - def test_info_shows_dtypes(self): - dtypes = [ - "int64", - "float64", - "datetime64[ns]", - "timedelta64[ns]", - "complex128", - "object", - "bool", - ] - n = 10 - for dtype in dtypes: - s = Series(np.random.randint(2, size=n).astype(dtype)) - buf = StringIO() - s.info(buf=buf) - res = buf.getvalue() - name = f"{n:d} non-null {dtype}" - assert name in res - - @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") - def test_info_memory_usage_deep_not_pypy(self): - s_with_object_index = Series({"a": [1]}, index=["foo"]) - assert s_with_object_index.memory_usage( - index=True, deep=True - ) > s_with_object_index.memory_usage(index=True) - - s_object = Series({"a": ["a"]}) - assert s_object.memory_usage(deep=True) > s_object.memory_usage() - - 
@pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result") - def test_info_memory_usage_deep_pypy(self): - s_with_object_index = Series({"a": [1]}, index=["foo"]) - assert s_with_object_index.memory_usage( - index=True, deep=True - ) == s_with_object_index.memory_usage(index=True) - - s_object = Series({"a": ["a"]}) - assert s_object.memory_usage(deep=True) == s_object.memory_usage() - - @pytest.mark.parametrize( - "series, plus", - [ - (Series(1, index=[1, 2, 3]), False), - (Series(1, index=list("ABC")), True), - (Series(1, index=MultiIndex.from_product([range(3), range(3)])), False), - ( - Series(1, index=MultiIndex.from_product([range(3), ["foo", "bar"]])), - True, - ), - ], ) - def test_info_memory_usage_qualified(self, series, plus): - buf = StringIO() - series.info(buf=buf) - if plus: - assert "+" in buf.getvalue() - else: - assert "+" not in buf.getvalue() - - def test_info_memory_usage_bug_on_multiindex(self): - # GH 14308 - # memory usage introspection should not materialize .values - N = 100 - M = len(uppercase) - index = MultiIndex.from_product( - [list(uppercase), date_range("20160101", periods=N)], - names=["id", "date"], - ) - s = Series(np.random.randn(N * M), index=index) - - unstacked = s.unstack("id") - assert s.values.nbytes == unstacked.values.nbytes - assert s.memory_usage(deep=True) > unstacked.memory_usage(deep=True).sum() - - # high upper bound - diff = unstacked.memory_usage(deep=True).sum() - s.memory_usage(deep=True) - assert diff < 2000 + assert result == expected diff --git a/pandas/tests/io/formats/test_series_info.py b/pandas/tests/io/formats/test_series_info.py new file mode 100644 index 0000000000000..2be31f550e75a --- /dev/null +++ b/pandas/tests/io/formats/test_series_info.py @@ -0,0 +1,178 @@ +from io import StringIO +from string import ascii_uppercase as uppercase +import textwrap + +import numpy as np +import pytest + +from pandas.compat import PYPY + +from pandas import CategoricalIndex, MultiIndex, Series, date_range + + +def test_info_categorical_column_just_works(): + n = 2500 + data = np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n)) + s = Series(data).astype("category") + s.isna() + buf = StringIO() + s.info(buf=buf) + + s2 = s[s == "d"] + buf = StringIO() + s2.info(buf=buf) + + +def test_info_categorical(): + # GH14298 + idx = CategoricalIndex(["a", "b"]) + s = Series(np.zeros(2), index=idx) + buf = StringIO() + s.info(buf=buf) + + +@pytest.mark.parametrize("verbose", [True, False]) +def test_info_series(verbose): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + s = Series(range(len(index)), index=index, name="sth") + buf = StringIO() + s.info(verbose=verbose, buf=buf) + result = buf.getvalue() + + expected = textwrap.dedent( + """\ + + MultiIndex: 10 entries, ('foo', 'one') to ('qux', 'three') + """ + ) + if verbose: + expected += textwrap.dedent( + """\ + Series name: sth + Non-Null Count Dtype + -------------- ----- + 10 non-null int64 + """ + ) + expected += textwrap.dedent( + f"""\ + dtypes: int64(1) + memory usage: {s.memory_usage()}.0+ bytes + """ + ) + assert result == expected + + +def test_info_memory(): + s = Series([1, 2], dtype="i8") + buf = StringIO() + s.info(buf=buf) + result = buf.getvalue() + memory_bytes = float(s.memory_usage()) + expected = textwrap.dedent( + f"""\ + + RangeIndex: 2 entries, 0 to 1 + Series name: None + Non-Null Count Dtype 
+ -------------- ----- + 2 non-null int64 + dtypes: int64(1) + memory usage: {memory_bytes} bytes + """ + ) + assert result == expected + + +def test_info_wide(): + s = Series(np.random.randn(101)) + msg = "Argument `max_cols` can only be passed in DataFrame.info, " "not Series.info" + with pytest.raises(ValueError, match=msg): + s.info(max_cols=1) + + +def test_info_shows_dtypes(): + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] + n = 10 + for dtype in dtypes: + s = Series(np.random.randint(2, size=n).astype(dtype)) + buf = StringIO() + s.info(buf=buf) + res = buf.getvalue() + name = f"{n:d} non-null {dtype}" + assert name in res + + +@pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") +def test_info_memory_usage_deep_not_pypy(): + s_with_object_index = Series({"a": [1]}, index=["foo"]) + assert s_with_object_index.memory_usage( + index=True, deep=True + ) > s_with_object_index.memory_usage(index=True) + + s_object = Series({"a": ["a"]}) + assert s_object.memory_usage(deep=True) > s_object.memory_usage() + + +@pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result") +def test_info_memory_usage_deep_pypy(): + s_with_object_index = Series({"a": [1]}, index=["foo"]) + assert s_with_object_index.memory_usage( + index=True, deep=True + ) == s_with_object_index.memory_usage(index=True) + + s_object = Series({"a": ["a"]}) + assert s_object.memory_usage(deep=True) == s_object.memory_usage() + + +@pytest.mark.parametrize( + "series, plus", + [ + (Series(1, index=[1, 2, 3]), False), + (Series(1, index=list("ABC")), True), + (Series(1, index=MultiIndex.from_product([range(3), range(3)])), False), + ( + Series(1, index=MultiIndex.from_product([range(3), ["foo", "bar"]])), + True, + ), + ], +) +def test_info_memory_usage_qualified(series, plus): + buf = StringIO() + series.info(buf=buf) + if plus: + assert "+" in buf.getvalue() + else: + assert "+" not in buf.getvalue() + + +def test_info_memory_usage_bug_on_multiindex(): + # GH 14308 + # memory usage introspection should not materialize .values + N = 100 + M = len(uppercase) + index = MultiIndex.from_product( + [list(uppercase), date_range("20160101", periods=N)], + names=["id", "date"], + ) + s = Series(np.random.randn(N * M), index=index) + + unstacked = s.unstack("id") + assert s.values.nbytes == unstacked.values.nbytes + assert s.memory_usage(deep=True) > unstacked.memory_usage(deep=True).sum() + + # high upper bound + diff = unstacked.memory_usage(deep=True).sum() - s.memory_usage(deep=True) + assert diff < 2000 From 789e03e6a944541f8c0163fa54b3d1ef84a20e8d Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 23 Oct 2020 21:46:03 +0700 Subject: [PATCH 09/26] DOC: add release note --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e4d97168692b3..27091bfd6d671 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -221,6 +221,7 @@ Other enhancements - :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welfords Method to avoid numerical issues (:issue:`37051`) - :meth:`DataFrame.plot` now recognizes ``xlabel`` and ``ylabel`` arguments for plots of type ``scatter`` and ``hexbin`` (:issue:`37001`) - :class:`DataFrame` now supports ``divmod`` operation (:issue:`37165`) +- :meth:`Series.info` has been added, for compatibility with :meth:`DataFrame.info` (:issue:`5167`) .. 
_whatsnew_120.api_breaking.python: From f41596dda1d1c6af1827d487a33424f9223bebde Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 23 Oct 2020 21:57:14 +0700 Subject: [PATCH 10/26] CLN: merge two lines --- pandas/tests/io/formats/test_series_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/formats/test_series_info.py b/pandas/tests/io/formats/test_series_info.py index 2be31f550e75a..86219a2b98fe4 100644 --- a/pandas/tests/io/formats/test_series_info.py +++ b/pandas/tests/io/formats/test_series_info.py @@ -90,7 +90,7 @@ def test_info_memory(): def test_info_wide(): s = Series(np.random.randn(101)) - msg = "Argument `max_cols` can only be passed in DataFrame.info, " "not Series.info" + msg = "Argument `max_cols` can only be passed in DataFrame.info, not Series.info" with pytest.raises(ValueError, match=msg): s.info(max_cols=1) From 40b71f85c05518d7bac5ef193f57582a625dd201 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 23 Oct 2020 22:57:22 +0700 Subject: [PATCH 11/26] DOC: unify series/frame docstrings, fix indent --- pandas/core/frame.py | 29 +++++++---- pandas/core/series.py | 19 ++++--- pandas/io/formats/info.py | 102 +++++++++++++++++++------------------- 3 files changed, 82 insertions(+), 68 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 285d4fc34cf98..86930d395d216 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -151,7 +151,7 @@ from pandas.io.common import get_filepath_or_buffer from pandas.io.formats import console, format as fmt -from pandas.io.formats.info import DataFrameInfo +from pandas.io.formats.info import BaseInfo, DataFrameInfo import pandas.plotting if TYPE_CHECKING: @@ -2506,16 +2506,25 @@ def to_html( @Substitution( klass="DataFrame", type_sub=" and columns", - max_cols_sub=( - """max_cols : int, optional + max_cols_sub=dedent( + """\ + max_cols : int, optional When to switch from the verbose to the truncated output. If the DataFrame has more than `max_cols` columns, the truncated output is used. By default, the setting in - ``pandas.options.display.max_info_columns`` is used. - """ + ``pandas.options.display.max_info_columns`` is used.""" ), - examples_sub=( - """ + null_counts_sub=dedent( + """\ + null_counts : bool, optional + Whether to show the non-null counts. By default, this is shown + only if the DataFrame is smaller than + ``pandas.options.display.max_info_rows`` and + ``pandas.options.display.max_info_columns``. A value of True always + shows the counts, and False never shows the counts.""" + ), + examples_sub=dedent( + """\ >>> int_values = [1, 2, 3, 4, 5] >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] @@ -2598,14 +2607,14 @@ def to_html( dtypes: object(3) memory usage: 165.9 MB""" ), - see_also_sub=( - """ + see_also_sub=dedent( + """\ DataFrame.describe: Generate descriptive statistics of DataFrame columns. 
DataFrame.memory_usage: Memory usage of DataFrame columns.""" ), ) - @doc(DataFrameInfo.to_buffer) + @doc(BaseInfo.to_buffer) def info( self, verbose: Optional[bool] = None, diff --git a/pandas/core/series.py b/pandas/core/series.py index 61956e10830d9..eb49e34204008 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -97,7 +97,7 @@ from pandas.core.tools.datetimes import to_datetime import pandas.io.formats.format as fmt -from pandas.io.formats.info import SeriesInfo +from pandas.io.formats.info import BaseInfo, SeriesInfo import pandas.plotting if TYPE_CHECKING: @@ -4569,8 +4569,13 @@ def replace( klass="Series", type_sub="", max_cols_sub="", - examples_sub=( - """ + null_counts_sub=dedent( + """\ + null_counts : bool, default True. + Whether to show the non-null counts.""" + ), + examples_sub=dedent( + """\ >>> int_values = [1, 2, 3, 4, 5] >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] >>> s = pd.Series(text_values, index=int_values) @@ -4629,20 +4634,20 @@ def replace( dtypes: object(1) memory usage: 55.3 MB""" ), - see_also_sub=( - """ + see_also_sub=dedent( + """\ Series.describe: Generate descriptive statistics of Series. Series.memory_usage: Memory usage of Series.""" ), ) - @doc(SeriesInfo.to_buffer) + @doc(BaseInfo.to_buffer) def info( self, verbose: Optional[bool] = None, buf: Optional[IO[str]] = None, max_cols: Optional[int] = None, memory_usage: Optional[Union[bool, str]] = None, - null_counts: Optional[bool] = None, + null_counts: bool = True, ) -> None: if max_cols is not None: raise ValueError( diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 36170e6c70ba2..2c6201071b0be 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -159,6 +159,55 @@ def size_qualifier(self) -> str: size_qualifier = "+" return size_qualifier + @abstractmethod + def to_buffer(): + """ + Print a concise summary of a %(klass)s. + + This method prints information about a %(klass)s including + the index dtype%(type_sub)s, non-null values and memory usage. + + Parameters + ---------- + data : %(klass)s + %(klass)s to print information about. + verbose : bool, optional + Whether to print the full summary. By default, the setting in + ``pandas.options.display.max_info_columns`` is followed. + buf : writable buffer, defaults to sys.stdout + Where to send the output. By default, the output is printed to + sys.stdout. Pass a writable buffer if you need to further process + the output. + %(max_cols_sub)s + memory_usage : bool, str, optional + Specifies whether total memory usage of the %(klass)s + elements (including the index) should be displayed. By default, + this follows the ``pandas.options.display.memory_usage`` setting. + + True always show memory usage. False never shows memory usage. + A value of 'deep' is equivalent to "True with deep introspection". + Memory usage is shown in human-readable units (base-2 + representation). Without deep introspection a memory estimation is + made based in column dtype and number of rows assuming values + consume the same memory amount for corresponding dtypes. With deep + memory introspection, a real memory usage calculation is performed + at the cost of computational resources. + %(null_counts_sub)s + + Returns + ------- + None + This method prints a summary of a %(klass)s and returns None. 
+ + See Also + -------- + %(see_also_sub)s + + Examples + -------- + %(examples_sub)s + """ + class DataFrameInfo(BaseInfo): """ @@ -225,57 +274,6 @@ def to_buffer( verbose: Optional[bool], show_counts: Optional[bool], ) -> None: - """ - Print a concise summary of a %(klass)s. - - This method prints information about a %(klass)s including - the index dtype%(type_sub)s, non-null values and memory usage. - - Parameters - ---------- - data : %(klass)s - %(klass)s to print information about. - verbose : bool, optional - Whether to print the full summary. By default, the setting in - ``pandas.options.display.max_info_columns`` is followed. - buf : writable buffer, defaults to sys.stdout - Where to send the output. By default, the output is printed to - sys.stdout. Pass a writable buffer if you need to further process - the output. - %(max_cols_sub)s - memory_usage : bool, str, optional - Specifies whether total memory usage of the %(klass)s - elements (including the index) should be displayed. By default, - this follows the ``pandas.options.display.memory_usage`` setting. - - True always show memory usage. False never shows memory usage. - A value of 'deep' is equivalent to "True with deep introspection". - Memory usage is shown in human-readable units (base-2 - representation). Without deep introspection a memory estimation is - made based in column dtype and number of rows assuming values - consume the same memory amount for corresponding dtypes. With deep - memory introspection, a real memory usage calculation is performed - at the cost of computational resources. - null_counts : bool, optional - Whether to show the non-null counts. By default, this is shown - only if the %(klass)s is smaller than - ``pandas.options.display.max_info_rows`` and - ``pandas.options.display.max_info_columns``. A value of True always - shows the counts, and False never shows the counts. - - Returns - ------- - None - This method prints a summary of a %(klass)s and returns None. 
- - See Also - -------- - %(see_also_sub)s - - Examples - -------- - %(examples_sub)s - """ printer = DataFrameInfoPrinter( info=self, max_cols=max_cols, @@ -305,6 +303,8 @@ def to_buffer( verbose: Optional[bool], show_counts: Optional[bool], ) -> None: + """ + """ printer = SeriesInfoPrinter( info=self, verbose=verbose, From 3e71336b361a65e05e3fe5f01445bc9366fefbea Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 23 Oct 2020 23:06:01 +0700 Subject: [PATCH 12/26] REF: to_buffer -> render, unify func signature --- pandas/core/frame.py | 4 ++-- pandas/core/series.py | 10 +++------- pandas/io/formats/info.py | 21 ++++++++++++++++----- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 86930d395d216..6087d378b474e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2614,7 +2614,7 @@ def to_html( DataFrame.memory_usage: Memory usage of DataFrame columns.""" ), ) - @doc(BaseInfo.to_buffer) + @doc(BaseInfo.render) def info( self, verbose: Optional[bool] = None, @@ -2627,7 +2627,7 @@ def info( data=self, memory_usage=memory_usage, ) - info.to_buffer( + info.render( buf=buf, max_cols=max_cols, verbose=verbose, diff --git a/pandas/core/series.py b/pandas/core/series.py index eb49e34204008..2ae659fa84135 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4640,7 +4640,7 @@ def replace( Series.memory_usage: Memory usage of Series.""" ), ) - @doc(BaseInfo.to_buffer) + @doc(BaseInfo.render) def info( self, verbose: Optional[bool] = None, @@ -4649,13 +4649,9 @@ def info( memory_usage: Optional[Union[bool, str]] = None, null_counts: bool = True, ) -> None: - if max_cols is not None: - raise ValueError( - "Argument `max_cols` can only be passed " - "in DataFrame.info, not Series.info" - ) - return SeriesInfo(self, memory_usage).to_buffer( + return SeriesInfo(self, memory_usage).render( buf=buf, + max_cols=max_cols, verbose=verbose, show_counts=null_counts, ) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 2c6201071b0be..4d185ab97953c 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -160,7 +160,14 @@ def size_qualifier(self) -> str: return size_qualifier @abstractmethod - def to_buffer(): + def render( + self, + *, + buf: Optional[IO[str]], + max_cols: Optional[int], + verbose: Optional[bool], + show_counts: Optional[bool], + ) -> None: """ Print a concise summary of a %(klass)s. 
@@ -266,7 +273,7 @@ def memory_usage_bytes(self) -> int: deep = False return self.data.memory_usage(index=True, deep=deep).sum() - def to_buffer( + def render( self, *, buf: Optional[IO[str]], @@ -296,15 +303,19 @@ def __init__( self.data: "Series" = data self.memory_usage = _initialize_memory_usage(memory_usage) - def to_buffer( + def render( self, *, buf: Optional[IO[str]], + max_cols: Optional[int], verbose: Optional[bool], show_counts: Optional[bool], ) -> None: - """ - """ + if max_cols is not None: + raise ValueError( + "Argument `max_cols` can only be passed " + "in DataFrame.info, not Series.info" + ) printer = SeriesInfoPrinter( info=self, verbose=verbose, From e9c522021d145a3bde1ea21c3b118f664b3a6a79 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 23 Oct 2020 23:15:18 +0700 Subject: [PATCH 13/26] DOC: add versionadded tag --- pandas/core/frame.py | 1 + pandas/core/series.py | 1 + pandas/io/formats/info.py | 2 ++ 3 files changed, 4 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 761899758a3a4..a3a6377bf16a5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2618,6 +2618,7 @@ def to_html( columns. DataFrame.memory_usage: Memory usage of DataFrame columns.""" ), + version_added_sub="", ) @doc(BaseInfo.render) def info( diff --git a/pandas/core/series.py b/pandas/core/series.py index 2ae659fa84135..5cfe6d0add4b0 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4639,6 +4639,7 @@ def replace( Series.describe: Generate descriptive statistics of Series. Series.memory_usage: Memory usage of Series.""" ), + version_added_sub=".. versionadded:: 1.2.0", ) @doc(BaseInfo.render) def info( diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 4d185ab97953c..43fe1c6c159c3 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -206,6 +206,8 @@ def render( None This method prints a summary of a %(klass)s and returns None. + %(version_added_sub)s + See Also -------- %(see_also_sub)s From e74cdce8e8d8fb38067d33891eb153f82c3edba9 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 7 Nov 2020 21:47:31 +0700 Subject: [PATCH 14/26] DOC: maybe fix empty line problem with df.info --- pandas/core/series.py | 2 +- pandas/io/formats/info.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 12685b3ce9091..7f4fcff19b4c0 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4625,7 +4625,7 @@ def replace( Series.describe: Generate descriptive statistics of Series. Series.memory_usage: Memory usage of Series.""" ), - version_added_sub=".. versionadded:: 1.2.0", + version_added_sub="\n.. versionadded:: 1.2.0\n", ) @doc(BaseInfo.render) def info( diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 43fe1c6c159c3..ea9ea2a62654a 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -173,6 +173,7 @@ def render( This method prints information about a %(klass)s including the index dtype%(type_sub)s, non-null values and memory usage. + %(version_added_sub)s\ Parameters ---------- @@ -206,8 +207,6 @@ def render( None This method prints a summary of a %(klass)s and returns None. 
- %(version_added_sub)s - See Also -------- %(see_also_sub)s From d41ecf118643bceae2e06862e59218d3b71bf32b Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 7 Nov 2020 21:58:12 +0700 Subject: [PATCH 15/26] DOC: remove trailing period in type --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 7f4fcff19b4c0..399f0bbe18c70 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4557,7 +4557,7 @@ def replace( max_cols_sub="", null_counts_sub=dedent( """\ - null_counts : bool, default True. + null_counts : bool, default True Whether to show the non-null counts.""" ), examples_sub=dedent( From 816803edff500bb770fd0422ad28a75a7c7f0ecc Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 4 Oct 2021 22:47:05 +0700 Subject: [PATCH 16/26] Fix styling --- pandas/io/formats/info.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index e5bc0c1707e67..d5f4ed280d18d 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -10,11 +10,8 @@ TYPE_CHECKING, Iterable, Iterator, - List, Mapping, - Optional, Sequence, - Union, ) from pandas._config import get_option @@ -114,7 +111,7 @@ class BaseInfo(ABC): """ data: DataFrame | Series - memory_usage: Union[bool, str] + memory_usage: bool | str @property @abstractmethod @@ -502,7 +499,7 @@ def _create_table_builder(self) -> SeriesTableBuilder: else: return SeriesTableBuilderNonVerbose(info=self.info) - def _initialize_show_counts(self, show_counts: Optional[bool]) -> bool: + def _initialize_show_counts(self, show_counts: bool | None) -> bool: if show_counts is None: return True else: @@ -806,13 +803,13 @@ class SeriesTableBuilder(TableBuilderAbstract): def __init__(self, *, info: SeriesInfo): self.info: SeriesInfo = info - def get_lines(self) -> List[str]: + def get_lines(self) -> list[str]: self._lines = [] self._fill_non_empty_info() return self._lines @property - def data(self) -> "Series": + def data(self) -> Series: """Series.""" return self.info.data From 1e2aaefaad8f237376848971e0d460bd0efcaa04 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Mon, 29 Nov 2021 22:38:10 +0700 Subject: [PATCH 17/26] DOC: move whatsnew info to v1.4.0 --- doc/source/whatsnew/v1.2.0.rst | 1 - doc/source/whatsnew/v1.4.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 7e607c0feecfe..3d3ec53948a01 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -303,7 +303,6 @@ Other enhancements - :class:`.DatetimeIndex` and :class:`Series` with ``datetime64`` or ``datetime64tz`` dtypes now support ``std`` (:issue:`37436`) - :class:`Window` now supports all Scipy window types in ``win_type`` with flexible keyword argument support (:issue:`34556`) - :meth:`testing.assert_index_equal` now has a ``check_order`` parameter that allows indexes to be checked in an order-insensitive manner (:issue:`37478`) -- :meth:`Series.info` has been added, for compatibility with :meth:`DataFrame.info` (:issue:`5167`) - :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`) - Add support for ``min_count`` keyword for :meth:`DataFrame.groupby` and :meth:`DataFrame.resample` for functions ``min``, ``max``, ``first`` and ``last`` (:issue:`37821`, :issue:`37768`) - Improve error reporting for :meth:`DataFrame.merge` when invalid merge column definitions were given 
From 688080b938d7a82e0d7d9c68759d33d987853367 Mon Sep 17 00:00:00 2001
From: Maxim Ivanov
Date: Mon, 29 Nov 2021 22:50:46 +0700
Subject: [PATCH 18/26] DOC: move docs on Series.info() to io/formats/info.py

---
 pandas/core/series.py     | 80 +---------------------------
 pandas/io/formats/info.py | 92 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 93 insertions(+), 79 deletions(-)

diff --git a/pandas/core/series.py b/pandas/core/series.py
index 23b6c20499054..0d31fc90425cf 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -140,8 +140,8 @@
 import pandas.io.formats.format as fmt
 from pandas.io.formats.info import (
-    BaseInfo,
     SeriesInfo,
+    series_sub_kwargs,
 )
 import pandas.plotting
@@ -4918,83 +4918,7 @@ def replace(
             method=method,
         )

-    @Substitution(
-        klass="Series",
-        type_sub="",
-        max_cols_sub="",
-        show_counts_sub=dedent(
-            """\
-            show_counts : bool, default True
-                Whether to show the non-null counts."""
-        ),
-        examples_sub=dedent(
-            """\
-            >>> int_values = [1, 2, 3, 4, 5]
-            >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
-            >>> s = pd.Series(text_values, index=int_values)
-            >>> s.info()
-
-            Int64Index: 5 entries, 1 to 5
-            Series name: None
-            Non-Null Count  Dtype
-            --------------  -----
-            5 non-null      object
-            dtypes: object(1)
-            memory usage: 80.0+ bytes
-
-            Prints a summary excluding information about its values:
-
-            >>> s.info(verbose=False)
-
-            Int64Index: 5 entries, 1 to 5
-            dtypes: object(1)
-            memory usage: 80.0+ bytes
-
-            Pipe output of Series.info to buffer instead of sys.stdout, get
-            buffer content and writes to a text file:
-
-            >>> import io
-            >>> buffer = io.StringIO()
-            >>> s.info(buf=buffer)
-            >>> s = buffer.getvalue()
-            >>> with open("df_info.txt", "w",
-            ...           encoding="utf-8") as f:  # doctest: +SKIP
-            ...     f.write(s)
-            260
-
-            The `memory_usage` parameter allows deep introspection mode, specially
-            useful for big Series and fine-tune memory optimization:
-
-            >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
-            >>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6))
-            >>> s.info()
-
-            RangeIndex: 1000000 entries, 0 to 999999
-            Series name: None
-            Non-Null Count    Dtype
-            --------------    -----
-            1000000 non-null  object
-            dtypes: object(1)
-            memory usage: 7.6+ MB
-
-            >>> s.info(memory_usage='deep')
-
-            RangeIndex: 1000000 entries, 0 to 999999
-            Series name: None
-            Non-Null Count    Dtype
-            --------------    -----
-            1000000 non-null  object
-            dtypes: object(1)
-            memory usage: 55.3 MB"""
-        ),
-        see_also_sub=dedent(
-            """\
-            Series.describe: Generate descriptive statistics of Series.
-            Series.memory_usage: Memory usage of Series."""
-        ),
-        version_added_sub="\n.. versionadded:: 1.2.0\n",
-    )
-    @doc(BaseInfo.render)
+    @doc(SeriesInfo.render, **series_sub_kwargs)
     def info(
         self,
         verbose: bool | None = None,
diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py
index 35c10aa86ca46..057d9b3cf5e63 100644
--- a/pandas/io/formats/info.py
+++ b/pandas/io/formats/info.py
@@ -163,6 +163,86 @@
 }


+series_examples_sub = dedent(
+    """\
+    >>> int_values = [1, 2, 3, 4, 5]
+    >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
+    >>> s = pd.Series(text_values, index=int_values)
+    >>> s.info()
+
+    Int64Index: 5 entries, 1 to 5
+    Series name: None
+    Non-Null Count  Dtype
+    --------------  -----
+    5 non-null      object
+    dtypes: object(1)
+    memory usage: 80.0+ bytes
+
+    Prints a summary excluding information about its values:
+
+    >>> s.info(verbose=False)
+
+    Int64Index: 5 entries, 1 to 5
+    dtypes: object(1)
+    memory usage: 80.0+ bytes
+
+    Pipe output of Series.info to buffer instead of sys.stdout, get
+    buffer content and writes to a text file:
+
+    >>> import io
+    >>> buffer = io.StringIO()
+    >>> s.info(buf=buffer)
+    >>> s = buffer.getvalue()
+    >>> with open("df_info.txt", "w",
+    ...           encoding="utf-8") as f:  # doctest: +SKIP
+    ...     f.write(s)
+    260
+
+    The `memory_usage` parameter allows deep introspection mode, specially
+    useful for big Series and fine-tune memory optimization:
+
+    >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
+    >>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6))
+    >>> s.info()
+
+    RangeIndex: 1000000 entries, 0 to 999999
+    Series name: None
+    Non-Null Count    Dtype
+    --------------    -----
+    1000000 non-null  object
+    dtypes: object(1)
+    memory usage: 7.6+ MB
+
+    >>> s.info(memory_usage='deep')
+
+    RangeIndex: 1000000 entries, 0 to 999999
+    Series name: None
+    Non-Null Count    Dtype
+    --------------    -----
+    1000000 non-null  object
+    dtypes: object(1)
+    memory usage: 55.3 MB"""
+)
+
+
+series_see_also_sub = dedent(
+    """\
+    Series.describe: Generate descriptive statistics of Series.
+    Series.memory_usage: Memory usage of Series."""
+)
+
+
+series_sub_kwargs = {
+    "klass": "Series",
+    "type_sub": "",
+    "max_cols_sub": "",
+    "show_counts_sub": show_counts_sub,
+    "examples_sub": series_examples_sub,
+    "see_also_sub": series_see_also_sub,
+    "version_added_sub": "\n.. versionadded:: 1.2.0\n",
+}
+
+
 INFO_DOCSTRING = dedent(
     """
     Print a concise summary of a {klass}.
@@ -462,10 +542,20 @@ def __init__(
         self.data: Series = data
         self.memory_usage = _initialize_memory_usage(memory_usage)

+    @doc(
+        INFO_DOCSTRING,
+        klass="Series",
+        type_sub="",
+        max_cols_sub="",
+        show_counts_sub=show_counts_sub,
+        examples_sub=series_examples_sub,
+        see_also_sub=series_see_also_sub,
+        version_added_sub="\n.. versionadded:: 1.2.0\n",
+    )
     def render(
         self,
         *,
-        buf: IO[str] | None = None,
+        buf: WriteBuffer[str] | None = None,
         max_cols: int | None = None,
         verbose: bool | None = None,
         show_counts: bool | None = None,
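Patch 18 is the pivotal refactor: the docstring now lives once in pandas/io/formats/info.py as a shared template plus per-class substitution dicts, and the ``info`` methods only apply them. The snippet below is a stripped-down stand-in for that pattern — ``doc_template``, ``INFO_TEMPLATE``, ``series_kwargs`` and ``frame_kwargs`` are illustrative names, not the pandas ``@doc`` machinery itself.

from textwrap import dedent


def doc_template(template, **kwargs):
    # Render the shared template once and attach it as the docstring.
    def decorator(func):
        func.__doc__ = template.format(**kwargs)
        return func

    return decorator


INFO_TEMPLATE = dedent(
    """\
    Print a concise summary of a {klass}.

    Examples
    --------
    {examples_sub}
    """
)

series_kwargs = {"klass": "Series", "examples_sub": ">>> s.info()"}
frame_kwargs = {"klass": "DataFrame", "examples_sub": ">>> df.info()"}


@doc_template(INFO_TEMPLATE, **series_kwargs)
def series_info():
    ...


@doc_template(INFO_TEMPLATE, **frame_kwargs)
def frame_info():
    ...


print(series_info.__doc__.splitlines()[0])  # Print a concise summary of a Series.
print(frame_info.__doc__.splitlines()[0])   # Print a concise summary of a DataFrame.

The payoff is that a wording fix made to the template shows up in both renderings, which is exactly what the remaining patches in this series rely on.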
From 4e87b1a0186b856f6da3138100f02d5e8ef283e7 Mon Sep 17 00:00:00 2001
From: Maxim Ivanov
Date: Mon, 29 Nov 2021 23:00:54 +0700
Subject: [PATCH 19/26] FIX: newline

---
 pandas/io/formats/info.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py
index 057d9b3cf5e63..bb1f2e31bc801 100644
--- a/pandas/io/formats/info.py
+++ b/pandas/io/formats/info.py
@@ -261,7 +261,7 @@
     buf : writable buffer, defaults to sys.stdout
         Where to send the output. By default, the output is printed to
         sys.stdout. Pass a writable buffer if you need to further process
-        the output.
+        the output.\
     {max_cols_sub}
     memory_usage : bool, str, optional
         Specifies whether total memory usage of the {klass}

From dc999fe0e86a5278e183d9f7dcd06626eff106da Mon Sep 17 00:00:00 2001
From: Maxim Ivanov
Date: Mon, 29 Nov 2021 23:01:12 +0700
Subject: [PATCH 20/26] FIX: change versionadded to 1.4.0

---
 pandas/io/formats/info.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py
index bb1f2e31bc801..6d2c8872f911b 100644
--- a/pandas/io/formats/info.py
+++ b/pandas/io/formats/info.py
@@ -239,7 +239,7 @@
     "show_counts_sub": show_counts_sub,
     "examples_sub": series_examples_sub,
     "see_also_sub": series_see_also_sub,
-    "version_added_sub": "\n.. versionadded:: 1.2.0\n",
+    "version_added_sub": "\n.. versionadded:: 1.4.0\n",
 }
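Patch 19 turns on a small Python string rule: inside a (non-raw) triple-quoted literal, a backslash at the end of a line is a line continuation, so the following newline is swallowed. That is what keeps an empty substitution from leaving a blank line behind. A self-contained comparison, using a toy template rather than the real INFO_DOCSTRING:

TEMPLATE_WITH_NEWLINE = """\
the output.
{max_cols_sub}
memory_usage : bool, str, optional
"""

TEMPLATE_WITH_CONTINUATION = """\
the output.\
{max_cols_sub}
memory_usage : bool, str, optional
"""

print(TEMPLATE_WITH_NEWLINE.format(max_cols_sub=""))
# the output.
#                                  <- stray blank line when the substitution is empty
# memory_usage : bool, str, optional

print(TEMPLATE_WITH_CONTINUATION.format(max_cols_sub=""))
# the output.
# memory_usage : bool, str, optional

print(TEMPLATE_WITH_CONTINUATION.format(max_cols_sub="\nmax_cols : int, optional"))
# the output.
# max_cols : int, optional
# memory_usage : bool, str, optional

The third call, where the non-empty value carries its own leading newline, is the combination the later patches in the series keep experimenting with.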
From 4bb4e408e594bc23dab61cc99a16aeef9fb2b464 Mon Sep 17 00:00:00 2001
From: Maxim Ivanov
Date: Mon, 29 Nov 2021 23:11:07 +0700
Subject: [PATCH 21/26] DOC: extract null_counts_sub for frames only

---
 pandas/io/formats/info.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py
index 6d2c8872f911b..f365180d61182 100644
--- a/pandas/io/formats/info.py
+++ b/pandas/io/formats/info.py
@@ -51,7 +51,11 @@
     only if the DataFrame is smaller than
     ``pandas.options.display.max_info_rows`` and
     ``pandas.options.display.max_info_columns``. A value of True always
-    shows the counts, and False never shows the counts.
+    shows the counts, and False never shows the counts."""
+)
+
+null_counts_sub = dedent(
+    """\
     null_counts : bool, optional
         .. deprecated:: 1.2.0
             Use show_counts instead."""
 )
@@ -157,6 +161,7 @@
     "type_sub": " and columns",
     "max_cols_sub": frame_max_cols_sub,
     "show_counts_sub": show_counts_sub,
+    "null_counts_sub": null_counts_sub,
     "examples_sub": frame_examples_sub,
     "see_also_sub": frame_see_also_sub,
     "version_added_sub": "",
@@ -237,6 +242,7 @@
     "type_sub": "",
     "max_cols_sub": "",
     "show_counts_sub": show_counts_sub,
+    "null_counts_sub": "",
     "examples_sub": series_examples_sub,
     "see_also_sub": series_see_also_sub,
     "version_added_sub": "\n.. versionadded:: 1.4.0\n",
 }
@@ -277,6 +283,7 @@
     memory introspection, a real memory usage calculation is performed
     at the cost of computational resources.
     {show_counts_sub}
+    {null_counts_sub}

     Returns
     -------
@@ -508,6 +515,7 @@ def memory_usage_bytes(self) -> int:
         type_sub=" and columns",
         max_cols_sub=frame_max_cols_sub,
         show_counts_sub=show_counts_sub,
+        null_counts_sub=null_counts_sub,
         examples_sub=frame_examples_sub,
         see_also_sub=frame_see_also_sub,
         version_added_sub="",
@@ -548,9 +556,10 @@ def __init__(
         type_sub="",
         max_cols_sub="",
         show_counts_sub=show_counts_sub,
+        null_counts_sub="",
         examples_sub=series_examples_sub,
         see_also_sub=series_see_also_sub,
-        version_added_sub="\n.. versionadded:: 1.2.0\n",
+        version_added_sub="\n.. versionadded:: 1.4.0\n",
     )
     def render(
         self,
         *,

From f1142938f309ef3db815b52c9fbfc590fe0c618f Mon Sep 17 00:00:00 2001
From: Maxim Ivanov
Date: Mon, 29 Nov 2021 23:17:02 +0700
Subject: [PATCH 22/26] DOC: avoid duplication of kwargs replacement

---
 pandas/core/frame.py      |  3 ++-
 pandas/core/series.py     |  3 ++-
 pandas/io/formats/info.py | 23 -----------------------
 3 files changed, 4 insertions(+), 25 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 8c85c4e961d99..8ebdb3b890d77 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -206,6 +206,7 @@
     format as fmt,
 )
 from pandas.io.formats.info import (
+    INFO_DOCSTRING,
     DataFrameInfo,
     frame_sub_kwargs,
 )
@@ -3138,7 +3139,7 @@ def to_xml(
         return xml_formatter.write_output()

     # ----------------------------------------------------------------------
-    @doc(DataFrameInfo.render, **frame_sub_kwargs)
+    @doc(INFO_DOCSTRING, **frame_sub_kwargs)
     def info(
         self,
         verbose: bool | None = None,
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 0d31fc90425cf..b3133ee1275a1 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -140,6 +140,7 @@
 import pandas.io.formats.format as fmt
 from pandas.io.formats.info import (
+    INFO_DOCSTRING,
     SeriesInfo,
     series_sub_kwargs,
 )
@@ -4918,7 +4919,7 @@ def replace(
             method=method,
         )

-    @doc(SeriesInfo.render, **series_sub_kwargs)
+    @doc(INFO_DOCSTRING, **series_sub_kwargs)
     def info(
         self,
         verbose: bool | None = None,
diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py
index f365180d61182..245203b31c50a 100644
--- a/pandas/io/formats/info.py
+++ b/pandas/io/formats/info.py
@@ -20,7 +20,6 @@
     Dtype,
     WriteBuffer,
 )
-from pandas.util._decorators import doc

 from pandas.core.indexes.api import Index
@@ -509,17 +508,6 @@ def memory_usage_bytes(self) -> int:
             deep = False
         return self.data.memory_usage(index=True, deep=deep).sum()

-    @doc(
-        INFO_DOCSTRING,
-        klass="DataFrame",
-        type_sub=" and columns",
-        max_cols_sub=frame_max_cols_sub,
-        show_counts_sub=show_counts_sub,
-        null_counts_sub=null_counts_sub,
-        examples_sub=frame_examples_sub,
-        see_also_sub=frame_see_also_sub,
-        version_added_sub="",
-    )
     def render(
         self,
         *,
@@ -550,17 +538,6 @@ def __init__(
         self.data: Series = data
         self.memory_usage = _initialize_memory_usage(memory_usage)

-    @doc(
-        INFO_DOCSTRING,
-        klass="Series",
-        type_sub="",
-        max_cols_sub="",
-        show_counts_sub=show_counts_sub,
-        null_counts_sub="",
-        examples_sub=series_examples_sub,
-        see_also_sub=series_see_also_sub,
-        version_added_sub="\n.. versionadded:: 1.4.0\n",
-    )
     def render(
         self,
         *,
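Patch 21 gives the frame kwargs a real ``null_counts_sub`` and the Series kwargs an empty string, which is how the deprecated keyword stays documented for ``DataFrame.info`` only. The mechanism is plain ``str.format``: an empty substitution simply vanishes, as long as its placeholder is not left sitting alone on a template line. A reduced sketch with illustrative text (not the real substitution values):

show_counts_sub = "\nshow_counts : bool, optional\n    Whether to show the non-null counts."
null_counts_sub = "\nnull_counts : bool, optional\n    .. deprecated:: 1.2.0\n        Use show_counts instead."

PARAMS = """\
memory_usage : bool, str, optional
    Specifies whether total memory usage should be shown.{show_counts_sub}{null_counts_sub}

Returns
-------
None
"""

# DataFrame-style rendering: both keywords are documented.
print(PARAMS.format(show_counts_sub=show_counts_sub, null_counts_sub=null_counts_sub))

# Series-style rendering: the deprecated block disappears without leaving an
# empty line, because the placeholders sit at the end of an existing line.
print(PARAMS.format(show_counts_sub=show_counts_sub, null_counts_sub=""))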
From 16ac96ea8026b7ba81208220890492585550c6de Mon Sep 17 00:00:00 2001
From: Maxim Ivanov
Date: Mon, 29 Nov 2021 23:34:45 +0700
Subject: [PATCH 23/26] DOC: unify newlines/spacing with substitutions

---
 pandas/io/formats/info.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py
index 245203b31c50a..75b3f3861c508 100644
--- a/pandas/io/formats/info.py
+++ b/pandas/io/formats/info.py
@@ -34,7 +34,7 @@

 frame_max_cols_sub = dedent(
-    """\
+    """
     max_cols : int, optional
         When to switch from the verbose to the truncated output. If the
         DataFrame has more than `max_cols` columns, the truncated output
@@ -44,7 +44,7 @@

 show_counts_sub = dedent(
-    """\
+    """
     show_counts : bool, optional
         Whether to show the non-null counts.
         By default, this is shown only if the DataFrame is smaller than
@@ -54,7 +54,7 @@
 )

 null_counts_sub = dedent(
-    """\
+    """
     null_counts : bool, optional
         .. deprecated:: 1.2.0
             Use show_counts instead."""
 )

 frame_examples_sub = dedent(
-    """\
+    """
     >>> int_values = [1, 2, 3, 4, 5]
     >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
     >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
@@ -148,7 +148,7 @@

 frame_see_also_sub = dedent(
-    """\
+    """
     DataFrame.describe: Generate descriptive statistics of DataFrame
         columns.
     DataFrame.memory_usage: Memory usage of DataFrame columns."""
@@ -168,7 +168,7 @@

 series_examples_sub = dedent(
-    """\
+    """
     >>> int_values = [1, 2, 3, 4, 5]
     >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
     >>> s = pd.Series(text_values, index=int_values)
@@ -230,7 +230,7 @@

 series_see_also_sub = dedent(
-    """\
+    """
     Series.describe: Generate descriptive statistics of Series.
     Series.memory_usage: Memory usage of Series."""
 )
@@ -280,8 +280,8 @@
     made based in column dtype and number of rows assuming values
     consume the same memory amount for corresponding dtypes. With deep
     memory introspection, a real memory usage calculation is performed
-    at the cost of computational resources.
-    {show_counts_sub}
+    at the cost of computational resources.\
+    {show_counts_sub}\
     {null_counts_sub}

     Returns
     -------
         This method prints a summary of a {klass} and returns None.

     See Also
-    --------
+    --------\
     {see_also_sub}

     Examples
-    --------
+    --------\
     {examples_sub}
     """
 )
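Patch 23 (reverted in the very next commit) swaps each `"""\` opening for a bare `"""`, so every substitution value begins with its own newline, and compensates by ending the receiving template lines with a backslash. The whole difference between the two dedent spellings is that single leading newline, as this isolated check shows; the variable names are illustrative only:

from textwrap import dedent

no_leading_newline = dedent(
    """\
    show_counts : bool, optional
        Whether to show the non-null counts."""
)

leading_newline = dedent(
    """
    show_counts : bool, optional
        Whether to show the non-null counts."""
)

# The backslash after the opening quotes eats the first newline; without it,
# the dedented value starts with "\n" and the template line that receives it
# must give up its own newline (hence the trailing backslashes in patch 23).
assert not no_leading_newline.startswith("\n")
assert leading_newline.startswith("\n")
assert no_leading_newline == leading_newline.lstrip("\n")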
From aac2954a2c3a58413cae44e6222c130ac466e238 Mon Sep 17 00:00:00 2001
From: Maxim Ivanov
Date: Tue, 30 Nov 2021 01:20:10 +0700
Subject: [PATCH 24/26] Revert "DOC: unify newlines/spacing with substitutions"

This reverts commit 16ac96ea8026b7ba81208220890492585550c6de.

---
 pandas/io/formats/info.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py
index 75b3f3861c508..245203b31c50a 100644
--- a/pandas/io/formats/info.py
+++ b/pandas/io/formats/info.py
@@ -34,7 +34,7 @@

 frame_max_cols_sub = dedent(
-    """
+    """\
     max_cols : int, optional
         When to switch from the verbose to the truncated output. If the
         DataFrame has more than `max_cols` columns, the truncated output
@@ -44,7 +44,7 @@

 show_counts_sub = dedent(
-    """
+    """\
     show_counts : bool, optional
         Whether to show the non-null counts.
         By default, this is shown only if the DataFrame is smaller than
@@ -54,7 +54,7 @@
 )

 null_counts_sub = dedent(
-    """
+    """\
     null_counts : bool, optional
         .. deprecated:: 1.2.0
             Use show_counts instead."""
 )

 frame_examples_sub = dedent(
-    """
+    """\
     >>> int_values = [1, 2, 3, 4, 5]
     >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
     >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
@@ -148,7 +148,7 @@

 frame_see_also_sub = dedent(
-    """
+    """\
     DataFrame.describe: Generate descriptive statistics of DataFrame
         columns.
     DataFrame.memory_usage: Memory usage of DataFrame columns."""
@@ -168,7 +168,7 @@

 series_examples_sub = dedent(
-    """
+    """\
     >>> int_values = [1, 2, 3, 4, 5]
     >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
     >>> s = pd.Series(text_values, index=int_values)
@@ -230,7 +230,7 @@

 series_see_also_sub = dedent(
-    """
+    """\
     Series.describe: Generate descriptive statistics of Series.
     Series.memory_usage: Memory usage of Series."""
 )
@@ -280,8 +280,8 @@
     made based in column dtype and number of rows assuming values
     consume the same memory amount for corresponding dtypes. With deep
     memory introspection, a real memory usage calculation is performed
-    at the cost of computational resources.\
-    {show_counts_sub}\
+    at the cost of computational resources.
+    {show_counts_sub}
     {null_counts_sub}

     Returns
     -------
         This method prints a summary of a {klass} and returns None.

     See Also
-    --------\
+    --------
     {see_also_sub}

     Examples
-    --------\
+    --------
     {examples_sub}
     """
 )

From 22303dc1be5eb56192cd6048075932d08ff18863 Mon Sep 17 00:00:00 2001
From: Maxim Ivanov
Date: Tue, 30 Nov 2021 01:24:35 +0700
Subject: [PATCH 25/26] DOC: fix newlines substitutions

---
 pandas/io/formats/info.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py
index 245203b31c50a..2e01b87349895 100644
--- a/pandas/io/formats/info.py
+++ b/pandas/io/formats/info.py
@@ -54,7 +54,7 @@
 )

 null_counts_sub = dedent(
-    """\
+    """
     null_counts : bool, optional
         .. deprecated:: 1.2.0
             Use show_counts instead."""
 )
@@ -281,7 +281,7 @@
     consume the same memory amount for corresponding dtypes. With deep
     memory introspection, a real memory usage calculation is performed
     at the cost of computational resources.
-    {show_counts_sub}
+    {show_counts_sub}\
     {null_counts_sub}

     Returns

From 9428a32e8bec78ff8a0fa5f87560bcddf37435b6 Mon Sep 17 00:00:00 2001
From: Maxim Ivanov
Date: Tue, 30 Nov 2021 08:50:12 +0700
Subject: [PATCH 26/26] DOC: another attempt to fix newline

---
 pandas/io/formats/info.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py
index 2e01b87349895..4a9310c6dccf8 100644
--- a/pandas/io/formats/info.py
+++ b/pandas/io/formats/info.py
@@ -281,8 +281,7 @@
     consume the same memory amount for corresponding dtypes. With deep
     memory introspection, a real memory usage calculation is performed
     at the cost of computational resources.
-    {show_counts_sub}\
-    {null_counts_sub}
+    {show_counts_sub}{null_counts_sub}

     Returns
     -------
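Patches 23 through 26 all chase the same symptom: when a substitution is empty, any newline the template reserves for it turns into a stray blank line in the rendered docstring. A small, self-contained check — not part of the series, using a toy template — makes that failure mode quick to reproduce while experimenting with the ``{show_counts_sub}{null_counts_sub}`` placement the final commit settles on:

def has_doubled_blank_lines(text: str) -> bool:
    # True if two consecutive rendered lines are blank (ignoring trailing spaces).
    lines = [line.rstrip() for line in text.splitlines()]
    return any(a == "" and b == "" for a, b in zip(lines, lines[1:]))


TEMPLATE = """\
at the cost of computational resources.{show_counts_sub}{null_counts_sub}

Returns
-------
None
"""

series_rendering = TEMPLATE.format(
    show_counts_sub="\nshow_counts : bool, optional",
    null_counts_sub="",
)
frame_rendering = TEMPLATE.format(
    show_counts_sub="\nshow_counts : bool, optional",
    null_counts_sub="\nnull_counts : bool, optional",
)

# Joined placeholders attached to an existing line leave no gap for either class.
assert not has_doubled_blank_lines(series_rendering)
assert not has_doubled_blank_lines(frame_rendering)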