diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 4ad80273f77ba..e4f36e128059b 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -14,7 +14,6 @@
 import datetime
 from io import StringIO
 import itertools
-import sys
 from textwrap import dedent
 from typing import (
     IO,
@@ -131,7 +130,7 @@
 from pandas.io.common import get_filepath_or_buffer
 from pandas.io.formats import console, format as fmt
-from pandas.io.formats.printing import pprint_thing
+from pandas.io.formats.info import info
 import pandas.plotting
 
 if TYPE_CHECKING:
@@ -2225,282 +2224,11 @@ def to_html(
         )
 
     # ----------------------------------------------------------------------
-
+    @Appender(info.__doc__)
     def info(
         self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None
     ) -> None:
-        """
-        Print a concise summary of a DataFrame.
-
-        This method prints information about a DataFrame including
-        the index dtype and column dtypes, non-null values and memory usage.
-
-        Parameters
-        ----------
-        verbose : bool, optional
-            Whether to print the full summary. By default, the setting in
-            ``pandas.options.display.max_info_columns`` is followed.
-        buf : writable buffer, defaults to sys.stdout
-            Where to send the output. By default, the output is printed to
-            sys.stdout. Pass a writable buffer if you need to further process
-            the output.
-        max_cols : int, optional
-            When to switch from the verbose to the truncated output. If the
-            DataFrame has more than `max_cols` columns, the truncated output
-            is used. By default, the setting in
-            ``pandas.options.display.max_info_columns`` is used.
-        memory_usage : bool, str, optional
-            Specifies whether total memory usage of the DataFrame
-            elements (including the index) should be displayed. By default,
-            this follows the ``pandas.options.display.memory_usage`` setting.
-
-            True always show memory usage. False never shows memory usage.
-            A value of 'deep' is equivalent to "True with deep introspection".
-            Memory usage is shown in human-readable units (base-2
-            representation). Without deep introspection a memory estimation is
-            made based in column dtype and number of rows assuming values
-            consume the same memory amount for corresponding dtypes. With deep
-            memory introspection, a real memory usage calculation is performed
-            at the cost of computational resources.
-        null_counts : bool, optional
-            Whether to show the non-null counts. By default, this is shown
-            only if the frame is smaller than
-            ``pandas.options.display.max_info_rows`` and
-            ``pandas.options.display.max_info_columns``. A value of True always
-            shows the counts, and False never shows the counts.
-
-        Returns
-        -------
-        None
-            This method prints a summary of a DataFrame and returns None.
-
-        See Also
-        --------
-        DataFrame.describe: Generate descriptive statistics of DataFrame
-            columns.
-        DataFrame.memory_usage: Memory usage of DataFrame columns.
-
-        Examples
-        --------
-        >>> int_values = [1, 2, 3, 4, 5]
-        >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
-        >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
-        >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
-        ...                   "float_col": float_values})
-        >>> df
-           int_col text_col  float_col
-        0        1    alpha       0.00
-        1        2     beta       0.25
-        2        3    gamma       0.50
-        3        4    delta       0.75
-        4        5  epsilon       1.00
-
-        Prints information of all columns:
-
-        >>> df.info(verbose=True)
-        <class 'pandas.core.frame.DataFrame'>
-        RangeIndex: 5 entries, 0 to 4
-        Data columns (total 3 columns):
-         #   Column     Non-Null Count  Dtype  
-        ---  ------     --------------  -----  
-         0   int_col    5 non-null      int64  
-         1   text_col   5 non-null      object 
-         2   float_col  5 non-null      float64
-        dtypes: float64(1), int64(1), object(1)
-        memory usage: 248.0+ bytes
-
-        Prints a summary of columns count and its dtypes but not per column
-        information:
-
-        >>> df.info(verbose=False)
-        <class 'pandas.core.frame.DataFrame'>
-        RangeIndex: 5 entries, 0 to 4
-        Columns: 3 entries, int_col to float_col
-        dtypes: float64(1), int64(1), object(1)
-        memory usage: 248.0+ bytes
-
-        Pipe output of DataFrame.info to buffer instead of sys.stdout, get
-        buffer content and writes to a text file:
-
-        >>> import io
-        >>> buffer = io.StringIO()
-        >>> df.info(buf=buffer)
-        >>> s = buffer.getvalue()
-        >>> with open("df_info.txt", "w",
-        ...           encoding="utf-8") as f:  # doctest: +SKIP
-        ...     f.write(s)
-        260
-
-        The `memory_usage` parameter allows deep introspection mode, specially
-        useful for big DataFrames and fine-tune memory optimization:
-
-        >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
-        >>> df = pd.DataFrame({
-        ...     'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
-        ...     'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
-        ...     'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
-        ... })
-        >>> df.info()
-        <class 'pandas.core.frame.DataFrame'>
-        RangeIndex: 1000000 entries, 0 to 999999
-        Data columns (total 3 columns):
-         #   Column    Non-Null Count    Dtype 
-        ---  ------    --------------    ----- 
-         0   column_1  1000000 non-null  object
-         1   column_2  1000000 non-null  object
-         2   column_3  1000000 non-null  object
-        dtypes: object(3)
-        memory usage: 22.9+ MB
-
-        >>> df.info(memory_usage='deep')
-        <class 'pandas.core.frame.DataFrame'>
-        RangeIndex: 1000000 entries, 0 to 999999
-        Data columns (total 3 columns):
-         #   Column    Non-Null Count    Dtype 
-        ---  ------    --------------    ----- 
-         0   column_1  1000000 non-null  object
-         1   column_2  1000000 non-null  object
-         2   column_3  1000000 non-null  object
-        dtypes: object(3)
-        memory usage: 188.8 MB
-        """
-        if buf is None:  # pragma: no cover
-            buf = sys.stdout
-
-        lines = []
-
-        lines.append(str(type(self)))
-        lines.append(self.index._summary())
-
-        if len(self.columns) == 0:
-            lines.append(f"Empty {type(self).__name__}")
-            fmt.buffer_put_lines(buf, lines)
-            return
-
-        cols = self.columns
-        col_count = len(self.columns)
-
-        # hack
-        if max_cols is None:
-            max_cols = get_option("display.max_info_columns", len(self.columns) + 1)
-
-        max_rows = get_option("display.max_info_rows", len(self) + 1)
-
-        if null_counts is None:
-            show_counts = (col_count <= max_cols) and (len(self) < max_rows)
-        else:
-            show_counts = null_counts
-        exceeds_info_cols = col_count > max_cols
-
-        def _verbose_repr():
-            lines.append(f"Data columns (total {len(self.columns)} columns):")
-
-            id_head = " # "
-            column_head = "Column"
-            col_space = 2
-
-            max_col = max(len(pprint_thing(k)) for k in cols)
-            len_column = len(pprint_thing(column_head))
-            space = max(max_col, len_column) + col_space
-
-            max_id = len(pprint_thing(col_count))
-            len_id = len(pprint_thing(id_head))
-            space_num = max(max_id, len_id) + col_space
-            counts = None
-
-            header = _put_str(id_head, space_num) + _put_str(column_head, space)
-            if show_counts:
-                counts = self.count()
-                if len(cols) != len(counts):  # pragma: no cover
-                    raise AssertionError(
-                        f"Columns must equal counts ({len(cols)} != {len(counts)})"
{len(counts)})" - ) - count_header = "Non-Null Count" - len_count = len(count_header) - non_null = " non-null" - max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) - space_count = max(len_count, max_count) + col_space - count_temp = "{count}" + non_null - else: - count_header = "" - space_count = len(count_header) - len_count = space_count - count_temp = "{count}" - - dtype_header = "Dtype" - len_dtype = len(dtype_header) - max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes) - space_dtype = max(len_dtype, max_dtypes) - header += _put_str(count_header, space_count) + _put_str( - dtype_header, space_dtype - ) - - lines.append(header) - lines.append( - _put_str("-" * len_id, space_num) - + _put_str("-" * len_column, space) - + _put_str("-" * len_count, space_count) - + _put_str("-" * len_dtype, space_dtype) - ) - - for i, col in enumerate(self.columns): - dtype = self.dtypes.iloc[i] - col = pprint_thing(col) - - line_no = _put_str(f" {i}", space_num) - count = "" - if show_counts: - count = counts.iloc[i] - - lines.append( - line_no - + _put_str(col, space) - + _put_str(count_temp.format(count=count), space_count) - + _put_str(dtype, space_dtype) - ) - - def _non_verbose_repr(): - lines.append(self.columns._summary(name="Columns")) - - def _sizeof_fmt(num, size_qualifier): - # returns size in human readable format - for x in ["bytes", "KB", "MB", "GB", "TB"]: - if num < 1024.0: - return f"{num:3.1f}{size_qualifier} {x}" - num /= 1024.0 - return f"{num:3.1f}{size_qualifier} PB" - - if verbose: - _verbose_repr() - elif verbose is False: # specifically set to False, not nesc None - _non_verbose_repr() - else: - if exceeds_info_cols: - _non_verbose_repr() - else: - _verbose_repr() - - counts = self._data.get_dtype_counts() - dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] - lines.append(f"dtypes: {', '.join(dtypes)}") - - if memory_usage is None: - memory_usage = get_option("display.memory_usage") - if memory_usage: - # append memory usage of df to display - size_qualifier = "" - if memory_usage == "deep": - deep = True - else: - # size_qualifier is just a best effort; not guaranteed to catch - # all cases (e.g., it misses categorical data even with object - # categories) - deep = False - if "object" in counts or self.index._is_memory_usage_qualified(): - size_qualifier = "+" - mem_usage = self.memory_usage(index=True, deep=deep).sum() - lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") - fmt.buffer_put_lines(buf, lines) + return info(self, verbose, buf, max_cols, memory_usage, null_counts) def memory_usage(self, index=True, deep=False) -> Series: """ @@ -8623,7 +8351,3 @@ def _from_nested_dict(data): new_data[col] = new_data.get(col, {}) new_data[col][index] = v return new_data - - -def _put_str(s, space): - return str(s)[:space].ljust(space) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py new file mode 100644 index 0000000000000..0c08065f55273 --- /dev/null +++ b/pandas/io/formats/info.py @@ -0,0 +1,288 @@ +import sys + +from pandas._config import get_option + +from pandas.io.formats import format as fmt +from pandas.io.formats.printing import pprint_thing + + +def _put_str(s, space): + return str(s)[:space].ljust(space) + + +def info( + data, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None +) -> None: + """ + Print a concise summary of a DataFrame. + + This method prints information about a DataFrame including + the index dtype and column dtypes, non-null values and memory usage. 
+
+    Parameters
+    ----------
+    data : DataFrame
+        DataFrame to print information about.
+    verbose : bool, optional
+        Whether to print the full summary. By default, the setting in
+        ``pandas.options.display.max_info_columns`` is followed.
+    buf : writable buffer, defaults to sys.stdout
+        Where to send the output. By default, the output is printed to
+        sys.stdout. Pass a writable buffer if you need to further process
+        the output.
+    max_cols : int, optional
+        When to switch from the verbose to the truncated output. If the
+        DataFrame has more than `max_cols` columns, the truncated output
+        is used. By default, the setting in
+        ``pandas.options.display.max_info_columns`` is used.
+    memory_usage : bool, str, optional
+        Specifies whether total memory usage of the DataFrame
+        elements (including the index) should be displayed. By default,
+        this follows the ``pandas.options.display.memory_usage`` setting.
+
+        True always shows memory usage. False never shows memory usage.
+        A value of 'deep' is equivalent to "True with deep introspection".
+        Memory usage is shown in human-readable units (base-2
+        representation). Without deep introspection a memory estimation is
+        made based on column dtype and number of rows, assuming values
+        consume the same memory amount for corresponding dtypes. With deep
+        memory introspection, a real memory usage calculation is performed
+        at the cost of computational resources.
+    null_counts : bool, optional
+        Whether to show the non-null counts. By default, this is shown
+        only if the frame is smaller than
+        ``pandas.options.display.max_info_rows`` and
+        ``pandas.options.display.max_info_columns``. A value of True always
+        shows the counts, and False never shows the counts.
+
+    Returns
+    -------
+    None
+        This method prints a summary of a DataFrame and returns None.
+
+    See Also
+    --------
+    DataFrame.describe: Generate descriptive statistics of DataFrame
+        columns.
+    DataFrame.memory_usage: Memory usage of DataFrame columns.
+
+    Examples
+    --------
+    >>> int_values = [1, 2, 3, 4, 5]
+    >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
+    >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
+    >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
+    ...                   "float_col": float_values})
+    >>> df
+       int_col text_col  float_col
+    0        1    alpha       0.00
+    1        2     beta       0.25
+    2        3    gamma       0.50
+    3        4    delta       0.75
+    4        5  epsilon       1.00
+
+    Prints information about all columns:
+
+    >>> df.info(verbose=True)
+    <class 'pandas.core.frame.DataFrame'>
+    RangeIndex: 5 entries, 0 to 4
+    Data columns (total 3 columns):
+     #   Column     Non-Null Count  Dtype  
+    ---  ------     --------------  -----  
+     0   int_col    5 non-null      int64  
+     1   text_col   5 non-null      object 
+     2   float_col  5 non-null      float64
+    dtypes: float64(1), int64(1), object(1)
+    memory usage: 248.0+ bytes
+
+    Prints a summary of the column count and dtypes but no per-column
+    information:
+
+    >>> df.info(verbose=False)
+    <class 'pandas.core.frame.DataFrame'>
+    RangeIndex: 5 entries, 0 to 4
+    Columns: 3 entries, int_col to float_col
+    dtypes: float64(1), int64(1), object(1)
+    memory usage: 248.0+ bytes
+
+    Pipe the output of DataFrame.info to a buffer instead of sys.stdout, get
+    the buffer content and write it to a text file:
+
+    >>> import io
+    >>> buffer = io.StringIO()
+    >>> df.info(buf=buffer)
+    >>> s = buffer.getvalue()
+    >>> with open("df_info.txt", "w",
+    ...           encoding="utf-8") as f:  # doctest: +SKIP
+    ...     f.write(s)
+    260
+
+    The `memory_usage` parameter allows deep introspection mode, especially
+    useful for big DataFrames and fine-tuning memory optimization:
+
+    >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
+    >>> df = pd.DataFrame({
+    ...     'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
+    ...     'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
+    ...     'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
+    ... })
+    >>> df.info()
+    <class 'pandas.core.frame.DataFrame'>
+    RangeIndex: 1000000 entries, 0 to 999999
+    Data columns (total 3 columns):
+     #   Column    Non-Null Count    Dtype 
+    ---  ------    --------------    ----- 
+     0   column_1  1000000 non-null  object
+     1   column_2  1000000 non-null  object
+     2   column_3  1000000 non-null  object
+    dtypes: object(3)
+    memory usage: 22.9+ MB
+
+    >>> df.info(memory_usage='deep')
+    <class 'pandas.core.frame.DataFrame'>
+    RangeIndex: 1000000 entries, 0 to 999999
+    Data columns (total 3 columns):
+     #   Column    Non-Null Count    Dtype 
+    ---  ------    --------------    ----- 
+     0   column_1  1000000 non-null  object
+     1   column_2  1000000 non-null  object
+     2   column_3  1000000 non-null  object
+    dtypes: object(3)
+    memory usage: 188.8 MB
+    """
+    if buf is None:  # pragma: no cover
+        buf = sys.stdout
+
+    lines = []
+
+    lines.append(str(type(data)))
+    lines.append(data.index._summary())
+
+    if len(data.columns) == 0:
+        lines.append(f"Empty {type(data).__name__}")
+        fmt.buffer_put_lines(buf, lines)
+        return
+
+    cols = data.columns
+    col_count = len(data.columns)
+
+    # hack
+    if max_cols is None:
+        max_cols = get_option("display.max_info_columns", len(data.columns) + 1)
+
+    max_rows = get_option("display.max_info_rows", len(data) + 1)
+
+    if null_counts is None:
+        show_counts = (col_count <= max_cols) and (len(data) < max_rows)
+    else:
+        show_counts = null_counts
+    exceeds_info_cols = col_count > max_cols
+
+    def _verbose_repr():
+        lines.append(f"Data columns (total {len(data.columns)} columns):")
+
+        id_head = " # "
+        column_head = "Column"
+        col_space = 2
+
+        max_col = max(len(pprint_thing(k)) for k in cols)
+        len_column = len(pprint_thing(column_head))
+        space = max(max_col, len_column) + col_space
+
+        max_id = len(pprint_thing(col_count))
+        len_id = len(pprint_thing(id_head))
+        space_num = max(max_id, len_id) + col_space
+
+        header = _put_str(id_head, space_num) + _put_str(column_head, space)
+        if show_counts:
+            counts = data.count()
+            if len(cols) != len(counts):  # pragma: no cover
+                raise AssertionError(
+                    f"Columns must equal counts ({len(cols)} != {len(counts)})"
+                )
+            count_header = "Non-Null Count"
+            len_count = len(count_header)
+            non_null = " non-null"
+            max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null)
+            space_count = max(len_count, max_count) + col_space
+            count_temp = "{count}" + non_null
+        else:
+            count_header = ""
+            space_count = len(count_header)
+            len_count = space_count
+            count_temp = "{count}"
+
+        dtype_header = "Dtype"
+        len_dtype = len(dtype_header)
+        max_dtypes = max(len(pprint_thing(k)) for k in data.dtypes)
+        space_dtype = max(len_dtype, max_dtypes)
+        header += _put_str(count_header, space_count) + _put_str(
+            dtype_header, space_dtype
+        )
+
+        lines.append(header)
+        lines.append(
+            _put_str("-" * len_id, space_num)
+            + _put_str("-" * len_column, space)
+            + _put_str("-" * len_count, space_count)
+            + _put_str("-" * len_dtype, space_dtype)
+        )
+
+        for i, col in enumerate(data.columns):
+            dtype = data.dtypes.iloc[i]
+            col = pprint_thing(col)
+
+            line_no = _put_str(f" {i}", space_num)
+            count = ""
+            if show_counts:
+                count = counts.iloc[i]
+
+            lines.append(
+                line_no
+                + _put_str(col, space)
+                + _put_str(count_temp.format(count=count), space_count)
+                + _put_str(dtype, space_dtype)
+            )
+
+    def _non_verbose_repr():
+        lines.append(data.columns._summary(name="Columns"))
+
+    def _sizeof_fmt(num, size_qualifier):
+        # returns size in human-readable format
+        for x in ["bytes", "KB", "MB", "GB", "TB"]:
+            if num < 1024.0:
+                return f"{num:3.1f}{size_qualifier} {x}"
+            num /= 1024.0
+        return f"{num:3.1f}{size_qualifier} PB"
+
+    if verbose:
+        _verbose_repr()
+    elif verbose is False:  # specifically set to False, not necessarily None
+        _non_verbose_repr()
+    else:
+        if exceeds_info_cols:
+            _non_verbose_repr()
+        else:
+            _verbose_repr()
+
+    counts = data._data.get_dtype_counts()
+    dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())]
+    lines.append(f"dtypes: {', '.join(dtypes)}")
+
+    if memory_usage is None:
+        memory_usage = get_option("display.memory_usage")
+    if memory_usage:
+        # append memory usage of df to display
+        size_qualifier = ""
+        if memory_usage == "deep":
+            deep = True
+        else:
+            # size_qualifier is just a best effort; not guaranteed to catch
+            # all cases (e.g., it misses categorical data even with object
+            # categories)
+            deep = False
+            if "object" in counts or data.index._is_memory_usage_qualified():
+                size_qualifier = "+"
+        mem_usage = data.memory_usage(index=True, deep=deep).sum()
+        lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n")
+    fmt.buffer_put_lines(buf, lines)
diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py
index 4ac009ef508c4..c5d4d59adbc35 100644
--- a/pandas/tests/frame/test_repr_info.py
+++ b/pandas/tests/frame/test_repr_info.py
@@ -1,16 +1,10 @@
 from datetime import datetime, timedelta
 from io import StringIO
-import re
-import sys
-import textwrap
 import warnings
 
 import numpy as np
 import pytest
 
-from pandas.compat import PYPY
-
-import pandas as pd
 from pandas import (
     Categorical,
     DataFrame,
@@ -192,357 +186,6 @@ def test_latex_repr(self):
         # GH 12182
         assert df._repr_latex_() is None
 
-    def test_info(self, float_frame, datetime_frame):
-        io = StringIO()
-        float_frame.info(buf=io)
-        datetime_frame.info(buf=io)
-
-        frame = DataFrame(np.random.randn(5, 3))
-
-        frame.info()
-        frame.info(verbose=False)
-
-    def test_info_verbose(self):
-        buf = StringIO()
-        size = 1001
-        start = 5
-        frame = DataFrame(np.random.randn(3, size))
-        frame.info(verbose=True, buf=buf)
-
-        res = buf.getvalue()
-        header = " #    Column  Dtype  \n---   ------  -----  "
-        assert header in res
-
-        frame.info(verbose=True, buf=buf)
-        buf.seek(0)
-        lines = buf.readlines()
-        assert len(lines) > 0
-
-        for i, line in enumerate(lines):
-            if i >= start and i < start + size:
-                line_nr = f" {i - start} "
-                assert line.startswith(line_nr)
-
-    def test_info_memory(self):
-        # https://github.com/pandas-dev/pandas/issues/21056
-        df = pd.DataFrame({"a": pd.Series([1, 2], dtype="i8")})
-        buf = StringIO()
-        df.info(buf=buf)
-        result = buf.getvalue()
-        bytes = float(df.memory_usage().sum())
-
-        expected = textwrap.dedent(
-            f"""\
-            <class 'pandas.core.frame.DataFrame'>
-            RangeIndex: 2 entries, 0 to 1
-            Data columns (total 1 columns):
-             #   Column  Non-Null Count  Dtype
-            ---  ------  --------------  -----
-             0   a       2 non-null      int64
-            dtypes: int64(1)
-            memory usage: {bytes} bytes
-            """
-        )
-
-        assert result == expected
-
-    def test_info_wide(self):
-        from pandas import set_option, reset_option
-
-        io = StringIO()
-        df = DataFrame(np.random.randn(5, 101))
-        df.info(buf=io)
-
-        io = StringIO()
-        df.info(buf=io, max_cols=101)
-        rs = io.getvalue()
-        assert len(rs.splitlines()) > 100
-        xp = rs
-
-        set_option("display.max_info_columns", 101)
-        io = StringIO()
-        df.info(buf=io)
-        assert rs == xp
-        reset_option("display.max_info_columns")
-
-    def test_info_duplicate_columns(self):
-        io = StringIO()
-
-        # it works!
-        frame = DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"])
-        frame.info(buf=io)
-
-    def test_info_duplicate_columns_shows_correct_dtypes(self):
-        # GH11761
-        io = StringIO()
-
-        frame = DataFrame([[1, 2.0]], columns=["a", "a"])
-        frame.info(buf=io)
-        io.seek(0)
-        lines = io.readlines()
-        assert " 0   a       1 non-null      int64  \n" == lines[5]
-        assert " 1   a       1 non-null      float64\n" == lines[6]
-
-    def test_info_shows_column_dtypes(self):
-        dtypes = [
-            "int64",
-            "float64",
-            "datetime64[ns]",
-            "timedelta64[ns]",
-            "complex128",
-            "object",
-            "bool",
-        ]
-        data = {}
-        n = 10
-        for i, dtype in enumerate(dtypes):
-            data[i] = np.random.randint(2, size=n).astype(dtype)
-        df = DataFrame(data)
-        buf = StringIO()
-        df.info(buf=buf)
-        res = buf.getvalue()
-        header = (
-            " #   Column  Non-Null Count  Dtype          \n"
-            "---  ------  --------------  -----          "
-        )
-        assert header in res
-        for i, dtype in enumerate(dtypes):
-            name = f" {i:d}   {i:d}       {n:d} non-null     {dtype}"
-            assert name in res
-
-    def test_info_max_cols(self):
-        df = DataFrame(np.random.randn(10, 5))
-        for len_, verbose in [(5, None), (5, False), (12, True)]:
-            # For verbose always      ^ setting  ^ summarize ^ full output
-            with option_context("max_info_columns", 4):
-                buf = StringIO()
-                df.info(buf=buf, verbose=verbose)
-                res = buf.getvalue()
-                assert len(res.strip().split("\n")) == len_
-
-        for len_, verbose in [(12, None), (5, False), (12, True)]:
-
-            # max_cols not exceeded
-            with option_context("max_info_columns", 5):
-                buf = StringIO()
-                df.info(buf=buf, verbose=verbose)
-                res = buf.getvalue()
-                assert len(res.strip().split("\n")) == len_
-
-        for len_, max_cols in [(12, 5), (5, 4)]:
-            # setting truncates
-            with option_context("max_info_columns", 4):
-                buf = StringIO()
-                df.info(buf=buf, max_cols=max_cols)
-                res = buf.getvalue()
-                assert len(res.strip().split("\n")) == len_
-
-            # setting wouldn't truncate
-            with option_context("max_info_columns", 5):
-                buf = StringIO()
-                df.info(buf=buf, max_cols=max_cols)
-                res = buf.getvalue()
-                assert len(res.strip().split("\n")) == len_
-
-    def test_info_memory_usage(self):
-        # Ensure memory usage is displayed, when asserted, on the last line
-        dtypes = [
-            "int64",
-            "float64",
-            "datetime64[ns]",
-            "timedelta64[ns]",
-            "complex128",
-            "object",
-            "bool",
-        ]
-        data = {}
-        n = 10
-        for i, dtype in enumerate(dtypes):
-            data[i] = np.random.randint(2, size=n).astype(dtype)
-        df = DataFrame(data)
-        buf = StringIO()
-
-        # display memory usage case
-        df.info(buf=buf, memory_usage=True)
-        res = buf.getvalue().splitlines()
-        assert "memory usage: " in res[-1]
-
-        # do not display memory usage case
-        df.info(buf=buf, memory_usage=False)
-        res = buf.getvalue().splitlines()
-        assert "memory usage: " not in res[-1]
-
-        df.info(buf=buf, memory_usage=True)
-        res = buf.getvalue().splitlines()
-
-        # memory usage is a lower bound, so print it as XYZ+ MB
-        assert re.match(r"memory usage: [^+]+\+", res[-1])
-
-        df.iloc[:, :5].info(buf=buf, memory_usage=True)
-        res = buf.getvalue().splitlines()
-
-        # excluded column with object dtype, so estimate is accurate
-        assert not re.match(r"memory usage: [^+]+\+", res[-1])
-
-        # Test a DataFrame with duplicate columns
-        dtypes = ["int64", "int64", "int64", "float64"]
-        data = {}
-        n = 100
-        for i, dtype in enumerate(dtypes):
-            data[i] = np.random.randint(2, size=n).astype(dtype)
-        df = DataFrame(data)
-        df.columns = dtypes
-
-        df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"])
-        df_with_object_index.info(buf=buf, memory_usage=True)
-        res = buf.getvalue().splitlines()
-        assert re.match(r"memory usage: [^+]+\+", res[-1])
-
-        df_with_object_index.info(buf=buf, memory_usage="deep")
-        res = buf.getvalue().splitlines()
-        assert re.match(r"memory usage: [^+]+$", res[-1])
-
-        # Ensure df size is as expected
-        # (cols * rows * bytes) + index size
-        df_size = df.memory_usage().sum()
-        exp_size = len(dtypes) * n * 8 + df.index.nbytes
-        assert df_size == exp_size
-
-        # Ensure number of cols in memory_usage is the same as df
-        size_df = np.size(df.columns.values) + 1  # index=True; default
-        assert size_df == np.size(df.memory_usage())
-
-        # assert deep works only on object
-        assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()
-
-        # test for validity
-        DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True)
-        DataFrame(1, index=["a"], columns=["A"]).index.nbytes
-        df = DataFrame(
-            data=1,
-            index=pd.MultiIndex.from_product([["a"], range(1000)]),
-            columns=["A"],
-        )
-        df.index.nbytes
-        df.memory_usage(index=True)
-        df.index.values.nbytes
-
-        mem = df.memory_usage(deep=True).sum()
-        assert mem > 0
-
-    @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
-    def test_info_memory_usage_deep_not_pypy(self):
-        df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"])
-        assert (
-            df_with_object_index.memory_usage(index=True, deep=True).sum()
-            > df_with_object_index.memory_usage(index=True).sum()
-        )
-
-        df_object = pd.DataFrame({"a": ["a"]})
-        assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum()
-
-    @pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result")
-    def test_info_memory_usage_deep_pypy(self):
-        df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"])
-        assert (
-            df_with_object_index.memory_usage(index=True, deep=True).sum()
-            == df_with_object_index.memory_usage(index=True).sum()
-        )
-
-        df_object = pd.DataFrame({"a": ["a"]})
-        assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum()
-
-    @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
-    def test_usage_via_getsizeof(self):
-        df = DataFrame(
-            data=1,
-            index=pd.MultiIndex.from_product([["a"], range(1000)]),
-            columns=["A"],
-        )
-        mem = df.memory_usage(deep=True).sum()
-        # sys.getsizeof will call the .memory_usage with
-        # deep=True, and add on some GC overhead
-        diff = mem - sys.getsizeof(df)
-        assert abs(diff) < 100
-
-    def test_info_memory_usage_qualified(self):
-
-        buf = StringIO()
-        df = DataFrame(1, columns=list("ab"), index=[1, 2, 3])
-        df.info(buf=buf)
-        assert "+" not in buf.getvalue()
-
-        buf = StringIO()
-        df = DataFrame(1, columns=list("ab"), index=list("ABC"))
-        df.info(buf=buf)
-        assert "+" in buf.getvalue()
-
-        buf = StringIO()
-        df = DataFrame(
-            1,
-            columns=list("ab"),
-            index=pd.MultiIndex.from_product([range(3), range(3)]),
-        )
-        df.info(buf=buf)
-        assert "+" not in buf.getvalue()
-
-        buf = StringIO()
-        df = DataFrame(
-            1,
-            columns=list("ab"),
-            index=pd.MultiIndex.from_product([range(3), ["foo", "bar"]]),
-        )
-        df.info(buf=buf)
-        assert "+" in buf.getvalue()
-
-    def test_info_memory_usage_bug_on_multiindex(self):
-        # GH 14308
-        # memory usage introspection should not materialize .values
-
-        from string import ascii_uppercase as uppercase
-
-        def memory_usage(f):
-            return f.memory_usage(deep=True).sum()
-
-        N = 100
-        M = len(uppercase)
-        index = pd.MultiIndex.from_product(
-            [list(uppercase), pd.date_range("20160101", periods=N)],
-            names=["id", "date"],
-        )
-        df = DataFrame({"value": np.random.randn(N * M)}, index=index)
-
-        unstacked = df.unstack("id")
-        assert df.values.nbytes == unstacked.values.nbytes
-        assert memory_usage(df) > memory_usage(unstacked)
-
-        # high upper bound
-        assert memory_usage(unstacked) - memory_usage(df) < 2000
-
-    def test_info_categorical(self):
-        # GH14298
-        idx = pd.CategoricalIndex(["a", "b"])
-        df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx)
-
-        buf = StringIO()
-        df.info(buf=buf)
-
-    def test_info_categorical_column(self):
-
-        # make sure it works
-        n = 2500
-        df = DataFrame({"int64": np.random.randint(100, size=n)})
-        df["category"] = Series(
-            np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n))
-        ).astype("category")
-        df.isna()
-        buf = StringIO()
-        df.info(buf=buf)
-
-        df2 = df[df["category"] == "d"]
-        buf = StringIO()
-        df2.info(buf=buf)
-
     def test_repr_categorical_dates_periods(self):
         # normal DataFrame
         dt = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern")
diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py
new file mode 100644
index 0000000000000..877bd1650ae60
--- /dev/null
+++ b/pandas/tests/io/formats/test_info.py
@@ -0,0 +1,405 @@
+from io import StringIO
+import re
+from string import ascii_uppercase as uppercase
+import sys
+import textwrap
+
+import numpy as np
+import pytest
+
+from pandas.compat import PYPY
+
+from pandas import (
+    CategoricalIndex,
+    DataFrame,
+    MultiIndex,
+    Series,
+    date_range,
+    option_context,
+    reset_option,
+    set_option,
+)
+import pandas._testing as tm
+
+
+@pytest.fixture
+def datetime_frame():
+    """
+    Fixture for DataFrame of floats with DatetimeIndex
+
+    Columns are ['A', 'B', 'C', 'D']
+
+                       A         B         C         D
+    2000-01-03 -1.122153  0.468535  0.122226  1.693711
+    2000-01-04  0.189378  0.486100  0.007864 -1.216052
+    2000-01-05  0.041401 -0.835752 -0.035279 -0.414357
+    2000-01-06  0.430050  0.894352  0.090719  0.036939
+    2000-01-07 -0.620982 -0.668211 -0.706153  1.466335
+    2000-01-10 -0.752633  0.328434 -0.815325  0.699674
+    2000-01-11 -2.236969  0.615737 -0.829076 -1.196106
+    ...              ...       ...       ...       ...
+    2000-02-03  1.642618 -0.579288  0.046005  1.385249
+    2000-02-04 -0.544873 -1.160962 -0.284071 -1.418351
+    2000-02-07 -2.656149 -0.601387  1.410148  0.444150
+    2000-02-08 -1.201881 -1.289040  0.772992 -1.445300
+    2000-02-09  1.377373  0.398619  1.008453 -0.928207
+    2000-02-10  0.473194 -0.636677  0.984058  0.511519
+    2000-02-11 -0.965556  0.408313 -1.312844 -0.381948
+
+    [30 rows x 4 columns]
+    """
+    return DataFrame(tm.getTimeSeriesData())
+
+
+def test_info_categorical_column():
+
+    # make sure it works
+    n = 2500
+    df = DataFrame({"int64": np.random.randint(100, size=n)})
+    df["category"] = Series(
+        np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n))
+    ).astype("category")
+    df.isna()
+    buf = StringIO()
+    df.info(buf=buf)
+
+    df2 = df[df["category"] == "d"]
+    buf = StringIO()
+    df2.info(buf=buf)
+
+
+def test_info(float_frame, datetime_frame):
+    io = StringIO()
+    float_frame.info(buf=io)
+    datetime_frame.info(buf=io)
+
+    frame = DataFrame(np.random.randn(5, 3))
+
+    frame.info()
+    frame.info(verbose=False)
+
+
+def test_info_verbose():
+    buf = StringIO()
+    size = 1001
+    start = 5
+    frame = DataFrame(np.random.randn(3, size))
+    frame.info(verbose=True, buf=buf)
+
+    res = buf.getvalue()
+    header = " #    Column  Dtype  \n---   ------  -----  "
+    assert header in res
+
+    frame.info(verbose=True, buf=buf)
+    buf.seek(0)
+    lines = buf.readlines()
+    assert len(lines) > 0
+
+    for i, line in enumerate(lines):
+        if i >= start and i < start + size:
+            line_nr = f" {i - start} "
+            assert line.startswith(line_nr)
+
+
+def test_info_memory():
+    # https://github.com/pandas-dev/pandas/issues/21056
+    df = DataFrame({"a": Series([1, 2], dtype="i8")})
+    buf = StringIO()
+    df.info(buf=buf)
+    result = buf.getvalue()
+    bytes = float(df.memory_usage().sum())
+    expected = textwrap.dedent(
+        f"""\
+    <class 'pandas.core.frame.DataFrame'>
+    RangeIndex: 2 entries, 0 to 1
+    Data columns (total 1 columns):
+     #   Column  Non-Null Count  Dtype
+    ---  ------  --------------  -----
+     0   a       2 non-null      int64
+    dtypes: int64(1)
+    memory usage: {bytes} bytes
+    """
+    )
+    assert result == expected
+
+
+def test_info_wide():
+    io = StringIO()
+    df = DataFrame(np.random.randn(5, 101))
+    df.info(buf=io)
+
+    io = StringIO()
+    df.info(buf=io, max_cols=101)
+    rs = io.getvalue()
+    assert len(rs.splitlines()) > 100
+    xp = rs
+
+    set_option("display.max_info_columns", 101)
+    io = StringIO()
+    df.info(buf=io)
+    assert rs == xp
+    reset_option("display.max_info_columns")
+
+
+def test_info_duplicate_columns():
+    io = StringIO()
+
+    # it works!
+    frame = DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"])
+    frame.info(buf=io)
+
+
+def test_info_duplicate_columns_shows_correct_dtypes():
+    # GH11761
+    io = StringIO()
+
+    frame = DataFrame([[1, 2.0]], columns=["a", "a"])
+    frame.info(buf=io)
+    io.seek(0)
+    lines = io.readlines()
+    assert " 0   a       1 non-null      int64  \n" == lines[5]
+    assert " 1   a       1 non-null      float64\n" == lines[6]
+
+
+def test_info_shows_column_dtypes():
+    dtypes = [
+        "int64",
+        "float64",
+        "datetime64[ns]",
+        "timedelta64[ns]",
+        "complex128",
+        "object",
+        "bool",
+    ]
+    data = {}
+    n = 10
+    for i, dtype in enumerate(dtypes):
+        data[i] = np.random.randint(2, size=n).astype(dtype)
+    df = DataFrame(data)
+    buf = StringIO()
+    df.info(buf=buf)
+    res = buf.getvalue()
+    header = (
+        " #   Column  Non-Null Count  Dtype          \n"
+        "---  ------  --------------  -----          "
+    )
+    assert header in res
+    for i, dtype in enumerate(dtypes):
+        name = f" {i:d}   {i:d}       {n:d} non-null     {dtype}"
+        assert name in res
+
+
+def test_info_max_cols():
+    df = DataFrame(np.random.randn(10, 5))
+    for len_, verbose in [(5, None), (5, False), (12, True)]:
+        # For verbose always      ^ setting  ^ summarize ^ full output
+        with option_context("max_info_columns", 4):
+            buf = StringIO()
+            df.info(buf=buf, verbose=verbose)
+            res = buf.getvalue()
+            assert len(res.strip().split("\n")) == len_
+
+    for len_, verbose in [(12, None), (5, False), (12, True)]:
+
+        # max_cols not exceeded
+        with option_context("max_info_columns", 5):
+            buf = StringIO()
+            df.info(buf=buf, verbose=verbose)
+            res = buf.getvalue()
+            assert len(res.strip().split("\n")) == len_
+
+    for len_, max_cols in [(12, 5), (5, 4)]:
+        # setting truncates
+        with option_context("max_info_columns", 4):
+            buf = StringIO()
+            df.info(buf=buf, max_cols=max_cols)
+            res = buf.getvalue()
+            assert len(res.strip().split("\n")) == len_
+
+        # setting wouldn't truncate
+        with option_context("max_info_columns", 5):
+            buf = StringIO()
+            df.info(buf=buf, max_cols=max_cols)
+            res = buf.getvalue()
+            assert len(res.strip().split("\n")) == len_
+
+
+def test_info_memory_usage():
+    # Ensure memory usage is displayed, when asserted, on the last line
+    dtypes = [
+        "int64",
+        "float64",
+        "datetime64[ns]",
+        "timedelta64[ns]",
+        "complex128",
+        "object",
+        "bool",
+    ]
+    data = {}
+    n = 10
+    for i, dtype in enumerate(dtypes):
+        data[i] = np.random.randint(2, size=n).astype(dtype)
+    df = DataFrame(data)
+    buf = StringIO()
+
+    # display memory usage case
+    df.info(buf=buf, memory_usage=True)
+    res = buf.getvalue().splitlines()
+    assert "memory usage: " in res[-1]
+
+    # do not display memory usage case
+    df.info(buf=buf, memory_usage=False)
+    res = buf.getvalue().splitlines()
+    assert "memory usage: " not in res[-1]
+
+    df.info(buf=buf, memory_usage=True)
+    res = buf.getvalue().splitlines()
+
+    # memory usage is a lower bound, so print it as XYZ+ MB
+    assert re.match(r"memory usage: [^+]+\+", res[-1])
+
+    df.iloc[:, :5].info(buf=buf, memory_usage=True)
+    res = buf.getvalue().splitlines()
+
+    # excluded column with object dtype, so estimate is accurate
+    assert not re.match(r"memory usage: [^+]+\+", res[-1])
+
+    # Test a DataFrame with duplicate columns
+    dtypes = ["int64", "int64", "int64", "float64"]
+    data = {}
+    n = 100
+    for i, dtype in enumerate(dtypes):
+        data[i] = np.random.randint(2, size=n).astype(dtype)
+    df = DataFrame(data)
+    df.columns = dtypes
+
+    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
+    df_with_object_index.info(buf=buf, memory_usage=True)
+    res = buf.getvalue().splitlines()
+    assert re.match(r"memory usage: [^+]+\+", res[-1])
+
+    df_with_object_index.info(buf=buf, memory_usage="deep")
+    res = buf.getvalue().splitlines()
+    assert re.match(r"memory usage: [^+]+$", res[-1])
+
+    # Ensure df size is as expected
+    # (cols * rows * bytes) + index size
+    df_size = df.memory_usage().sum()
+    exp_size = len(dtypes) * n * 8 + df.index.nbytes
+    assert df_size == exp_size
+
+    # Ensure number of cols in memory_usage is the same as df
+    size_df = np.size(df.columns.values) + 1  # index=True; default
+    assert size_df == np.size(df.memory_usage())
+
+    # assert deep works only on object
+    assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()
+
+    # test for validity
+    DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True)
+    DataFrame(1, index=["a"], columns=["A"]).index.nbytes
+    df = DataFrame(
+        data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"],
+    )
+    df.index.nbytes
+    df.memory_usage(index=True)
+    df.index.values.nbytes
+
+    mem = df.memory_usage(deep=True).sum()
+    assert mem > 0
+
+
+@pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
+def test_info_memory_usage_deep_not_pypy():
+    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
+    assert (
+        df_with_object_index.memory_usage(index=True, deep=True).sum()
+        > df_with_object_index.memory_usage(index=True).sum()
+    )
+
+    df_object = DataFrame({"a": ["a"]})
+    assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum()
+
+
+@pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result")
+def test_info_memory_usage_deep_pypy():
+    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
+    assert (
+        df_with_object_index.memory_usage(index=True, deep=True).sum()
+        == df_with_object_index.memory_usage(index=True).sum()
+    )
+
+    df_object = DataFrame({"a": ["a"]})
+    assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum()
+
+
+@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
+def test_usage_via_getsizeof():
+    df = DataFrame(
+        data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"],
+    )
+    mem = df.memory_usage(deep=True).sum()
+    # sys.getsizeof will call the .memory_usage with
+    # deep=True, and add on some GC overhead
+    diff = mem - sys.getsizeof(df)
+    assert abs(diff) < 100
+
+
+def test_info_memory_usage_qualified():
+
+    buf = StringIO()
+    df = DataFrame(1, columns=list("ab"), index=[1, 2, 3])
+    df.info(buf=buf)
+    assert "+" not in buf.getvalue()
+
+    buf = StringIO()
+    df = DataFrame(1, columns=list("ab"), index=list("ABC"))
+    df.info(buf=buf)
+    assert "+" in buf.getvalue()
+
+    buf = StringIO()
+    df = DataFrame(
+        1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]),
+    )
+    df.info(buf=buf)
+    assert "+" not in buf.getvalue()
+
+    buf = StringIO()
+    df = DataFrame(
+        1,
+        columns=list("ab"),
+        index=MultiIndex.from_product([range(3), ["foo", "bar"]]),
+    )
+    df.info(buf=buf)
+    assert "+" in buf.getvalue()
+
+
+def test_info_memory_usage_bug_on_multiindex():
+    # GH 14308
+    # memory usage introspection should not materialize .values
+
+    def memory_usage(f):
+        return f.memory_usage(deep=True).sum()
+
+    N = 100
+    M = len(uppercase)
+    index = MultiIndex.from_product(
+        [list(uppercase), date_range("20160101", periods=N)], names=["id", "date"],
+    )
+    df = DataFrame({"value": np.random.randn(N * M)}, index=index)
+
+    unstacked = df.unstack("id")
+    assert df.values.nbytes == unstacked.values.nbytes
+    assert memory_usage(df) > memory_usage(unstacked)
+
+    # high upper bound
+    assert memory_usage(unstacked) - memory_usage(df) < 2000
+
+
+def test_info_categorical():
+    # GH14298
+    idx = CategoricalIndex(["a", "b"])
+    df = DataFrame(np.zeros((2, 2)), index=idx, columns=idx)
+
+    buf = StringIO()
+    df.info(buf=buf)
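
A minimal usage sketch of the relocated helper (assuming this patch is applied; the snippet is illustrative and not part of the diff). DataFrame.info keeps its public behaviour and now delegates to the module-level pandas.io.formats.info.info, which takes the frame as its first argument and writes to any writable buffer:

    import io

    import pandas as pd
    from pandas.io.formats.info import info

    df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})

    # The method API is unchanged; DataFrame.info delegates to the helper.
    df.info()

    # The standalone function writes to any writable buffer instead of
    # sys.stdout, which makes it easy to capture the summary as a string.
    buf = io.StringIO()
    info(df, verbose=True, buf=buf)
    assert "RangeIndex: 3 entries, 0 to 2" in buf.getvalue()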