def _initialize_memory_usage(
    memory_usage: Optional[Union[bool, str]] = None,
) -> Union[bool, str]:
    """Get memory usage based on inputs and display options.

    Parameters
    ----------
    memory_usage : bool or str, optional
        Explicit setting. When None, the ``display.memory_usage`` option
        is looked up instead.

    Returns
    -------
    bool or str
        The explicit setting if one was given, otherwise the option value.
    """
    if memory_usage is None:
        memory_usage = get_option("display.memory_usage")
    return memory_usage


class BaseInfo(ABC):
    """Base class for DataFrameInfo and SeriesInfo.

    Parameters
    ----------
    data : FrameOrSeries
        Either dataframe or series.
    memory_usage : bool or str, optional
        If "deep", introspect the data deeply by interrogating object dtypes
        for system-level memory consumption, and include it in the returned
        values.
    """

    def __init__(
        self,
        data: "FrameOrSeries",
        memory_usage: Optional[Union[bool, str]] = None,
    ):
        self.data = data
        self.memory_usage = _initialize_memory_usage(memory_usage)

    @property
    @abstractmethod
    def ids(self) -> Index:
        """Column names or index names."""

    @property
    @abstractmethod
    def dtype_counts(self) -> Mapping[str, int]:
        """Mapping dtype - number of counts."""

    @property
    @abstractmethod
    def non_null_counts(self) -> Sequence[int]:
        """Sequence of non-null counts for all columns or column (if series)."""

    @property
    @abstractmethod
    def dtypes(self) -> "Series":
        """Dtypes.

        Declared abstract, so subclasses must override it; the body below is
        only reachable through an explicit ``super()`` call.

        Returns
        -------
        dtypes : Series
            Dtype of each of the DataFrame's columns.
        """
        return self.data.dtypes

    @property
    def memory_usage_bytes(self) -> int:
        """Memory usage in bytes.

        Returns
        -------
        memory_usage_bytes : int
            Object's total memory usage in bytes.
        """
        # Only the literal "deep" asks for deep introspection; True does not.
        deep = self.memory_usage == "deep"
        return self.data.memory_usage(index=True, deep=deep).sum()

    @property
    def memory_usage_string(self) -> str:
        """Memory usage in a form of human readable string."""
        return f"{_sizeof_fmt(self.memory_usage_bytes, self.size_qualifier)}\n"

    @property
    def size_qualifier(self) -> str:
        """Suffix marking the reported memory usage as a lower bound.

        Returns
        -------
        str
            "+" when memory usage is shown without deep introspection and
            either an "object" dtype is present or the index reports itself
            as memory-usage qualified; otherwise an empty string.
        """
        # No qualifier when memory usage is hidden, or when "deep" was used.
        if not self.memory_usage or self.memory_usage == "deep":
            return ""
        # size_qualifier is just a best effort; not guaranteed to catch
        # all cases (e.g., it misses categorical data even with object
        # categories)
        if (
            "object" in self.dtype_counts
            or self.data.index._is_memory_usage_qualified()
        ):
            return "+"
        return ""
class InfoPrinter:
    """Class for printing dataframe or series info.

    Parameters
    ----------
    info : DataFrameInfo
        Instance of DataFrameInfo.
    max_cols : int, optional
        When to switch from the verbose to the truncated output.
    verbose : bool, optional
        Whether to print the full summary.
    show_counts : bool, optional
        Whether to show the non-null counts.
    """

    def __init__(
        self,
        info: "DataFrameInfo",
        max_cols: Optional[int] = None,
        verbose: Optional[bool] = None,
        show_counts: Optional[bool] = None,
    ):
        self.info = info
        self.data = info.data
        self.verbose = verbose
        # Order matters: show_counts is derived from max_cols.
        self.max_cols = self._initialize_max_cols(max_cols)
        self.show_counts = self._initialize_show_counts(show_counts)

    @property
    def max_rows(self) -> int:
        """Maximum info rows to be displayed."""
        # get_option has no "default" parameter; the option is read as-is.
        return get_option("display.max_info_rows")

    @property
    def exceeds_info_cols(self) -> bool:
        """Check if number of columns to be summarized exceeds the maximum."""
        return bool(self.col_count > self.max_cols)

    @property
    def exceeds_info_rows(self) -> bool:
        """Check if number of rows to be summarized exceeds the maximum."""
        # NOTE(review): the code this replaces showed counts only when
        # len(data) < max_rows; with ">" a frame of exactly max_rows rows is
        # not treated as exceeding. Confirm the boundary change is intended.
        return bool(len(self.data) > self.max_rows)

    @property
    def col_count(self) -> int:
        """Number of columns to be summarized."""
        return self.info.col_count

    def _initialize_max_cols(self, max_cols: Optional[int]) -> int:
        """Return ``max_cols``, or ``display.max_info_columns`` when None."""
        if max_cols is None:
            return get_option("display.max_info_columns")
        return max_cols

    def _initialize_show_counts(self, show_counts: Optional[bool]) -> bool:
        """Return ``show_counts``, or decide from the size limits when None."""
        if show_counts is None:
            return bool(not self.exceeds_info_cols and not self.exceeds_info_rows)
        return show_counts

    def to_buffer(self, buf: Optional[IO[str]] = None) -> None:
        """Save dataframe info into buffer.

        Parameters
        ----------
        buf : writable buffer, optional
            Where to write the lines; ``sys.stdout`` when None.
        """
        table_builder = self._create_table_builder()
        lines = table_builder.get_lines()
        if buf is None:  # pragma: no cover
            buf = sys.stdout
        fmt.buffer_put_lines(buf, lines)

    def _create_table_builder(self) -> "DataFrameTableBuilder":
        """
        Create instance of table builder based on verbosity and display settings.
        """
        # Non-verbose when explicitly requested (verbose is False), or when
        # verbosity was left unset and there are too many columns.
        if self.verbose is False or (not self.verbose and self.exceeds_info_cols):
            return DataFrameTableBuilderNonVerbose(info=self.info)
        return DataFrameTableBuilderVerbose(
            info=self.info,
            with_counts=self.show_counts,
        )
class TableBuilderAbstract(ABC):
    """Abstract builder for info table.

    Parameters
    ----------
    info : BaseInfo
        Instance of DataFrameInfo or SeriesInfo.
    """

    # Output accumulator; concrete ``get_lines`` implementations (re)create it.
    _lines: List[str]

    def __init__(self, *, info):
        self.info = info

    @abstractmethod
    def get_lines(self) -> List[str]:
        """Product in a form of list of lines (strings)."""


class DataFrameTableBuilder(TableBuilderAbstract):
    """Abstract builder for dataframe info table."""

    def get_lines(self) -> List[str]:
        """Build the table from scratch and return its lines."""
        self._lines = []
        if self.col_count == 0:
            self._fill_empty_info()
        else:
            self._fill_non_empty_info()
        return self._lines

    def _fill_empty_info(self) -> None:
        """Add lines to the info table, pertaining to empty dataframe."""
        self.add_object_type_line()
        self.add_index_range_line()
        self._lines.append(f"Empty {type(self.data).__name__}")

    def _fill_non_empty_info(self) -> None:
        """Add lines to the info table, pertaining to non-empty dataframe."""
        self.add_object_type_line()
        self.add_index_range_line()
        self.add_columns_summary_line()
        self.add_header_line()
        self.add_separator_line()
        self.add_body_lines()
        self.add_dtypes_line()
        if self.display_memory_usage:
            self.add_memory_usage_line()

    @property
    def data(self) -> "DataFrame":
        """DataFrame."""
        return self.info.data

    @property
    def dtype_counts(self) -> Mapping[str, int]:
        """Mapping dtype - number of counts."""
        return self.info.dtype_counts

    @property
    def non_null_counts(self) -> Sequence[int]:
        """Sequence of non-null counts for all columns."""
        return self.info.non_null_counts

    @property
    def display_memory_usage(self) -> bool:
        """Whether to display memory usage."""
        # The underlying setting may be the string "deep"; normalise to bool
        # so the value matches the annotation.
        return bool(self.info.memory_usage)

    @property
    def memory_usage_string(self) -> str:
        """Memory usage string with proper size qualifier."""
        return self.info.memory_usage_string

    @property
    def ids(self) -> Index:
        """Dataframe columns."""
        return self.info.ids

    @property
    def dtypes(self) -> "Series":
        """Dtypes of each of the DataFrame's columns."""
        return self.info.dtypes

    @property
    def col_count(self) -> int:
        """Number of dataframe columns to be summarized."""
        return self.info.col_count

    def add_object_type_line(self) -> None:
        """Add line with string representation of dataframe to the table."""
        self._lines.append(str(type(self.data)))

    def add_index_range_line(self) -> None:
        """Add line with range of indices to the table."""
        self._lines.append(self.data.index._summary())

    @abstractmethod
    def add_columns_summary_line(self) -> None:
        """Add line with columns summary to the table."""

    @abstractmethod
    def add_header_line(self) -> None:
        """Add header line to the table."""

    @abstractmethod
    def add_separator_line(self) -> None:
        """Add separator line between header and body of the table."""

    @abstractmethod
    def add_body_lines(self) -> None:
        """Add content of the table body."""

    def add_dtypes_line(self) -> None:
        """Add summary line with dtypes present in dataframe."""
        collected_dtypes = [
            f"{key}({val:d})" for key, val in sorted(self.dtype_counts.items())
        ]
        self._lines.append(f"dtypes: {', '.join(collected_dtypes)}")

    def add_memory_usage_line(self) -> None:
        """Add line containing memory usage."""
        self._lines.append(f"memory usage: {self.memory_usage_string}")


class DataFrameTableBuilderNonVerbose(DataFrameTableBuilder):
    """Info table builder for non-verbose output."""

    def add_columns_summary_line(self) -> None:
        """Add a one-line summary of the columns."""
        self._lines.append(self.ids._summary(name="Columns"))

    def add_header_line(self) -> None:
        """No header in non-verbose output."""

    def add_separator_line(self) -> None:
        """No separator in non-verbose output."""

    def add_body_lines(self) -> None:
        """No body in non-verbose output."""


class DataFrameTableBuilderVerbose(DataFrameTableBuilder):
    """Info table builder for verbose output.

    Parameters
    ----------
    info : DataFrameInfo
        Instance of DataFrameInfo.
    with_counts : bool
        Whether to include the non-null count column.
    """

    SPACING = " " * 2

    def __init__(
        self,
        *,
        info: "DataFrameInfo",
        with_counts: bool,
    ):
        super().__init__(info=info)
        self.with_counts = with_counts
        # Rows are rendered eagerly: column widths depend on the widest cell.
        self.strrows: Sequence[Sequence[str]] = list(self._gen_rows())
        self.gross_column_widths: Sequence[int] = self._get_gross_column_widths()

    @property
    def headers(self) -> Sequence[str]:
        """Headers names of the columns in verbose table."""
        if self.with_counts:
            return [" # ", "Column", "Non-Null Count", "Dtype"]
        return [" # ", "Column", "Dtype"]

    @property
    def header_column_widths(self) -> Sequence[int]:
        """Widths of header columns (only titles)."""
        return [len(col) for col in self.headers]

    def _gen_rows(self) -> Iterator[Sequence[str]]:
        """Generator function yielding rows content.

        Each element represents a row comprising a sequence of strings.
        """
        if self.with_counts:
            return self._gen_rows_with_counts()
        return self._gen_rows_without_counts()

    def _get_gross_column_widths(self) -> Sequence[int]:
        """Get widths of columns containing both headers and actual content."""
        body_column_widths = self._get_body_column_widths()
        return [
            max(widths)
            for widths in zip(self.header_column_widths, body_column_widths)
        ]

    def _get_body_column_widths(self) -> Sequence[int]:
        """Get widths of table content columns."""
        # Transpose the rows so each entry holds one column's cells.
        strcols: Sequence[Sequence[str]] = list(zip(*self.strrows))
        return [max(len(x) for x in col) for col in strcols]

    def add_columns_summary_line(self) -> None:
        """Add the line stating the total number of columns."""
        self._lines.append(f"Data columns (total {self.col_count} columns):")

    def add_header_line(self) -> None:
        """Add the header titles, each padded to its column width."""
        header_line = self.SPACING.join(
            [
                _put_str(header, col_width)
                for header, col_width in zip(self.headers, self.gross_column_widths)
            ]
        )
        self._lines.append(header_line)

    def add_separator_line(self) -> None:
        """Add dashes under each header title, padded to the column width."""
        separator_line = self.SPACING.join(
            [
                _put_str("-" * header_colwidth, gross_colwidth)
                for header_colwidth, gross_colwidth in zip(
                    self.header_column_widths, self.gross_column_widths
                )
            ]
        )
        self._lines.append(separator_line)

    def add_body_lines(self) -> None:
        """Add one padded line per pre-rendered row."""
        for row in self.strrows:
            body_line = self.SPACING.join(
                [
                    _put_str(col, gross_colwidth)
                    for col, gross_colwidth in zip(row, self.gross_column_widths)
                ]
            )
            self._lines.append(body_line)

    def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
        """Iterator with string representation of body data without counts."""
        yield from zip(
            self._gen_line_numbers(),
            self._gen_columns(),
            self._gen_dtypes(),
        )

    def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
        """Iterator with string representation of body data with counts."""
        yield from zip(
            self._gen_line_numbers(),
            self._gen_columns(),
            self._gen_non_null_counts(),
            self._gen_dtypes(),
        )

    def _gen_line_numbers(self) -> Iterator[str]:
        """Iterator with string representation of column numbers."""
        for i, _ in enumerate(self.ids):
            yield f" {i}"

    def _gen_columns(self) -> Iterator[str]:
        """Iterator with string representation of column names."""
        for col in self.ids:
            yield pprint_thing(col)

    def _gen_dtypes(self) -> Iterator[str]:
        """Iterator with string representation of column dtypes."""
        for dtype in self.dtypes:
            yield pprint_thing(dtype)

    def _gen_non_null_counts(self) -> Iterator[str]:
        """Iterator with string representation of non-null counts."""
        for count in self.non_null_counts:
            yield f"{count} non-null"
DataFrame(tm.getTimeSeriesData()) +def test_info_empty(): + df = DataFrame() + buf = StringIO() + df.info(buf=buf) + result = buf.getvalue() + expected = textwrap.dedent( + """\ + <class 'pandas.core.frame.DataFrame'> + Index: 0 entries + Empty DataFrame""" + ) + assert result == expected + + def test_info_categorical_column(): # make sure it works