diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index ce8d8d5c2ca10..1101379752958 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -33,6 +33,7 @@ objects. :toctree: api/ api.extensions.ExtensionArray._concat_same_type + api.extensions.ExtensionArray._format_array api.extensions.ExtensionArray._formatter api.extensions.ExtensionArray._from_factorized api.extensions.ExtensionArray._from_sequence diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index db5cce8459ca2..71856f1c0448a 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -204,6 +204,7 @@ Other enhancements - :meth:`IntegerArray.all` , :meth:`IntegerArray.any`, :meth:`FloatingArray.any`, and :meth:`FloatingArray.all` use Kleene logic (:issue:`41967`) - Added support for nullable boolean and integer types in :meth:`DataFrame.to_stata`, :class:`~pandas.io.stata.StataWriter`, :class:`~pandas.io.stata.StataWriter117`, and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`40855`) - :meth:`DataFrame.__pos__`, :meth:`DataFrame.__neg__` now retain ``ExtensionDtype`` dtypes (:issue:`43883`) +- Added :meth:`api.extensions.ExtensionArray._format_array` for extension arrays to control how they are formatted in ``Series`` and ``DataFrame`` (:issue:`26837`) - The error raised when an optional dependency can't be imported now includes the original exception, for easier investigation (:issue:`43882`) - Added :meth:`.ExponentialMovingWindow.sum` (:issue:`13297`) - :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. 
Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index a64aef64ab49f..584aad6af4e8c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -29,6 +29,7 @@ AstypeArg, Dtype, FillnaOptions, + FloatFormatType, PositionalIndexer, ScalarIndexer, SequenceIndexer, @@ -137,6 +138,7 @@ class ExtensionArray: view _concat_same_type _formatter + _format_array _from_factorized _from_sequence _from_sequence_of_strings @@ -167,6 +169,8 @@ class ExtensionArray: * __repr__ : A default repr for the ExtensionArray. * _formatter : Print scalars inside a Series or DataFrame. + * _format_array : Full control over formatting an ExtensionArray + to be included in a Series or DataFrame. Some methods require casting the ExtensionArray to an ndarray of Python objects with ``self.astype(object)``, which may be expensive. When @@ -1232,6 +1236,105 @@ def _repr_2d(self) -> str: class_name = f"<{type(self).__name__}>" return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}" + def _format_array( + self, + formatter: Callable | None, + *, + float_format: FloatFormatType, + na_rep: str = "NaN", + digits: int, + space: str | int, + justify: str = "right", + decimal: str = ".", + leading_space: bool | None = True, + quoting: int | None = None, + ) -> list[str]: + """ + Format an array of values. + + This is called from both the Series and DataFrame reprs. By default, + the ExtensionArray is converted to a NumPy array and formatted using + pandas' normal formatting methods. + + .. versionadded:: 1.4.0 + + Parameters + ---------- + formatter : Callable, optional + The function to apply to each element of the array to convert it + to a string. By default, `self._formatter` is used. + float_format : one-parameter function, optional, default None + Formatter function to apply to columns' elements if they are + floats. 
This function must return a unicode string and will be + applied only to the non-``NaN`` elements, with ``NaN`` being + handled by ``na_rep``. + na_rep : str, optional, default 'NaN' + String representation of ``NaN`` to use. + digits : int, optional + Display precision in terms of decimal places. Defaults to + ``pandas.options.display.precision``. + space : int, optional + Defaults to ``pandas.options.display.column_space``. + justify : str, default 'right' + How to justify the column labels. If None uses the option from + the print configuration (controlled by set_option), 'right' out + of the box. Valid values are + + * left + * right + * center + * justify + * justify-all + * start + * end + * inherit + * match-parent + * initial + * unset. + + decimal : str, default '.' + Character recognized as decimal separator, e.g. ',' in Europe. + + leading_space : bool, optional, default True + Whether the array should be formatted with a leading space. + When an array is a column of a Series or DataFrame, we do want + the leading space to pad between columns. + + When formatting an Index subclass + (e.g. IntervalIndex._format_native_types), we don't want the + leading space since it should be left-aligned. + + Returns + ------- + list[str] + The list of formatted values for the array. 
+ """ + from pandas.core.construction import extract_array + + from pandas.io.formats.format import format_array + + values = extract_array(self, extract_numpy=True) + + if formatter is None: + # error: Item "ndarray" of "Union[Any, Union[ExtensionArray, ndarray]]" has + # no attribute "_formatter" + formatter = values._formatter(boxed=True) # type: ignore[union-attr] + + array = np.asarray(values) + fmt_values = format_array( + array, + formatter, + float_format=float_format, + na_rep=na_rep, + digits=digits, + space=space, + justify=justify, + decimal=decimal, + leading_space=leading_space, + quoting=quoting, + ) + return fmt_values + def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]: """ Formatting function for scalar values. diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4c6a32ff1ba4e..89a2f374c516f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -6,6 +6,7 @@ from shutil import get_terminal_size from typing import ( TYPE_CHECKING, + Callable, Hashable, Sequence, TypeVar, @@ -35,6 +36,7 @@ ArrayLike, AstypeArg, Dtype, + FloatFormatType, NpDtype, Ordered, Shape, @@ -1950,6 +1952,36 @@ def __contains__(self, key) -> bool: # ------------------------------------------------------------------ # Rendering Methods + def _format_array( + self, + formatter: Callable | None, + *, + float_format: FloatFormatType, + na_rep: str = "NaN", + digits: int, + space: str | int, + justify: str = "right", + decimal: str = ".", + leading_space: bool | None = True, + quoting: int | None = None, + ) -> list[str]: + from pandas.io.formats.format import format_array + + array = self._internal_get_values() + fmt_values = format_array( + array, + formatter, + float_format=float_format, + na_rep=na_rep, + digits=digits, + space=space, + justify=justify, + decimal=decimal, + leading_space=leading_space, + quoting=quoting, + ) + return fmt_values + def _formatter(self, boxed: bool = 
False): # Defer to CategoricalFormatter's formatter. return None diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 7bd3403abd5cc..1aeaa67803679 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -8,6 +8,7 @@ ) from typing import ( TYPE_CHECKING, + Callable, Literal, ) import warnings @@ -36,7 +37,10 @@ to_offset, tzconversion, ) -from pandas._typing import npt +from pandas._typing import ( + FloatFormatType, + npt, +) from pandas.errors import PerformanceWarning from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_inclusive @@ -680,6 +684,46 @@ def _format_native_types( self.asi8, tz=self.tz, format=fmt, na_rep=na_rep ) + def _format_array( + self, + formatter: Callable | None, + *, + float_format: FloatFormatType, + na_rep: str = "NaN", + digits: int, + space: str | int, + justify: str = "right", + decimal: str = ".", + leading_space: bool | None = True, + quoting: int | None = None, + ) -> list[str]: + from pandas.io.formats.format import ( + Datetime64Formatter, + Datetime64TZFormatter, + ) + + fmt_klass: type[Datetime64Formatter] | type[Datetime64TZFormatter] + + if is_datetime64tz_dtype(self.dtype): + fmt_klass = Datetime64TZFormatter + else: + fmt_klass = Datetime64Formatter + + fmt_obj = fmt_klass( + self, + digits=digits, + na_rep=na_rep, + float_format=float_format, + formatter=formatter, + space=space, + justify=justify, + decimal=decimal, + leading_space=leading_space, + quoting=quoting, + ) + + return fmt_obj.get_result() + # ----------------------------------------------------------------- # Comparison Methods diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 616331bf80a44..735d1e6b5b656 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1276,14 +1276,36 @@ def format_array( List[str] """ fmt_klass: type[GenericArrayFormatter] - if is_datetime64_dtype(values.dtype): + + if space is 
None: + space = get_option("display.column_space") + + if float_format is None: + float_format = get_option("display.float_format") + + if digits is None: + digits = get_option("display.precision") + + values = extract_array(values, extract_numpy=True) + + if is_extension_array_dtype(values): + return values._format_array( + formatter, + float_format=float_format, + na_rep=na_rep, + digits=digits, + space=space, + justify=justify, + decimal=decimal, + leading_space=leading_space, + quoting=quoting, + ) + elif is_datetime64_dtype(values.dtype): fmt_klass = Datetime64Formatter elif is_datetime64tz_dtype(values.dtype): fmt_klass = Datetime64TZFormatter elif is_timedelta64_dtype(values.dtype): fmt_klass = Timedelta64Formatter - elif is_extension_array_dtype(values.dtype): - fmt_klass = ExtensionArrayFormatter elif is_float_dtype(values.dtype) or is_complex_dtype(values.dtype): fmt_klass = FloatArrayFormatter elif is_integer_dtype(values.dtype): @@ -1291,15 +1313,6 @@ def format_array( else: fmt_klass = GenericArrayFormatter - if space is None: - space = get_option("display.column_space") - - if float_format is None: - float_format = get_option("display.float_format") - - if digits is None: - digits = get_option("display.precision") - fmt_obj = fmt_klass( values, digits=digits, @@ -1633,37 +1646,6 @@ def _format_strings(self) -> list[str]: return fmt_values.tolist() -class ExtensionArrayFormatter(GenericArrayFormatter): - def _format_strings(self) -> list[str]: - values = extract_array(self.values, extract_numpy=True) - - formatter = self.formatter - if formatter is None: - # error: Item "ndarray" of "Union[Any, Union[ExtensionArray, ndarray]]" has - # no attribute "_formatter" - formatter = values._formatter(boxed=True) # type: ignore[union-attr] - - if isinstance(values, Categorical): - # Categorical is special for now, so that we can preserve tzinfo - array = values._internal_get_values() - else: - array = np.asarray(values) - - fmt_values = format_array( - array, - 
formatter, - float_format=self.float_format, - na_rep=self.na_rep, - digits=self.digits, - space=self.space, - justify=self.justify, - decimal=self.decimal, - leading_space=self.leading_space, - quoting=self.quoting, - ) - return fmt_values - - def format_percentiles( percentiles: (np.ndarray | list[int | float] | list[float] | list[str | float]), ) -> list[str]: diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py index e43650c291200..1aa0e0b189656 100644 --- a/pandas/tests/extension/test_common.py +++ b/pandas/tests/extension/test_common.py @@ -1,6 +1,12 @@ +from __future__ import annotations + +from typing import Callable + import numpy as np import pytest +from pandas._typing import FloatFormatType + from pandas.core.dtypes import dtypes from pandas.core.dtypes.common import is_extension_array_dtype @@ -10,7 +16,8 @@ class DummyDtype(dtypes.ExtensionDtype): - pass + type = object + name = "dummy" class DummyArray(ExtensionArray): @@ -33,6 +40,29 @@ def astype(self, dtype, copy=True): return np.array(self, dtype=dtype, copy=copy) + def __len__(self) -> int: + return len(self.data) + + +class DummyArrayNoAsarray(DummyArray): + def __array__(self, dtype=None): + raise ValueError("Cannot be converted to an array!") + + def _format_array( + self, + formatter: Callable | None, + *, + float_format: FloatFormatType, + na_rep: str = "NaN", + digits: int, + space: str | int, + justify: str = "right", + decimal: str = ".", + leading_space: bool | None = True, + quoting: int | None = None, + ): + return ["" for _ in self.data] + class TestExtensionArrayDtype: @pytest.mark.parametrize( @@ -79,3 +109,13 @@ def test_astype_no_copy(): def test_is_extension_array_dtype(dtype): assert isinstance(dtype, dtypes.ExtensionDtype) assert is_extension_array_dtype(dtype) + + +def test_repr_no_conversion(): + # https://github.com/pandas-dev/pandas/issues/26837#issuecomment-967268492 + # Validates that the repr never converts the array via __array__ + ser = pd.Series(DummyArrayNoAsarray([1])) + 
repr(ser) # OK! + + df = pd.DataFrame({"A": DummyArrayNoAsarray([1])}, copy=False) + repr(df) # OK!