159 changes: 10 additions & 149 deletions pandas/core/generic.py
@@ -55,11 +55,7 @@
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError, InvalidIndexError
from pandas.util._decorators import doc, rewrite_axis_style_signature
from pandas.util._validators import (
validate_bool_kwarg,
validate_fillna_kwargs,
validate_percentile,
)
from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs

from pandas.core.dtypes.common import (
ensure_int64,
@@ -109,11 +105,8 @@
from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window

from pandas.io.formats import format as fmt
from pandas.io.formats.format import (
DataFrameFormatter,
DataFrameRenderer,
format_percentiles,
)
from pandas.io.formats.describe import describe_ndframe
from pandas.io.formats.format import DataFrameFormatter, DataFrameRenderer
from pandas.io.formats.printing import pprint_thing

if TYPE_CHECKING:
Expand Down Expand Up @@ -10237,145 +10230,13 @@ def describe(
75% NaN 2.5
max NaN 3.0
"""
if self.ndim == 2 and self.columns.size == 0:
raise ValueError("Cannot describe a DataFrame without columns")

if percentiles is not None:
# explicit conversion of `percentiles` to list
percentiles = list(percentiles)

# get them all to be in [0, 1]
validate_percentile(percentiles)

# median should always be included
if 0.5 not in percentiles:
percentiles.append(0.5)
percentiles = np.asarray(percentiles)
else:
percentiles = np.array([0.25, 0.5, 0.75])

# sort and check for duplicates
unique_pcts = np.unique(percentiles)
if len(unique_pcts) < len(percentiles):
raise ValueError("percentiles cannot contain duplicates")
percentiles = unique_pcts

formatted_percentiles = format_percentiles(percentiles)
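For reference while reading the removed block above, a self-contained sketch of the percentile handling: default to the quartiles, force the median in, and reject out-of-range or duplicate values. normalize_percentiles is an illustrative name, and the explicit range check merely stands in for pandas' internal validate_percentile.

import numpy as np

def normalize_percentiles(percentiles=None):
    # Stand-alone illustration of the logic above, not a pandas API.
    if percentiles is None:
        return np.array([0.25, 0.5, 0.75])
    percentiles = list(percentiles)
    if any(p < 0 or p > 1 for p in percentiles):  # stands in for validate_percentile
        raise ValueError("percentiles should all be in the interval [0, 1]")
    if 0.5 not in percentiles:
        percentiles.append(0.5)  # the median is always reported
    unique_pcts = np.unique(percentiles)  # np.unique also sorts
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")
    return unique_pcts

print(normalize_percentiles([0.9, 0.1]))  # [0.1 0.5 0.9]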

def describe_numeric_1d(series) -> "Series":
stat_index = (
["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
)
d = (
[series.count(), series.mean(), series.std(), series.min()]
+ series.quantile(percentiles).tolist()
+ [series.max()]
)
return pd.Series(d, index=stat_index, name=series.name)
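The numeric path above reduces to a handful of Series reductions plus quantiles. A minimal stand-alone illustration using only public pandas API; the data and column name are made up:

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0], name="x")
percentiles = [0.25, 0.5, 0.75]
stats = (
    [s.count(), s.mean(), s.std(), s.min()]
    + s.quantile(percentiles).tolist()
    + [s.max()]
)
index = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
print(pd.Series(stats, index=index, name=s.name))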

def describe_categorical_1d(data) -> "Series":
names = ["count", "unique"]
objcounts = data.value_counts()
count_unique = len(objcounts[objcounts != 0])
result = [data.count(), count_unique]
dtype = None
if result[1] > 0:
top, freq = objcounts.index[0], objcounts.iloc[0]
if is_datetime64_any_dtype(data.dtype):
if self.ndim == 1:
stacklevel = 4
else:
stacklevel = 5
warnings.warn(
"Treating datetime data as categorical rather than numeric in "
"`.describe` is deprecated and will be removed in a future "
"version of pandas. Specify `datetime_is_numeric=True` to "
"silence this warning and adopt the future behavior now.",
FutureWarning,
stacklevel=stacklevel,
)
tz = data.dt.tz
asint = data.dropna().values.view("i8")
top = Timestamp(top)
if top.tzinfo is not None and tz is not None:
# Don't tz_localize(None) if key is already tz-aware
top = top.tz_convert(tz)
else:
top = top.tz_localize(tz)
names += ["top", "freq", "first", "last"]
result += [
top,
freq,
Timestamp(asint.min(), tz=tz),
Timestamp(asint.max(), tz=tz),
]
else:
names += ["top", "freq"]
result += [top, freq]

# If the DataFrame is empty, set 'top' and 'freq' to None
# to maintain output shape consistency
else:
names += ["top", "freq"]
result += [np.nan, np.nan]
dtype = "object"

return pd.Series(result, index=names, name=data.name, dtype=dtype)
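The non-numeric path above boils down to count/unique/top/freq derived from value_counts; the datetime branch additionally reports first/last and is deprecated in favor of datetime_is_numeric=True. A minimal sketch of the common object-dtype case, with made-up data:

import pandas as pd

s = pd.Series(["a", "b", "a", None], name="letters")
objcounts = s.value_counts()  # descending counts, NaN excluded
count_unique = len(objcounts[objcounts != 0])
summary = pd.Series(
    [s.count(), count_unique, objcounts.index[0], objcounts.iloc[0]],
    index=["count", "unique", "top", "freq"],
    name=s.name,
)
print(summary)  # count 3, unique 2, top 'a', freq 2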

def describe_timestamp_1d(data) -> "Series":
# GH-30164
stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
d = (
[data.count(), data.mean(), data.min()]
+ data.quantile(percentiles).tolist()
+ [data.max()]
)
return pd.Series(d, index=stat_index, name=data.name)
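With datetime_is_numeric=True, datetime columns get the numeric-style summary shown above (std is omitted). A small stand-alone example with made-up dates, assuming a pandas version in which Series.mean and Series.quantile are defined for datetime data, as they are in the versions this PR targets:

import pandas as pd

s = pd.Series(pd.to_datetime(["2020-01-01", "2020-01-03", "2020-01-05"]), name="when")
percentiles = [0.25, 0.5, 0.75]
stats = (
    [s.count(), s.mean(), s.min()]
    + s.quantile(percentiles).tolist()
    + [s.max()]
)
index = ["count", "mean", "min", "25%", "50%", "75%", "max"]
print(pd.Series(stats, index=index, name=s.name))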

def describe_1d(data) -> "Series":
if is_bool_dtype(data.dtype):
return describe_categorical_1d(data)
elif is_numeric_dtype(data):
return describe_numeric_1d(data)
elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
return describe_timestamp_1d(data)
elif is_timedelta64_dtype(data.dtype):
return describe_numeric_1d(data)
else:
return describe_categorical_1d(data)
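The dispatch above chooses a summary style purely from the dtype, and the ordering matters: bool columns satisfy is_numeric_dtype, so they are caught by the bool check first and summarized as categorical. A sketch of the same decision table using the public pandas.api.types predicates; kind_of is an illustrative name, not pandas API:

import pandas as pd
from pandas.api.types import (
    is_bool_dtype,
    is_datetime64_any_dtype,
    is_numeric_dtype,
    is_timedelta64_dtype,
)

def kind_of(dtype, datetime_is_numeric=False):
    # Same order as describe_1d above.
    if is_bool_dtype(dtype):
        return "categorical"
    if is_numeric_dtype(dtype):
        return "numeric"
    if is_datetime64_any_dtype(dtype) and datetime_is_numeric:
        return "timestamp"
    if is_timedelta64_dtype(dtype):
        return "numeric"
    return "categorical"

print(kind_of(pd.Series([True, False]).dtype))                    # categorical
print(kind_of(pd.Series([1.5, 2.5]).dtype))                       # numeric
print(kind_of(pd.Series(pd.to_datetime(["2020-01-01"])).dtype))   # categorical (legacy default)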

if self.ndim == 1:
# Incompatible return value type
# (got "Series", expected "FrameOrSeries") [return-value]
return describe_1d(self) # type:ignore[return-value]
elif (include is None) and (exclude is None):
# when some numerics are found, keep only numerics
default_include = [np.number]
if datetime_is_numeric:
default_include.append("datetime")
data = self.select_dtypes(include=default_include)
if len(data.columns) == 0:
data = self
elif include == "all":
if exclude is not None:
msg = "exclude must be None when include is 'all'"
raise ValueError(msg)
data = self
else:
data = self.select_dtypes(include=include, exclude=exclude)
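Column selection above is a plain select_dtypes call: by default only numeric columns (plus datetime columns once datetime_is_numeric=True), falling back to all columns when nothing matches. A small illustration with made-up columns:

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "n": [1, 2],
        "s": ["a", "b"],
        "t": pd.to_datetime(["2020-01-01", "2020-01-02"]),
    }
)
default_include = [np.number, "datetime"]  # the "datetime" entry only when opted in
data = df.select_dtypes(include=default_include)
print(list(data.columns))  # ['n', 't']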

ldesc = [describe_1d(s) for _, s in data.items()]
# set a convenient order for rows
names: List[Label] = []
ldesc_indexes = sorted((x.index for x in ldesc), key=len)
for idxnames in ldesc_indexes:
for name in idxnames:
if name not in names:
names.append(name)

d = pd.concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
d.columns = data.columns.copy()
return d
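The final assembly above takes the union of row labels across the per-column summaries (shorter indexes contribute their labels first) and aligns every summary with reindex before concatenating. A compact stand-alone illustration with two hand-built summaries standing in for a numeric and a categorical column:

import pandas as pd

num = pd.Series([3.0, 2.0, 1.0, 3.0], index=["count", "mean", "min", "max"], name="x")
cat = pd.Series([3, 2], index=["count", "unique"], name="y")
ldesc = [num, cat]

names = []
for idxnames in sorted((s.index for s in ldesc), key=len):
    for name in idxnames:
        if name not in names:
            names.append(name)

d = pd.concat([s.reindex(names, copy=False) for s in ldesc], axis=1, sort=False)
print(d)  # rows: count, unique, mean, min, max; columns: x, y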
return describe_ndframe(
data=self,
include=include,
exclude=exclude,
datetime_is_numeric=datetime_is_numeric,
percentiles=percentiles,
)
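After this change the method body is a thin delegation to describe_ndframe in pandas.io.formats.describe (an internal module); the public behavior is unchanged. A quick check through the public API, with made-up data:

import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": ["x", "y", "x"]})
print(df.describe())                              # numeric columns only
print(df.describe(include="all"))                 # numeric and object columns together
print(df["a"].describe(percentiles=[0.1, 0.9]))   # Series path with custom percentiles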

@final
def pct_change(