Skip to content

Commit 1d7623f

Browse files
jbrockmendel authored and jreback committed
REF: Simplify quantile, remove reduction from BlockManager (#24597)
1 parent c9a0405 commit 1d7623f

File tree

3 files changed

+124
-82
lines changed

3 files changed

+124
-82
lines changed

pandas/core/internals/blocks.py

Lines changed: 32 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
from pandas.core.indexing import check_setitem_lengths
4343
from pandas.core.internals.arrays import extract_array
4444
import pandas.core.missing as missing
45+
from pandas.core.nanops import nanpercentile
4546

4647
from pandas.io.formats.printing import pprint_thing
4748

@@ -1438,7 +1439,7 @@ def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
14381439
blocks = [make_block(new_values, placement=new_placement)]
14391440
return blocks, mask
14401441

1441-
def quantile(self, qs, interpolation='linear', axis=0, axes=None):
1442+
def quantile(self, qs, interpolation='linear', axis=0):
14421443
"""
14431444
compute the quantiles of the
14441445
@@ -1447,94 +1448,53 @@ def quantile(self, qs, interpolation='linear', axis=0, axes=None):
14471448
qs: a scalar or list of the quantiles to be computed
14481449
interpolation: type of interpolation, default 'linear'
14491450
axis: axis to compute, default 0
1450-
axes : BlockManager.axes
14511451
14521452
Returns
14531453
-------
1454-
tuple of (axis, block)
1455-
1454+
Block
14561455
"""
1457-
kw = {'interpolation': interpolation}
14581456
values = self.get_values()
14591457
values, _ = self._try_coerce_args(values, values)
14601458

1461-
def _nanpercentile1D(values, mask, q, **kw):
1462-
# mask is Union[ExtensionArray, ndarray]
1463-
values = values[~mask]
1464-
1465-
if len(values) == 0:
1466-
if lib.is_scalar(q):
1467-
return self._na_value
1468-
else:
1469-
return np.array([self._na_value] * len(q),
1470-
dtype=values.dtype)
1471-
1472-
return np.percentile(values, q, **kw)
1473-
1474-
def _nanpercentile(values, q, axis, **kw):
1475-
1476-
mask = isna(self.values)
1477-
if not lib.is_scalar(mask) and mask.any():
1478-
if self.ndim == 1:
1479-
return _nanpercentile1D(values, mask, q, **kw)
1480-
else:
1481-
# for nonconsolidatable blocks mask is 1D, but values 2D
1482-
if mask.ndim < values.ndim:
1483-
mask = mask.reshape(values.shape)
1484-
if axis == 0:
1485-
values = values.T
1486-
mask = mask.T
1487-
result = [_nanpercentile1D(val, m, q, **kw) for (val, m)
1488-
in zip(list(values), list(mask))]
1489-
result = np.array(result, dtype=values.dtype, copy=False).T
1490-
return result
1491-
else:
1492-
return np.percentile(values, q, axis=axis, **kw)
1493-
1494-
from pandas import Float64Index
14951459
is_empty = values.shape[axis] == 0
1496-
if is_list_like(qs):
1497-
ax = Float64Index(qs)
1460+
orig_scalar = not is_list_like(qs)
1461+
if orig_scalar:
1462+
# make list-like, unpack later
1463+
qs = [qs]
14981464

1499-
if is_empty:
1500-
if self.ndim == 1:
1501-
result = self._na_value
1502-
else:
1503-
# create the array of na_values
1504-
# 2d len(values) * len(qs)
1505-
result = np.repeat(np.array([self._na_value] * len(qs)),
1506-
len(values)).reshape(len(values),
1507-
len(qs))
1465+
if is_empty:
1466+
if self.ndim == 1:
1467+
result = self._na_value
15081468
else:
1509-
result = _nanpercentile(values, np.array(qs) * 100,
1510-
axis=axis, **kw)
1511-
1512-
result = np.array(result, copy=False)
1513-
if self.ndim > 1:
1514-
result = result.T
1515-
1469+
# create the array of na_values
1470+
# 2d len(values) * len(qs)
1471+
result = np.repeat(np.array([self._na_value] * len(qs)),
1472+
len(values)).reshape(len(values),
1473+
len(qs))
15161474
else:
1475+
mask = isna(self.values)
1476+
result = nanpercentile(values, np.array(qs) * 100,
1477+
axis=axis, na_value=self._na_value,
1478+
mask=mask, ndim=self.ndim,
1479+
interpolation=interpolation)
15171480

1518-
if self.ndim == 1:
1519-
ax = Float64Index([qs])
1520-
else:
1521-
ax = axes[0]
1481+
result = np.array(result, copy=False)
1482+
if self.ndim > 1:
1483+
result = result.T
15221484

1523-
if is_empty:
1524-
if self.ndim == 1:
1525-
result = self._na_value
1526-
else:
1527-
result = np.array([self._na_value] * len(self))
1528-
else:
1529-
result = _nanpercentile(values, qs * 100, axis=axis, **kw)
1485+
if orig_scalar and not lib.is_scalar(result):
1486+
# result could be scalar in case with is_empty and self.ndim == 1
1487+
assert result.shape[-1] == 1, result.shape
1488+
result = result[..., 0]
1489+
result = lib.item_from_zerodim(result)
15301490

15311491
ndim = getattr(result, 'ndim', None) or 0
15321492
result = self._try_coerce_result(result)
15331493
if lib.is_scalar(result):
1534-
return ax, self.make_block_scalar(result)
1535-
return ax, make_block(result,
1536-
placement=np.arange(len(result)),
1537-
ndim=ndim)
1494+
return self.make_block_scalar(result)
1495+
return make_block(result,
1496+
placement=np.arange(len(result)),
1497+
ndim=ndim)
15381498

15391499
def _replace_coerce(self, to_replace, value, inplace=True, regex=False,
15401500
convert=False, mask=None):

pandas/core/internals/managers.py

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
maybe_promote)
1717
from pandas.core.dtypes.common import (
1818
_NS_DTYPE, is_datetimelike_v_numeric, is_extension_array_dtype,
19-
is_extension_type, is_numeric_v_string_like, is_scalar)
19+
is_extension_type, is_list_like, is_numeric_v_string_like, is_scalar)
2020
import pandas.core.dtypes.concat as _concat
2121
from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries
2222
from pandas.core.dtypes.missing import isna
@@ -402,34 +402,47 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False,
402402
bm._consolidate_inplace()
403403
return bm
404404

405-
def reduction(self, f, axis=0, consolidate=True, transposed=False,
406-
**kwargs):
405+
def quantile(self, axis=0, consolidate=True, transposed=False,
406+
interpolation='linear', qs=None, numeric_only=None):
407407
"""
408-
iterate over the blocks, collect and create a new block manager.
408+
Iterate over blocks applying quantile reduction.
409409
This routine is intended for reduction type operations and
410410
will do inference on the generated blocks.
411411
412412
Parameters
413413
----------
414-
f: the callable or function name to operate on at the block level
415414
axis: reduction axis, default 0
416415
consolidate: boolean, default True. Join together blocks having same
417416
dtype
418417
transposed: boolean, default False
419418
we are holding transposed data
419+
interpolation : type of interpolation, default 'linear'
420+
qs : a scalar or list of the quantiles to be computed
421+
numeric_only : ignored
420422
421423
Returns
422424
-------
423425
Block Manager (new object)
424-
425426
"""
426427

427428
if consolidate:
428429
self._consolidate_inplace()
429430

431+
def get_axe(block, qs, axes):
432+
from pandas import Float64Index
433+
if is_list_like(qs):
434+
ax = Float64Index(qs)
435+
elif block.ndim == 1:
436+
ax = Float64Index([qs])
437+
else:
438+
ax = axes[0]
439+
return ax
440+
430441
axes, blocks = [], []
431442
for b in self.blocks:
432-
axe, block = getattr(b, f)(axis=axis, axes=self.axes, **kwargs)
443+
block = b.quantile(axis=axis, qs=qs, interpolation=interpolation)
444+
445+
axe = get_axe(b, qs, axes=self.axes)
433446

434447
axes.append(axe)
435448
blocks.append(block)
@@ -496,9 +509,6 @@ def isna(self, func, **kwargs):
496509
def where(self, **kwargs):
497510
return self.apply('where', **kwargs)
498511

499-
def quantile(self, **kwargs):
500-
return self.reduction('quantile', **kwargs)
501-
502512
def setitem(self, **kwargs):
503513
return self.apply('setitem', **kwargs)
504514

pandas/core/nanops.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1194,3 +1194,75 @@ def f(x, y):
11941194
nanle = make_nancomp(operator.le)
11951195
naneq = make_nancomp(operator.eq)
11961196
nanne = make_nancomp(operator.ne)
1197+
1198+
1199+
# np.percentile renamed its ``interpolation`` keyword to ``method`` in
# NumPy 1.22 and deprecated the old spelling; choose the keyword that the
# installed NumPy understands once, at import time.
_PERCENTILE_KWARG = ('method'
                     if np.lib.NumpyVersion(np.__version__) >= '1.22.0'
                     else 'interpolation')


def _percentile(values, q, interpolation, axis=None):
    """
    Call np.percentile, passing the interpolation scheme under the keyword
    name expected by the installed NumPy.

    Parameters
    ----------
    values : array over which to find quantiles
    q : scalar or array of quantile indices to find
    interpolation : str
    axis : int or None, default None
        forwarded unchanged to np.percentile

    Returns
    -------
    quantiles : scalar or array
    """
    kwargs = {_PERCENTILE_KWARG: interpolation}
    return np.percentile(values, q, axis=axis, **kwargs)


def _nanpercentile_1d(values, mask, q, na_value, interpolation):
    """
    Wrapper for np.percentile that skips missing values, specialized to
    1-dimensional case.

    Parameters
    ----------
    values : array over which to find quantiles
    mask : ndarray[bool]
        locations in values that should be considered missing
    q : scalar or array of quantile indices to find
    na_value : scalar
        value to return for empty or all-null values
    interpolation : str

    Returns
    -------
    quantiles : scalar or array
    """
    # mask is Union[ExtensionArray, ndarray]
    values = values[~mask]

    if len(values) == 0:
        if lib.is_scalar(q):
            return na_value
        # Let NumPy infer the dtype from na_value instead of forcing
        # values.dtype: na_value (e.g. np.nan) may not be representable in
        # values.dtype (e.g. int64), in which case the cast would raise.
        return np.full(len(q), na_value)

    return _percentile(values, q, interpolation)


def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation):
    """
    Wrapper for np.percentile that skips missing values.

    Parameters
    ----------
    values : array over which to find quantiles
    q : scalar or array of quantile indices to find
    axis : {0, 1}
    na_value : scalar
        value to return for empty or all-null values
    mask : ndarray[bool]
        locations in values that should be considered missing
    ndim : {1, 2}
    interpolation : str

    Returns
    -------
    quantiles : scalar or array
    """
    if lib.is_scalar(mask) or not mask.any():
        # nothing is missing, np.percentile can reduce along axis directly
        return _percentile(values, q, interpolation, axis=axis)

    if ndim == 1:
        return _nanpercentile_1d(values, mask, q, na_value,
                                 interpolation=interpolation)

    # for nonconsolidatable blocks mask is 1D, but values 2D
    if mask.ndim < values.ndim:
        mask = mask.reshape(values.shape)
    if axis == 0:
        values = values.T
        mask = mask.T

    # each 1-D slice can have a different number of missing entries, so the
    # slices are reduced one at a time
    result = [_nanpercentile_1d(val, m, q, na_value,
                                interpolation=interpolation)
              for (val, m) in zip(values, mask)]
    # np.asarray rather than np.array(..., copy=False): building an array
    # from a list always needs a copy, which NumPy >= 2 rejects when
    # copy=False is requested.
    return np.asarray(result, dtype=values.dtype).T

0 commit comments

Comments
 (0)