diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index e311cf34ffbc2..129cca503d69f 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -17,6 +17,9 @@ New features - ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) +- :func:`quantile` and :func:`qcut` now accept ``bounded`` as a keyword + argument, allowing for unbounded quantiles such that the lower/upper bounds are -inf/inf (:issue:`17282`) + .. _whatsnew_0240.enhancements.extension_array_operators: ``ExtensionArray`` operator support diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 49705cb6d9ad2..7011c4448abf0 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -985,7 +985,7 @@ def _broadcast(arr_or_scalar, shape): } -def quantile(x, q, interpolation_method='fraction'): +def quantile(x, q, bounded=True, interpolation_method='fraction'): """ Compute sample quantile or quantiles of the input array. For example, q=0.5 computes the median. @@ -1002,6 +1002,9 @@ def quantile(x, q, interpolation_method='fraction'): Values from which to extract score. q : scalar or array Percentile at which to extract score. + bounded : bool, optional + Whether to use the min/max of the distribution as the lower/upper + bounds or use -inf/inf. 
interpolation_method : {'fraction', 'lower', 'higher'}, optional This optional parameter specifies the interpolation method to use, when the desired quantile lies between two data points `i` and `j`: @@ -1038,6 +1041,12 @@ def _interpolate(a, b, fraction): return a + (b - a) * fraction def _get_score(at): + if not bounded: + if at == 0: + return -np.inf + elif at == 1: + return np.inf + if len(values) == 0: return np.nan diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 031c94c06d3c8..43629b76648d4 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -238,7 +238,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, series_index, name, dtype) -def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): +def qcut(x, q, labels=None, retbins=False, precision=3, bounded=True, + duplicates='raise'): """ Quantile-based discretization function. Discretize variable into equal-sized buckets based on rank or based on sample quantiles. For example @@ -260,6 +261,9 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): is given as a scalar. precision : int, optional The precision at which to store and display the bins labels + bounded : bool, optional + Whether to use the min/max of the distribution as the lower/upper + bounds or use -inf/inf. duplicates : {default 'raise', 'drop'}, optional If bin edges are not unique, raise ValueError or drop non-uniques. @@ -301,7 +305,7 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): quantiles = np.linspace(0, 1, q + 1) else: quantiles = q - bins = algos.quantile(x, quantiles) + bins = algos.quantile(x, quantiles, bounded=bounded) fac, bins = _bins_to_cuts(x, bins, labels=labels, precision=precision, include_lowest=True, dtype=dtype, duplicates=duplicates)