diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index a5001e840f471..e63728e22d23a 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -482,6 +482,8 @@ Enhancements - Added ``how`` option to rolling-moment functions to dictate how to handle resampling; :func:``rolling_max`` defaults to max, :func:``rolling_min`` defaults to min, and all others default to mean (:issue:`6297`) - ``CustomBuisnessMonthBegin`` and ``CustomBusinessMonthEnd`` are now available (:issue:`6866`) +- :meth:`Series.quantile` and :meth:`DataFrame.quantile` now accept an array of + quantiles. Performance ~~~~~~~~~~~ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fcbd0688792fb..23736dafe3556 100755 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4145,22 +4145,41 @@ def mode(self, axis=0, numeric_only=False): def quantile(self, q=0.5, axis=0, numeric_only=True): """ Return values at the given quantile over requested axis, a la - scoreatpercentile in scipy.stats + numpy.percentile. Parameters ---------- - q : quantile, default 0.5 (50% quantile) - 0 <= q <= 1 + q : float or array-like, default 0.5 (50% quantile) + 0 <= q <= 1, the quantile(s) to compute axis : {0, 1} 0 for row-wise, 1 for column-wise Returns ------- - quantiles : Series + quantiles : Series or DataFrame + If ``q`` is an array, a DataFrame will be returned where the + index is ``q``, the columns are the columns of self, and the + values are the quantiles. + If ``q`` is a float, a Series will be returned where the + index is the columns of self and the values are the quantiles. 
+ + Examples + -------- + + >>> df = DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), + ... columns=['a', 'b']) + >>> df.quantile(.1) + a 1.3 + b 3.7 + dtype: float64 + >>> df.quantile([.1, .5]) + a b + 0.1 1.3 3.7 + 0.5 2.5 55.0 """ - per = q * 100 + per = np.asarray(q) * 100 - def f(arr): + def f(arr, per): arr = arr.values if arr.dtype != np.float_: arr = arr.astype(float) @@ -4171,7 +4190,12 @@ def f(arr): return _quantile(arr, per) data = self._get_numeric_data() if numeric_only else self - return data.apply(f, axis=axis) + if com.is_list_like(per): + from pandas.tools.merge import concat + return concat([data.apply(f, axis=axis, args=(x,)) for x in per], + axis=1, keys=per/100.).T + else: + return data.apply(f, axis=axis, args=(per,)) def rank(self, axis=0, numeric_only=None, method='average', na_option='keep', ascending=True, pct=False): diff --git a/pandas/core/series.py b/pandas/core/series.py index 6172f87ead246..637b2e8bfc67d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1220,26 +1220,51 @@ def round(self, decimals=0, out=None): def quantile(self, q=0.5): """ - Return value at the given quantile, a la scoreatpercentile in - scipy.stats + Return value at the given quantile, a la numpy.percentile. Parameters ---------- - q : quantile - 0 <= q <= 1 + q : float or array-like, default 0.5 (50% quantile) + 0 <= q <= 1, the quantile(s) to compute Returns ------- - quantile : float + quantile : float or Series + if ``q`` is an array, a Series will be returned where the + index is ``q`` and the values are the quantiles.
+ + Examples + -------- + + >>> s = Series([1, 2, 3, 4]) + >>> s.quantile(.5) + 2.5 + >>> s.quantile([.25, .5, .75]) + 0.25 1.75 + 0.50 2.50 + 0.75 3.25 + dtype: float64 """ valid_values = self.dropna().values if len(valid_values) == 0: return pa.NA + + def multi(values, qs): + if com.is_list_like(qs): + return Series([_quantile(values, x*100) + for x in qs], index=qs) + else: + return _quantile(values, qs*100) + if com.is_datetime64_dtype(self): values = _values_from_object(self).view('i8') - result = lib.Timestamp(_quantile(values, q * 100)) + result = multi(values, q) + if com.is_list_like(q): + result = result.map(lib.Timestamp) + else: + result = lib.Timestamp(result) else: - result = _quantile(valid_values, q * 100) + result = multi(valid_values, q) return result diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index b9692214dcb74..3a3d5a822163f 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -10945,6 +10945,26 @@ def test_quantile(self): xp = df.median() assert_series_equal(rs, xp) + def test_quantile_multi(self): + df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], + columns=['a', 'b', 'c']) + result = df.quantile([.25, .5]) + expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]], + index=[.25, .5], columns=['a', 'b', 'c']) + assert_frame_equal(result, expected) + + # axis = 1 + result = df.quantile([.25, .5], axis=1) + expected = DataFrame([[1., 2., 3.], [1., 2., 3.]], + index=[.25, .5], columns=[0, 1, 2]) + assert_frame_equal(result, expected) + + # empty + result = DataFrame({'x': [], 'y': []}).quantile([0.1, .9], axis=0) + expected = DataFrame({'x': [np.nan, np.nan], 'y': [np.nan, np.nan]}, + index=[.1, .9]) + assert_frame_equal(result, expected) + def test_cumsum(self): self.tsframe.ix[5:10, 0] = nan self.tsframe.ix[10:15, 1] = nan @@ -12728,7 +12748,6 @@ def check_query_with_unnamed_multiindex(self, parser, engine): df = DataFrame(randn(10, 2), index=index) ind = Series(df.index.get_level_values(0).values, index=index) - #import ipdb;
ipdb.set_trace() res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine) res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine) exp = df[ind == 'red'] diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 5dd3201ee5214..839804be4437c 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -2203,6 +2203,22 @@ def test_quantile(self): q = tds.quantile(.25) self.assertEqual(q, pd.to_timedelta('24:00:00')) + def test_quantile_multi(self): + from numpy import percentile + + qs = [.1, .9] + result = self.ts.quantile(qs) + expected = pd.Series([percentile(self.ts.valid(), 10), + percentile(self.ts.valid(), 90)], + index=qs) + assert_series_equal(result, expected) + + dts = self.ts.index.to_series() + result = dts.quantile((.2, .2)) + assert_series_equal(result, Series([Timestamp('2000-01-10 19:12:00'), + Timestamp('2000-01-10 19:12:00')], + index=[.2, .2])) + def test_describe(self): _ = self.series.describe() _ = self.ts.describe()