Skip to content

ENH: Quantiles accepts an array #6955

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 25, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/v0.14.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,8 @@ Enhancements
- Added ``how`` option to rolling-moment functions to dictate how to handle resampling; :func:`rolling_max` defaults to max,
:func:`rolling_min` defaults to min, and all others default to mean (:issue:`6297`)
- ``CustomBusinessMonthBegin`` and ``CustomBusinessMonthEnd`` are now available (:issue:`6866`)
- :meth:`Series.quantile` and :meth:`DataFrame.quantile` now accept an array of
quantiles (:issue:`6955`)

Performance
~~~~~~~~~~~
Expand Down
38 changes: 31 additions & 7 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4145,22 +4145,41 @@ def mode(self, axis=0, numeric_only=False):
def quantile(self, q=0.5, axis=0, numeric_only=True):
"""
Return values at the given quantile over requested axis, a la
scoreatpercentile in scipy.stats
numpy.percentile.

Parameters
----------
q : quantile, default 0.5 (50% quantile)
0 <= q <= 1
q : float or array-like, default 0.5 (50% quantile)
0 <= q <= 1, the quantile(s) to compute
axis : {0, 1}
0 for row-wise, 1 for column-wise

Returns
-------
quantiles : Series
quantiles : Series or DataFrame
If ``q`` is an array, a DataFrame will be returned where the
index is ``q``, the columns are the columns of self, and the
values are the quantiles.
If ``q`` is a float, a Series will be returned where the
index is the columns of self and the values are the quantiles.

Examples
--------

>>> df = DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
columns=['a', 'b'])
>>> df.quantile(.1)
a 1.3
b 3.7
dtype: float64
>>> df.quantile([.1, .5])
a b
0.1 1.3 3.7
0.5 2.5 55.0
"""
per = q * 100
per = np.asarray(q) * 100

def f(arr):
def f(arr, per):
arr = arr.values
if arr.dtype != np.float_:
arr = arr.astype(float)
Expand All @@ -4171,7 +4190,12 @@ def f(arr):
return _quantile(arr, per)

data = self._get_numeric_data() if numeric_only else self
return data.apply(f, axis=axis)
if com.is_list_like(per):
from pandas.tools.merge import concat
return concat([data.apply(f, axis=axis, args=(x,)) for x in per],
axis=1, keys=per/100.).T
else:
return data.apply(f, axis=axis, args=(per,))

def rank(self, axis=0, numeric_only=None, method='average',
na_option='keep', ascending=True, pct=False):
Expand Down
39 changes: 32 additions & 7 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1220,26 +1220,51 @@ def round(self, decimals=0, out=None):

def quantile(self, q=0.5):
"""
Return value at the given quantile, a la scoreatpercentile in
scipy.stats
Return value at the given quantile, a la numpy.percentile.

Parameters
----------
q : quantile
0 <= q <= 1
q : float or array-like, default 0.5 (50% quantile)
0 <= q <= 1, the quantile(s) to compute

Returns
-------
quantile : float
quantile : float or Series
if ``q`` is an array, a Series will be returned where the
index is ``q`` and the values are the quantiles.

Examples
--------

>>> s = Series([1, 2, 3, 4])
>>> s.quantile(.5)
2.5
>>> s.quantile([.25, .5, .75])
0.25 1.75
0.50 2.50
0.75 3.25
dtype: float64
"""
valid_values = self.dropna().values
if len(valid_values) == 0:
return pa.NA

def multi(values, qs):
if com.is_list_like(qs):
return Series([_quantile(values, x*100)
for x in qs], index=qs)
else:
return _quantile(values, qs*100)

if com.is_datetime64_dtype(self):
values = _values_from_object(self).view('i8')
result = lib.Timestamp(_quantile(values, q * 100))
result = multi(values, q)
if com.is_list_like(q):
result = result.map(lib.Timestamp)
else:
result = lib.Timestamp(result)
else:
result = _quantile(valid_values, q * 100)
result = multi(valid_values, q)

return result

Expand Down
20 changes: 19 additions & 1 deletion pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10945,6 +10945,25 @@ def test_quantile(self):
xp = df.median()
assert_series_equal(rs, xp)

def test_quantile_multi(self):
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
columns=['a', 'b', 'c'])
result = df.quantile([.25, .5])
expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
index=[.25, .5], columns=['a', 'b', 'c'])
assert_frame_equal(result, expected)

# axis = 1
result = df.quantile([.25, .5], axis=1)
expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
index=[.25, .5], columns=[0, 1, 2])

# empty
result = DataFrame({'x': [], 'y': []}).quantile([0.1, .9], axis=0)
expected = DataFrame({'x': [np.nan, np.nan], 'y': [np.nan, np.nan]},
index=[.1, .9])
assert_frame_equal(result, expected)

def test_cumsum(self):
self.tsframe.ix[5:10, 0] = nan
self.tsframe.ix[10:15, 1] = nan
Expand Down Expand Up @@ -12728,7 +12747,6 @@ def check_query_with_unnamed_multiindex(self, parser, engine):
df = DataFrame(randn(10, 2), index=index)
ind = Series(df.index.get_level_values(0).values, index=index)

#import ipdb; ipdb.set_trace()
res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine)
exp = df[ind == 'red']
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2203,6 +2203,22 @@ def test_quantile(self):
q = tds.quantile(.25)
self.assertEqual(q, pd.to_timedelta('24:00:00'))

def test_quantile_multi(self):
    """Series.quantile given several quantiles returns a Series keyed by them."""
    from numpy import percentile

    # numeric data: each entry must agree with numpy.percentile computed
    # on the non-missing values
    qs = [.1, .9]
    observed = self.ts.quantile(qs)
    wanted = pd.Series([percentile(self.ts.valid(), pct)
                        for pct in (10, 90)],
                       index=qs)
    assert_series_equal(observed, wanted)

    # datetime data, with the quantiles passed as a tuple containing a
    # repeated value
    dts = self.ts.index.to_series()
    observed = dts.quantile((.2, .2))
    wanted = Series([Timestamp('2000-01-10 19:12:00'),
                     Timestamp('2000-01-10 19:12:00')],
                    index=[.2, .2])
    assert_series_equal(observed, wanted)

def test_describe(self):
_ = self.series.describe()
_ = self.ts.describe()
Expand Down