From 55836e6e7fa7e8bec73256377600c7922b854769 Mon Sep 17 00:00:00 2001 From: Mayank Asthana Date: Mon, 25 May 2015 02:33:23 +0530 Subject: [PATCH] Closes issue #10174. Added 'interpolation' keyword in Dataframe.quantile and Series.quantile --- doc/source/whatsnew/v0.18.0.txt | 1 + pandas/core/frame.py | 28 +++++++++-- pandas/core/series.py | 33 ++++++++++--- pandas/tests/test_frame.py | 88 +++++++++++++++++++++++++++++++++ pandas/tests/test_series.py | 41 ++++++++++++++- 5 files changed, 179 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index d78065765b90b..498cfaf838320 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -111,6 +111,7 @@ Other enhancements - ``sys.getsizeof(obj)`` returns the memory usage of a pandas object, including the values it contains (:issue:`11597`) - ``Series`` gained an ``is_unique`` attribute (:issue:`11946`) +- ``DataFrame.quantile`` and ``Series.quantile`` now accept ``interpolation`` keyword (:issue:`10174`). .. _whatsnew_0180.enhancements.rounding: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6207ac5dc5c12..29aaf8c5aa5c2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -64,6 +64,7 @@ import pandas.algos as _algos from pandas.core.config import get_option +from pandas import _np_version_under1p9 #---------------------------------------------------------------------- # Docstring templates @@ -4874,7 +4875,7 @@ def mode(self, axis=0, numeric_only=False): f = lambda s: s.mode() return data.apply(f, axis=axis) - def quantile(self, q=0.5, axis=0, numeric_only=True): + def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation='linear'): """ Return values at the given quantile over requested axis, a la numpy.percentile. @@ -4885,7 +4886,16 @@ def quantile(self, q=0.5, axis=0, numeric_only=True): 0 <= q <= 1, the quantile(s) to compute axis : {0, 1, 'index', 'columns'} (default 0) 0 or 'index' for row-wise, 1 or 'columns' for column-wise - + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + .. versionadded:: 0.18.0 + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + * linear: `i + (j - i) * fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + * lower: `i`. + * higher: `j`. + * nearest: `i` or `j` whichever is nearest. + * midpoint: (`i` + `j`) / 2. Returns ------- @@ -4920,7 +4930,12 @@ def quantile(self, q=0.5, axis=0, numeric_only=True): else: squeeze = False - def f(arr, per): + if _np_version_under1p9: + if interpolation != 'linear': + raise ValueError("Interpolation methods" + " other than linear not supported in numpy < 1.9") + + def f(arr, per,interpolation): if arr._is_datelike_mixed_type: values = _values_from_object(arr).view('i8') else: @@ -4929,7 +4944,10 @@ def f(arr, per): if len(values) == 0: return NA else: - return _quantile(values, per) + if _np_version_under1p9: + return _quantile(values, per) + else: + return _quantile(values, per, interpolation=interpolation) data = self._get_numeric_data() if numeric_only else self @@ -4943,7 +4961,7 @@ def f(arr, per): is_dt_col = data.dtypes.map(com.is_datetime64_dtype) is_dt_col = is_dt_col[is_dt_col].index - quantiles = [[f(vals, x) for x in per] + quantiles = [[f(vals, x, interpolation) for x in per] for (_, vals) in data.iteritems()] result = self._constructor(quantiles, index=data._info_axis, diff --git a/pandas/core/series.py b/pandas/core/series.py index 9910966bd4d2c..4cf16804408d3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -58,6 +58,8 @@ from numpy import percentile as _quantile from pandas.core.config import get_option +from pandas import _np_version_under1p9 + __all__ = ['Series'] @@ -1261,7 +1263,7 @@ def round(self, decimals=0): return result - def quantile(self, q=0.5): + def quantile(self, q=0.5, interpolation='linear'): """ Return value at the given quantile, a la numpy.percentile. @@ -1269,6 +1271,16 @@ def quantile(self, q=0.5): ---------- q : float or array-like, default 0.5 (50% quantile) 0 <= q <= 1, the quantile(s) to compute + interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + .. versionadded:: 0.18.0 + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + * linear: `i + (j - i) * fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + * lower: `i`. + * higher: `j`. + * nearest: `i` or `j` whichever is nearest. + * midpoint: (`i` + `j`) / 2. Returns ------- @@ -1291,17 +1303,26 @@ def quantile(self, q=0.5): valid = self.dropna() self._check_percentile(q) - def multi(values, qs): + if _np_version_under1p9: + if interpolation != 'linear': + raise ValueError("Interpolation methods" + " other than linear not supported in numpy < 1.9.") + + def multi(values,qs,**kwargs): if com.is_list_like(qs): - values = [_quantile(values, x*100) for x in qs] + values = [_quantile(values, x*100, **kwargs) for x in qs] # let empty result to be Float64Index qs = Float64Index(qs) return self._constructor(values, index=qs, name=self.name) else: - return _quantile(values, qs*100) - - return self._maybe_box(lambda values: multi(values, q), dropna=True) + return _quantile(values, qs*100, **kwargs) + + kwargs = dict() + if not _np_version_under1p9: + kwargs.update({'interpolation':interpolation}) + return self._maybe_box(lambda values: multi(values,q,**kwargs), dropna=True) + def corr(self, other, method='pearson', min_periods=None): """ diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index b6b81caccf9d5..c20c704b957a7 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -55,6 +55,7 @@ import pandas.lib as lib from numpy.testing.decorators import slow +from pandas import _np_version_under1p9 #--------------------------------------------------------------------- # DataFrame test cases @@ -13642,6 +13643,93 @@ def test_quantile_axis_parameter(self): self.assertRaises(ValueError, df.quantile, 0.1, axis=-1) self.assertRaises(ValueError, df.quantile, 0.1, axis="column") + def test_quantile_interpolation(self): + # GH #10174 + if _np_version_under1p9: + raise nose.SkipTest("Numpy version under 1.9") + + from numpy import percentile + + #interpolation = linear (default case) + q = self.tsframe.quantile(0.1, axis=0,interpolation='linear') + self.assertEqual(q['A'], percentile(self.tsframe['A'], 10)) + q = self.intframe.quantile(0.1) + self.assertEqual(q['A'], percentile(self.intframe['A'], 10)) + + q1 = self.intframe.quantile(0.1) + self.assertEqual(q1['A'], np.percentile(self.intframe['A'], 10)) + #test with and without interpolation keyword + assert_series_equal(q,q1) + + #interpolation method other than default linear + + df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) + result = df.quantile(.5, axis=1,interpolation='nearest') + expected = Series([1., 2., 3.], index=[1, 2, 3]) + assert_series_equal(result, expected) + + #axis + result = df.quantile([.5, .75], axis=1,interpolation='lower') + expected = DataFrame({1: [1., 1.], 2: [2., 2.], + 3: [3., 3.]}, index=[0.5, 0.75]) + assert_frame_equal(result, expected) + + #test degenerate case + df = DataFrame({'x': [], 'y': []}) + q = df.quantile(0.1, axis=0,interpolation='higher') + assert(np.isnan(q['x']) and np.isnan(q['y'])) + + #multi + df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], + columns=['a', 'b', 'c']) + result = df.quantile([.25, .5],interpolation='midpoint') + expected = DataFrame([[1.5, 1.5, 1.5], [2.5, 2.5, 2.5]], + index=[.25, .5], columns=['a', 'b', 'c']) + assert_frame_equal(result, expected) + + + def test_quantile_interpolation_np_lt_1p9(self): + # GH #10174 + if not _np_version_under1p9: + raise nose.SkipTest("Numpy version is greater than 1.9") + + from numpy import percentile + + #interpolation = linear (default case) + q = self.tsframe.quantile(0.1, axis=0,interpolation='linear') + self.assertEqual(q['A'], percentile(self.tsframe['A'], 10)) + q = self.intframe.quantile(0.1) + self.assertEqual(q['A'], percentile(self.intframe['A'], 10)) + + q1 = self.intframe.quantile(0.1) + self.assertEqual(q1['A'], np.percentile(self.intframe['A'], 10)) + #test with and without interpolation keyword + assert_series_equal(q,q1) + + #interpolation method other than default linear + + expErrMsg = ("Interpolation methods other than linear" + " not supported in numpy < 1.9") + df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) + with assertRaisesRegexp(ValueError,expErrMsg): + df.quantile(.5, axis=1,interpolation='nearest') + + with assertRaisesRegexp(ValueError,expErrMsg): + df.quantile([.5, .75], axis=1,interpolation='lower') + + # test degenerate case + df = DataFrame({'x': [], 'y': []}) + with assertRaisesRegexp(ValueError,expErrMsg): + q = df.quantile(0.1, axis=0,interpolation='higher') + + #multi + df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], + columns=['a', 'b', 'c']) + with assertRaisesRegexp(ValueError,expErrMsg): + result = df.quantile([.25, .5],interpolation='midpoint') + + + def test_quantile_multi(self): df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=['a', 'b', 'c']) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 4a350e7e38091..a4e4e0632a171 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -20,7 +20,8 @@ import pandas as pd from pandas import (Index, Series, DataFrame, isnull, notnull, bdate_range, NaT, - date_range, period_range, timedelta_range, _np_version_under1p8) + date_range, period_range, timedelta_range, _np_version_under1p8, + _np_version_under1p9) from pandas.core.index import MultiIndex from pandas.core.indexing import IndexingError from pandas.tseries.period import PeriodIndex @@ -3083,6 +3084,44 @@ def test_quantile_multi(self): expected = pd.Series([], name=self.ts.name, index=Index([], dtype=float)) assert_series_equal(result, expected) + def test_quantile_interpolation(self): + # GH #10174 + if _np_version_under1p9: + raise nose.SkipTest("Numpy version is under 1.9") + + from numpy import percentile + + #interpolation = linear (default case) + q = self.ts.quantile(0.1,interpolation='linear') + self.assertEqual(q, percentile(self.ts.valid(), 10)) + q1 = self.ts.quantile(0.1) + self.assertEqual(q1, percentile(self.ts.valid(), 10)) + + #test with and without interpolation keyword + self.assertEqual(q,q1) + + def test_quantile_interpolation_np_lt_1p9(self): + # GH #10174 + if not _np_version_under1p9: + raise nose.SkipTest("Numpy version is greater than 1.9") + + from numpy import percentile + + #interpolation = linear (default case) + q = self.ts.quantile(0.1,interpolation='linear') + self.assertEqual(q, percentile(self.ts.valid(), 10)) + q1 = self.ts.quantile(0.1) + self.assertEqual(q1, percentile(self.ts.valid(), 10)) + + #interpolation other than linear + expErrMsg = "Interpolation methods other than linear not supported in numpy < 1.9" + with tm.assertRaisesRegexp(ValueError,expErrMsg): + self.ts.quantile(0.9,interpolation='nearest') + + # object dtype + with tm.assertRaisesRegexp(ValueError,expErrMsg): + q = Series(self.ts,dtype=object).quantile(0.7,interpolation='higher') + def test_append(self): appendedSeries = self.series.append(self.objSeries) for idx, value in compat.iteritems(appendedSeries):