From add1acbf592884a3f318e726d8fe21d417448b42 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Tue, 6 Oct 2015 17:49:08 +0200 Subject: [PATCH 1/5] added random_split in generic.py, for DataFrames etc. --- pandas/core/generic.py | 63 ++++++++++++++++++++++++++++++++++++ pandas/tests/test_generic.py | 6 ++++ 2 files changed, 69 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2def8180a43e4..9cf1a3cde5dcf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3,6 +3,8 @@ import operator import weakref import gc +from numbers import Real +from math import floor import numpy as np import pandas.lib as lib @@ -2035,6 +2037,67 @@ def tail(self, n=5): return self.iloc[-n:] + def random_split(self, weights=(50,50), random_state=None, axis=None): + """ + Returns a random split from an axis of this object + + Parameters + ---------- + weights : weights: list or tuple or equivalent, optional + The passed collection of weights serves as relative sizes of the splits + of the returned datasets. + Default = (50,50). + random_state : int or numpy.random.RandomState, optional + Seed for the random number generator (if int), or numpy RandomState + object. + axis : int or string, optional + Axis to sample. Accepts axis number or name. Default is stat axis + for given data type (0 for Series and DataFrames, 1 for Panels). + + Returns + ------- + Multiple objects of the same type as original object. The number of returned objects + is the same as the number of weights provided as parameter. + """ + if axis is None: + axis = self._stat_axis_number + + axis = self._get_axis_number(axis) + axis_length = self.shape[axis] + + # Process random_state argument + rs = com._random_state(random_state) + + # check weight type + if len(weights) < 2: + return self + for w in weights: + if not isinstance(w, Real) or w <=0: + raise ValueError("weights must be strictly positive real numbers") + + weights_total = reduce(lambda x,y: x+y, weights, 0) + + # get the thresholds + + thresholds = [0] + for w in weights[:-1]: + tdelta = int(floor(w*1.*axis_length/weights_total)) + threshold = thresholds[-1] + tdelta + thresholds.append(threshold) + + thresholds = thresholds + [axis_length] + + idxs = range(axis_length) + rs.shuffle(idxs) + + splits = [] + for ti in range(1, len(thresholds)): + idxst = idxs[thresholds[ti-1]:thresholds[ti]] + splits.append(self.take(idxst, axis=axis, is_copy=False)) + return tuple(splits) + + + def sample(self, n=None, frac=None, replace=False, weights=None, random_state=None, axis=None): """ Returns a random sample of items from an axis of object. diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 3a26be2ca1032..e424baa9cce8f 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -354,6 +354,12 @@ def test_head_tail(self): self._compare(o.head(-3), o.head(7)) self._compare(o.tail(-3), o.tail(7)) + def test_random_split(self): + o = self._construct(shape=10) + a, b = o.random_split((1,1)) + self.assertTrue(len(a) == 5) + self.assertTrue(len(b) == 5) + def test_sample(self): # Fixes issue: 2419 From ef350cd84ea61aaa0daf534689838c8b5971c120 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Tue, 6 Oct 2015 19:54:51 +0200 Subject: [PATCH 2/5] generalized to split(), with a randomness option, changed test --- pandas/core/generic.py | 22 +++++++++++++++------- pandas/tests/test_generic.py | 8 ++++---- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9cf1a3cde5dcf..b743fdec63642 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2036,8 +2036,7 @@ def tail(self, n=5): return self return self.iloc[-n:] - - def random_split(self, weights=(50,50), random_state=None, axis=None): + def split(self, weights=(50, 50), random=False, axis=None): """ Returns a random split from an axis of this object @@ -2047,9 +2046,11 @@ def random_split(self, weights=(50,50), random_state=None, axis=None): The passed collection of weights serves as relative sizes of the splits of the returned datasets. Default = (50,50). - random_state : int or numpy.random.RandomState, optional - Seed for the random number generator (if int), or numpy RandomState - object. + random : boolean or int or numpy.random.RandomState, optional + If False (=default value), makes consecutive splits from beginning to end. + If not False, a seed for the random number generator can be provided (if int) or + a numpy RandomState object. If True, default random behavior. + Default = False. axis : int or string, optional Axis to sample. Accepts axis number or name. Default is stat axis for given data type (0 for Series and DataFrames, 1 for Panels). @@ -2066,7 +2067,12 @@ def random_split(self, weights=(50,50), random_state=None, axis=None): axis_length = self.shape[axis] # Process random_state argument - rs = com._random_state(random_state) + + if random is not None and random is not False: + random_state = random + if random_state is True: + random_state = None + rs = com._random_state(random_state) # check weight type if len(weights) < 2: @@ -2088,8 +2094,10 @@ def random_split(self, weights=(50,50), random_state=None, axis=None): thresholds = thresholds + [axis_length] idxs = range(axis_length) - rs.shuffle(idxs) + if random is not None and random is not False: + rs.shuffle(idxs) + # TODO: maybe more efficient way exists? maybe with generators? splits = [] for ti in range(1, len(thresholds)): idxst = idxs[thresholds[ti-1]:thresholds[ti]] diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index e424baa9cce8f..d8860b51d7c6e 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -354,11 +354,11 @@ def test_head_tail(self): self._compare(o.head(-3), o.head(7)) self._compare(o.tail(-3), o.tail(7)) - def test_random_split(self): + def test_split(self): o = self._construct(shape=10) - a, b = o.random_split((1,1)) - self.assertTrue(len(a) == 5) - self.assertTrue(len(b) == 5) + a, b = o.split((1, 1), axis=0, random=True) + self.assertTrue(a.shape[0] == 5) + self.assertTrue(b.shape[0] == 5) def test_sample(self): # Fixes issue: 2419 From 8b2b06ce9e14b9feceec7e4dfdfc4418b63516c9 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Wed, 7 Oct 2015 21:09:18 +0200 Subject: [PATCH 3/5] implemented optionally random splitting as Ordered/Random Grouper + .split() on a GroupBy still need to test --- pandas/core/api.py | 2 +- pandas/core/generic.py | 50 ++++-------------------------- pandas/core/groupby.py | 59 ++++++++++++++++++++++++++++++++++++ pandas/tests/test_groupby.py | 8 +++++ 4 files changed, 74 insertions(+), 45 deletions(-) diff --git a/pandas/core/api.py b/pandas/core/api.py index e2ac57e37cba6..d735e2385ce3c 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -6,7 +6,7 @@ from pandas.core.algorithms import factorize, match, unique, value_counts from pandas.core.common import isnull, notnull from pandas.core.categorical import Categorical -from pandas.core.groupby import Grouper +from pandas.core.groupby import Grouper, RandomGrouper, OrderedGrouper from pandas.core.format import set_eng_float_format from pandas.core.index import Index, CategoricalIndex, Int64Index, Float64Index, MultiIndex diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b743fdec63642..122690698c58e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2060,50 +2060,12 @@ def split(self, weights=(50, 50), random=False, axis=None): Multiple objects of the same type as original object. The number of returned objects is the same as the number of weights provided as parameter. """ - if axis is None: - axis = self._stat_axis_number - - axis = self._get_axis_number(axis) - axis_length = self.shape[axis] - - # Process random_state argument - - if random is not None and random is not False: - random_state = random - if random_state is True: - random_state = None - rs = com._random_state(random_state) - - # check weight type - if len(weights) < 2: - return self - for w in weights: - if not isinstance(w, Real) or w <=0: - raise ValueError("weights must be strictly positive real numbers") - - weights_total = reduce(lambda x,y: x+y, weights, 0) - - # get the thresholds - - thresholds = [0] - for w in weights[:-1]: - tdelta = int(floor(w*1.*axis_length/weights_total)) - threshold = thresholds[-1] + tdelta - thresholds.append(threshold) - - thresholds = thresholds + [axis_length] - - idxs = range(axis_length) - if random is not None and random is not False: - rs.shuffle(idxs) - - # TODO: maybe more efficient way exists? maybe with generators? - splits = [] - for ti in range(1, len(thresholds)): - idxst = idxs[thresholds[ti-1]:thresholds[ti]] - splits.append(self.take(idxst, axis=axis, is_copy=False)) - return tuple(splits) - + g = pd.OrderedGrouper(weights, axis) + if random is not False and random is not None: + if random is True: + random = None + g = pd.RandomGrouper(weights, axis, random) + return self.groupby(g).split() def sample(self, n=None, frac=None, replace=False, weights=None, random_state=None, axis=None): diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index add5080a69ee4..d7fd0fbd66a77 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -4,6 +4,7 @@ import datetime import collections import warnings +from numbers import Real from pandas.compat import( zip, builtins, range, long, lzip, @@ -296,6 +297,60 @@ def groups(self): return self.grouper.groups +class OrderedGrouper(Grouper): + + def __init__(self, proportions=(1,1), axis=None): + self._proportions = proportions + self._axis = axis + self.key = None + # check weight type + if len(self._proportions) < 2: + raise ValueError("must split into more than 1 partition") + for w in self._proportions: + if not isinstance(w, Real) or w <=0: + raise ValueError("weights must be strictly positive real numbers") + + weights_total = reduce(lambda x, y: x+y, self._proportions, 0) + + # compute proportions as fractions + self._proportions = [x*1./weights_total for x in self._proportions] + super(OrderedGrouper, self).__init__() + + def _get_grouper(self, obj): + return self._go_get_grouper(obj) + + def _go_get_grouper(self, obj): + if self._axis is None: + self._axis = obj._stat_axis_number + self._axis = obj._get_axis_number(self._axis) + axis_length = obj.shape[self._axis] + + numbers = [int(round(prop*axis_length)) for prop in self._proportions] + + newcol = reduce(lambda x, y: x + y, [[x]*numbers[x] for x in range(len(numbers))]) + while len(newcol) < axis_length: + newcol.append(newcol[-1]) + + self._processidxs(newcol) + + grouping = Grouping(obj._get_axis(self._axis), grouper=Series(newcol), obj=obj, sort=True, in_axis=True) + + return None, BaseGrouper(self._axis, [grouping]), obj + + def _processidxs(self, newcol): + pass + +class RandomGrouper(OrderedGrouper): + + def __init__(self, proportions=(1,1), axis=None, random=None): + # Process random_state argument + self.rs = com._random_state(random) + super(RandomGrouper, self).__init__(proportions, axis) + + def _processidxs(self, newcol): + self.rs.shuffle(newcol) + + class GroupByPlot(PandasObject): """ Class implementing the .plot attribute for groupby objects @@ -658,6 +713,10 @@ def __iter__(self): """ return self.grouper.get_iterator(self.obj, axis=self.axis) + def split(self): + acc = [x for _, x in self] + return tuple(acc) + def apply(self, func, *args, **kwargs): """ Apply function and combine results together in an intelligent way. The diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 8eb641ce8f494..3ba76eba00a7a 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -399,7 +399,15 @@ def test_grouper_multilevel_freq(self): pd.Grouper(level=1, freq='W')]).sum() assert_frame_equal(result, expected) + def test_grouper_random(self): + print("testing random grouper") + df = DataFrame({"A": [0,1,2,3,4,5], "b": [10,11,12,13,14,15]}) + g = df.groupby(pd.RandomGrouper((1,2))) + a, b = g.split() + assert_frame_equal(df, df) + def test_grouper_creation_bug(self): + #self.test_grouper_random() # TODO remove # GH 8795 df = DataFrame({'A':[0,0,1,1,2,2], 'B':[1,2,3,4,5,6]}) From 81d8ba5691f3ebcd83702e092986297a5e23c641 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Wed, 7 Oct 2015 21:12:41 +0200 Subject: [PATCH 4/5] removed superfluous lines --- pandas/core/groupby.py | 3 --- pandas/tests/test_groupby.py | 3 --- 2 files changed, 6 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index d7fd0fbd66a77..b676959918cdd 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -317,9 +317,6 @@ def __init__(self, proportions=(1,1), axis=None): super(OrderedGrouper, self).__init__() def _get_grouper(self, obj): - return self._go_get_grouper(obj) - - def _go_get_grouper(self, obj): if self._axis is None: self._axis = obj._stat_axis_number self._axis = obj._get_axis_number(self._axis) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 3ba76eba00a7a..1b65f63f3e268 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -400,15 +400,12 @@ def test_grouper_multilevel_freq(self): assert_frame_equal(result, expected) def test_grouper_random(self): - print("testing random grouper") df = DataFrame({"A": [0,1,2,3,4,5], "b": [10,11,12,13,14,15]}) g = df.groupby(pd.RandomGrouper((1,2))) a, b = g.split() assert_frame_equal(df, df) def test_grouper_creation_bug(self): - #self.test_grouper_random() # TODO remove - # GH 8795 df = DataFrame({'A':[0,0,1,1,2,2], 'B':[1,2,3,4,5,6]}) g = df.groupby('A') From ed261eef61b40f41129b53e870fa8d1746f1fb65 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Thu, 8 Oct 2015 15:56:18 +0200 Subject: [PATCH 5/5] improved code still need to write tests --- pandas/core/api.py | 2 +- pandas/core/generic.py | 4 ++-- pandas/core/groupby.py | 29 +++++++++++++++++------------ pandas/tests/test_groupby.py | 2 +- 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/pandas/core/api.py b/pandas/core/api.py index d735e2385ce3c..e34895af9640c 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -6,7 +6,7 @@ from pandas.core.algorithms import factorize, match, unique, value_counts from pandas.core.common import isnull, notnull from pandas.core.categorical import Categorical -from pandas.core.groupby import Grouper, RandomGrouper, OrderedGrouper +from pandas.core.groupby import Grouper, RandomPartitioner, Partitioner from pandas.core.format import set_eng_float_format from pandas.core.index import Index, CategoricalIndex, Int64Index, Float64Index, MultiIndex diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 122690698c58e..246ebcb6cc953 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2060,11 +2060,11 @@ def split(self, weights=(50, 50), random=False, axis=None): Multiple objects of the same type as original object. The number of returned objects is the same as the number of weights provided as parameter. """ - g = pd.OrderedGrouper(weights, axis) + g = pd.Partitioner(weights, axis) if random is not False and random is not None: if random is True: random = None - g = pd.RandomGrouper(weights, axis, random) + g = pd.RandomPartitioner(weights, axis, random) return self.groupby(g).split() diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index b676959918cdd..d8f5b33a1ad35 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -297,7 +297,10 @@ def groups(self): return self.grouper.groups -class OrderedGrouper(Grouper): +class Partitioner(Grouper): + ''' + + ''' def __init__(self, proportions=(1,1), axis=None): self._proportions = proportions @@ -307,14 +310,13 @@ def __init__(self, proportions=(1,1), axis=None): if len(self._proportions) < 2: raise ValueError("must split into more than 1 partition") for w in self._proportions: - if not isinstance(w, Real) or w <=0: + if not (com.is_float(w) or com.is_integer(w)) or w <=0: raise ValueError("weights must be strictly positive real numbers") - weights_total = reduce(lambda x, y: x+y, self._proportions, 0) - # compute proportions as fractions - self._proportions = [x*1./weights_total for x in self._proportions] - super(OrderedGrouper, self).__init__() + self._proportions = np.asarray(self._proportions, dtype="float64") + self._proportions = self._proportions/self._proportions.sum() + super(Partitioner, self).__init__() def _get_grouper(self, obj): if self._axis is None: @@ -322,29 +324,32 @@ def _get_grouper(self, obj): self._axis = obj._get_axis_number(self._axis) axis_length = obj.shape[self._axis] - numbers = [int(round(prop*axis_length)) for prop in self._proportions] + numbers = np.rint(self._proportions * axis_length).astype("int32") newcol = reduce(lambda x, y: x + y, [[x]*numbers[x] for x in range(len(numbers))]) while len(newcol) < axis_length: newcol.append(newcol[-1]) - self._processidxs(newcol) + self._transform(newcol) grouping = Grouping(obj._get_axis(self._axis), grouper=Series(newcol), obj=obj, sort=True, in_axis=True) return None, BaseGrouper(self._axis, [grouping]), obj - def _processidxs(self, newcol): + def _transform(self, newcol): pass -class RandomGrouper(OrderedGrouper): +class RandomPartitioner(Partitioner): + ''' + TODO + ''' def __init__(self, proportions=(1,1), axis=None, random=None): # Process random_state argument self.rs = com._random_state(random) - super(RandomGrouper, self).__init__(proportions, axis) + super(RandomPartitioner, self).__init__(proportions, axis) - def _processidxs(self, newcol): + def _transform(self, newcol): self.rs.shuffle(newcol) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 1b65f63f3e268..7a66da080ddf8 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -401,7 +401,7 @@ def test_grouper_multilevel_freq(self): def test_grouper_random(self): df = DataFrame({"A": [0,1,2,3,4,5], "b": [10,11,12,13,14,15]}) - g = df.groupby(pd.RandomGrouper((1,2))) + g = df.groupby(pd.RandomPartitioner((1,2))) a, b = g.split() assert_frame_equal(df, df)