From add1acbf592884a3f318e726d8fe21d417448b42 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@denis>
Date: Tue, 6 Oct 2015 17:49:08 +0200
Subject: [PATCH 1/5] added random_split in generic.py, for DataFrames etc.

---
 pandas/core/generic.py       | 63 ++++++++++++++++++++++++++++++++++++
 pandas/tests/test_generic.py |  6 ++++
 2 files changed, 69 insertions(+)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 2def8180a43e4..9cf1a3cde5dcf 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3,6 +3,8 @@
 import operator
 import weakref
 import gc
+from numbers import Real
+from math import floor
 
 import numpy as np
 import pandas.lib as lib
@@ -2035,6 +2037,67 @@ def tail(self, n=5):
         return self.iloc[-n:]
 
 
+    def random_split(self, weights=(50,50), random_state=None, axis=None):
+        """
+        Returns a random split from an axis of this object
+
+        Parameters
+        ----------
+        weights : list or tuple or equivalent, optional
+            The passed collection of weights serves as relative sizes of the splits
+            of the returned datasets.
+            Default = (50,50).
+        random_state : int or numpy.random.RandomState, optional
+            Seed for the random number generator (if int), or numpy RandomState
+            object.
+        axis : int or string, optional
+            Axis to sample. Accepts axis number or name. Default is stat axis
+            for given data type (0 for Series and DataFrames, 1 for Panels).
+
+        Returns
+        -------
+        Multiple objects of the same type as original object. The number of returned objects
+        is the same as the number of weights provided as parameter.
+        """
+        if axis is None:
+            axis = self._stat_axis_number
+
+        axis = self._get_axis_number(axis)
+        axis_length = self.shape[axis]
+
+        # Process random_state argument
+        rs = com._random_state(random_state)
+
+        # check weight type
+        if len(weights) < 2:
+            return self
+        for w in weights:
+            if not isinstance(w, Real) or w <=0:
+                raise ValueError("weights must be strictly positive real numbers")
+
+        weights_total = reduce(lambda x,y: x+y, weights, 0)
+
+        # get the thresholds
+
+        thresholds = [0]
+        for w in weights[:-1]:
+            tdelta = int(floor(w*1.*axis_length/weights_total))
+            threshold = thresholds[-1] + tdelta
+            thresholds.append(threshold)
+
+        thresholds = thresholds + [axis_length]
+
+        idxs = range(axis_length)
+        rs.shuffle(idxs)
+
+        splits = []
+        for ti in range(1, len(thresholds)):
+            idxst = idxs[thresholds[ti-1]:thresholds[ti]]
+            splits.append(self.take(idxst, axis=axis, is_copy=False))
+        return tuple(splits)
+
+
+
     def sample(self, n=None, frac=None, replace=False, weights=None, random_state=None, axis=None):
         """
         Returns a random sample of items from an axis of object.
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
index 3a26be2ca1032..e424baa9cce8f 100644
--- a/pandas/tests/test_generic.py
+++ b/pandas/tests/test_generic.py
@@ -354,6 +354,12 @@ def test_head_tail(self):
             self._compare(o.head(-3), o.head(7))
             self._compare(o.tail(-3), o.tail(7))
 
+    def test_random_split(self):
+        o = self._construct(shape=10)
+        a, b = o.random_split((1,1))
+        self.assertTrue(len(a) == 5)
+        self.assertTrue(len(b) == 5)
+
     def test_sample(self):
         # Fixes issue: 2419
 

From ef350cd84ea61aaa0daf534689838c8b5971c120 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@denis>
Date: Tue, 6 Oct 2015 19:54:51 +0200
Subject: [PATCH 2/5] generalized to split(), with a randomness option, changed
 test

---
 pandas/core/generic.py       | 22 +++++++++++++++-------
 pandas/tests/test_generic.py |  8 ++++----
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 9cf1a3cde5dcf..b743fdec63642 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2036,8 +2036,7 @@ def tail(self, n=5):
             return self
         return self.iloc[-n:]
 
-
-    def random_split(self, weights=(50,50), random_state=None, axis=None):
+    def split(self, weights=(50, 50), random=False, axis=None):
         """
         Returns a random split from an axis of this object
 
@@ -2047,9 +2046,11 @@ def random_split(self, weights=(50,50), random_state=None, axis=None):
             The passed collection of weights serves as relative sizes of the splits
             of the returned datasets.
             Default = (50,50).
-        random_state : int or numpy.random.RandomState, optional
-            Seed for the random number generator (if int), or numpy RandomState
-            object.
+        random : boolean or int or numpy.random.RandomState, optional
+            If False (the default), the object is split into consecutive pieces
+            from beginning to end. If True, the split is random with default random
+            behavior. An int is used as seed for the random number generator; a numpy
+            RandomState object can be passed instead of a seed.
         axis : int or string, optional
             Axis to sample. Accepts axis number or name. Default is stat axis
             for given data type (0 for Series and DataFrames, 1 for Panels).
@@ -2066,7 +2067,12 @@ def random_split(self, weights=(50,50), random_state=None, axis=None):
         axis_length = self.shape[axis]
 
         # Process random_state argument
-        rs = com._random_state(random_state)
+
+        if random is not None and random is not False:
+            random_state = random
+            if random_state is True:
+                random_state = None
+            rs = com._random_state(random_state)
 
         # check weight type
         if len(weights) < 2:
@@ -2088,8 +2094,10 @@ def random_split(self, weights=(50,50), random_state=None, axis=None):
         thresholds = thresholds + [axis_length]
 
         idxs = range(axis_length)
-        rs.shuffle(idxs)
+        if random is not None and random is not False:
+            rs.shuffle(idxs)
 
+        # TODO: maybe more efficient way exists? maybe with generators?
         splits = []
         for ti in range(1, len(thresholds)):
             idxst = idxs[thresholds[ti-1]:thresholds[ti]]
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
index e424baa9cce8f..d8860b51d7c6e 100644
--- a/pandas/tests/test_generic.py
+++ b/pandas/tests/test_generic.py
@@ -354,11 +354,11 @@ def test_head_tail(self):
             self._compare(o.head(-3), o.head(7))
             self._compare(o.tail(-3), o.tail(7))
 
-    def test_random_split(self):
+    def test_split(self):
         o = self._construct(shape=10)
-        a, b = o.random_split((1,1))
-        self.assertTrue(len(a) == 5)
-        self.assertTrue(len(b) == 5)
+        a, b = o.split((1, 1), axis=0, random=True)
+        self.assertTrue(a.shape[0] == 5)
+        self.assertTrue(b.shape[0] == 5)
 
     def test_sample(self):
         # Fixes issue: 2419

From 8b2b06ce9e14b9feceec7e4dfdfc4418b63516c9 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@denis>
Date: Wed, 7 Oct 2015 21:09:18 +0200
Subject: [PATCH 3/5] implemented optionally random splitting as Ordered/Random
 Grouper + .split() on a GroupBy

still need to test
---
 pandas/core/api.py           |  2 +-
 pandas/core/generic.py       | 50 ++++--------------------------
 pandas/core/groupby.py       | 59 ++++++++++++++++++++++++++++++++++++
 pandas/tests/test_groupby.py |  8 +++++
 4 files changed, 74 insertions(+), 45 deletions(-)

diff --git a/pandas/core/api.py b/pandas/core/api.py
index e2ac57e37cba6..d735e2385ce3c 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -6,7 +6,7 @@
 from pandas.core.algorithms import factorize, match, unique, value_counts
 from pandas.core.common import isnull, notnull
 from pandas.core.categorical import Categorical
-from pandas.core.groupby import Grouper
+from pandas.core.groupby import Grouper, RandomGrouper, OrderedGrouper
 from pandas.core.format import set_eng_float_format
 from pandas.core.index import Index, CategoricalIndex, Int64Index, Float64Index, MultiIndex
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index b743fdec63642..122690698c58e 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2060,50 +2060,12 @@ def split(self, weights=(50, 50), random=False, axis=None):
         Multiple objects of the same type as original object. The number of returned objects
         is the same as the number of weights provided as parameter.
         """
-        if axis is None:
-            axis = self._stat_axis_number
-
-        axis = self._get_axis_number(axis)
-        axis_length = self.shape[axis]
-
-        # Process random_state argument
-
-        if random is not None and random is not False:
-            random_state = random
-            if random_state is True:
-                random_state = None
-            rs = com._random_state(random_state)
-
-        # check weight type
-        if len(weights) < 2:
-            return self
-        for w in weights:
-            if not isinstance(w, Real) or w <=0:
-                raise ValueError("weights must be strictly positive real numbers")
-
-        weights_total = reduce(lambda x,y: x+y, weights, 0)
-
-        # get the thresholds
-
-        thresholds = [0]
-        for w in weights[:-1]:
-            tdelta = int(floor(w*1.*axis_length/weights_total))
-            threshold = thresholds[-1] + tdelta
-            thresholds.append(threshold)
-
-        thresholds = thresholds + [axis_length]
-
-        idxs = range(axis_length)
-        if random is not None and random is not False:
-            rs.shuffle(idxs)
-
-        # TODO: maybe more efficient way exists? maybe with generators?
-        splits = []
-        for ti in range(1, len(thresholds)):
-            idxst = idxs[thresholds[ti-1]:thresholds[ti]]
-            splits.append(self.take(idxst, axis=axis, is_copy=False))
-        return tuple(splits)
-
+        g = pd.OrderedGrouper(weights, axis)
+        if random is not False and random is not None:
+            if random is True:
+                random = None
+            g = pd.RandomGrouper(weights, axis, random)
+        return self.groupby(g).split()
 
 
     def sample(self, n=None, frac=None, replace=False, weights=None, random_state=None, axis=None):
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index add5080a69ee4..d7fd0fbd66a77 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -4,6 +4,7 @@
 import datetime
 import collections
 import warnings
+from numbers import Real
 
 from pandas.compat import(
     zip, builtins, range, long, lzip,
@@ -296,6 +297,60 @@ def groups(self):
         return self.grouper.groups
 
 
+class OrderedGrouper(Grouper):
+
+    def __init__(self, proportions=(1,1), axis=None):
+        self._proportions = proportions
+        self._axis = axis
+        self.key = None
+        # check weight type
+        if len(self._proportions) < 2:
+            raise ValueError("must split into more than 1 partition")
+        for w in self._proportions:
+            if not isinstance(w, Real) or w <=0:
+                raise ValueError("weights must be strictly positive real numbers")
+
+        weights_total = reduce(lambda x, y: x+y, self._proportions, 0)
+
+        # compute proportions as fractions
+        self._proportions = [x*1./weights_total for x in self._proportions]
+        super(OrderedGrouper, self).__init__()
+
+    def _get_grouper(self, obj):
+        return self._go_get_grouper(obj)
+
+    def _go_get_grouper(self, obj):
+        if self._axis is None:
+            self._axis = obj._stat_axis_number
+        self._axis = obj._get_axis_number(self._axis)
+        axis_length = obj.shape[self._axis]
+
+        numbers = [int(round(prop*axis_length)) for prop in self._proportions]
+
+        newcol = reduce(lambda x, y: x + y, [[x]*numbers[x] for x in range(len(numbers))])
+        while len(newcol) < axis_length:
+            newcol.append(newcol[-1])
+
+        self._processidxs(newcol)
+
+        grouping = Grouping(obj._get_axis(self._axis), grouper=Series(newcol), obj=obj, sort=True, in_axis=True)
+
+        return None, BaseGrouper(self._axis, [grouping]), obj
+
+    def _processidxs(self, newcol):
+        pass
+
+class RandomGrouper(OrderedGrouper):
+
+    def __init__(self, proportions=(1,1), axis=None, random=None):
+        # Process random_state argument
+        self.rs = com._random_state(random)
+        super(RandomGrouper, self).__init__(proportions, axis)
+
+    def _processidxs(self, newcol):
+        self.rs.shuffle(newcol)
+
+
 class GroupByPlot(PandasObject):
     """
     Class implementing the .plot attribute for groupby objects
@@ -658,6 +713,10 @@ def __iter__(self):
         """
         return self.grouper.get_iterator(self.obj, axis=self.axis)
 
+    def split(self):
+        acc = [x for _, x in self]
+        return tuple(acc)
+
     def apply(self, func, *args, **kwargs):
         """
         Apply function and combine results together in an intelligent way. The
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 8eb641ce8f494..3ba76eba00a7a 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -399,7 +399,15 @@ def test_grouper_multilevel_freq(self):
                              pd.Grouper(level=1, freq='W')]).sum()
         assert_frame_equal(result, expected)
 
+    def test_grouper_random(self):
+        print("testing random grouper")
+        df = DataFrame({"A": [0,1,2,3,4,5], "b": [10,11,12,13,14,15]})
+        g = df.groupby(pd.RandomGrouper((1,2)))
+        a, b = g.split()
+        assert_frame_equal(df, df)
+
     def test_grouper_creation_bug(self):
+        #self.test_grouper_random() # TODO remove
 
         # GH 8795
         df = DataFrame({'A':[0,0,1,1,2,2], 'B':[1,2,3,4,5,6]})

From 81d8ba5691f3ebcd83702e092986297a5e23c641 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@denis>
Date: Wed, 7 Oct 2015 21:12:41 +0200
Subject: [PATCH 4/5] removed superfluous lines

---
 pandas/core/groupby.py       | 3 ---
 pandas/tests/test_groupby.py | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index d7fd0fbd66a77..b676959918cdd 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -317,9 +317,6 @@ def __init__(self, proportions=(1,1), axis=None):
         super(OrderedGrouper, self).__init__()
 
     def _get_grouper(self, obj):
-        return self._go_get_grouper(obj)
-
-    def _go_get_grouper(self, obj):
         if self._axis is None:
             self._axis = obj._stat_axis_number
         self._axis = obj._get_axis_number(self._axis)
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 3ba76eba00a7a..1b65f63f3e268 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -400,15 +400,12 @@ def test_grouper_multilevel_freq(self):
         assert_frame_equal(result, expected)
 
     def test_grouper_random(self):
-        print("testing random grouper")
         df = DataFrame({"A": [0,1,2,3,4,5], "b": [10,11,12,13,14,15]})
         g = df.groupby(pd.RandomGrouper((1,2)))
         a, b = g.split()
         assert_frame_equal(df, df)
 
     def test_grouper_creation_bug(self):
-        #self.test_grouper_random() # TODO remove
-
         # GH 8795
         df = DataFrame({'A':[0,0,1,1,2,2], 'B':[1,2,3,4,5,6]})
         g = df.groupby('A')

From ed261eef61b40f41129b53e870fa8d1746f1fb65 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@denis>
Date: Thu, 8 Oct 2015 15:56:18 +0200
Subject: [PATCH 5/5] improved code; still need to write tests

---
 pandas/core/api.py           |  2 +-
 pandas/core/generic.py       |  4 ++--
 pandas/core/groupby.py       | 29 +++++++++++++++++------------
 pandas/tests/test_groupby.py |  2 +-
 4 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/pandas/core/api.py b/pandas/core/api.py
index d735e2385ce3c..e34895af9640c 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -6,7 +6,7 @@
 from pandas.core.algorithms import factorize, match, unique, value_counts
 from pandas.core.common import isnull, notnull
 from pandas.core.categorical import Categorical
-from pandas.core.groupby import Grouper, RandomGrouper, OrderedGrouper
+from pandas.core.groupby import Grouper, RandomPartitioner, Partitioner
 from pandas.core.format import set_eng_float_format
 from pandas.core.index import Index, CategoricalIndex, Int64Index, Float64Index, MultiIndex
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 122690698c58e..246ebcb6cc953 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2060,11 +2060,11 @@ def split(self, weights=(50, 50), random=False, axis=None):
         Multiple objects of the same type as original object. The number of returned objects
         is the same as the number of weights provided as parameter.
         """
-        g = pd.OrderedGrouper(weights, axis)
+        g = pd.Partitioner(weights, axis)
         if random is not False and random is not None:
             if random is True:
                 random = None
-            g = pd.RandomGrouper(weights, axis, random)
+            g = pd.RandomPartitioner(weights, axis, random)
         return self.groupby(g).split()
 
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index b676959918cdd..d8f5b33a1ad35 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -297,7 +297,10 @@ def groups(self):
         return self.grouper.groups
 
 
-class OrderedGrouper(Grouper):
+class Partitioner(Grouper):
+    '''
+    Grouper that partitions an axis into groups sized by relative proportions.
+    '''
 
     def __init__(self, proportions=(1,1), axis=None):
         self._proportions = proportions
@@ -307,14 +310,13 @@ def __init__(self, proportions=(1,1), axis=None):
         if len(self._proportions) < 2:
             raise ValueError("must split into more than 1 partition")
         for w in self._proportions:
-            if not isinstance(w, Real) or w <=0:
+            if not (com.is_float(w) or com.is_integer(w)) or w <=0:
                 raise ValueError("weights must be strictly positive real numbers")
 
-        weights_total = reduce(lambda x, y: x+y, self._proportions, 0)
-
         # compute proportions as fractions
-        self._proportions = [x*1./weights_total for x in self._proportions]
-        super(OrderedGrouper, self).__init__()
+        self._proportions = np.asarray(self._proportions, dtype="float64")
+        self._proportions = self._proportions/self._proportions.sum()
+        super(Partitioner, self).__init__()
 
     def _get_grouper(self, obj):
         if self._axis is None:
@@ -322,29 +324,32 @@ def _get_grouper(self, obj):
         self._axis = obj._get_axis_number(self._axis)
         axis_length = obj.shape[self._axis]
 
-        numbers = [int(round(prop*axis_length)) for prop in self._proportions]
+        numbers = np.rint(self._proportions * axis_length).astype("int32")
 
         newcol = reduce(lambda x, y: x + y, [[x]*numbers[x] for x in range(len(numbers))])
         while len(newcol) < axis_length:
             newcol.append(newcol[-1])
 
-        self._processidxs(newcol)
+        self._transform(newcol)
 
         grouping = Grouping(obj._get_axis(self._axis), grouper=Series(newcol), obj=obj, sort=True, in_axis=True)
 
         return None, BaseGrouper(self._axis, [grouping]), obj
 
-    def _processidxs(self, newcol):
+    def _transform(self, newcol):
         pass
 
-class RandomGrouper(OrderedGrouper):
+class RandomPartitioner(Partitioner):
+    '''
+    Partitioner whose group labels are shuffled using a random state.
+    '''
 
     def __init__(self, proportions=(1,1), axis=None, random=None):
         # Process random_state argument
         self.rs = com._random_state(random)
-        super(RandomGrouper, self).__init__(proportions, axis)
+        super(RandomPartitioner, self).__init__(proportions, axis)
 
-    def _processidxs(self, newcol):
+    def _transform(self, newcol):
         self.rs.shuffle(newcol)
 
 
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 1b65f63f3e268..7a66da080ddf8 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -401,7 +401,7 @@ def test_grouper_multilevel_freq(self):
 
     def test_grouper_random(self):
         df = DataFrame({"A": [0,1,2,3,4,5], "b": [10,11,12,13,14,15]})
-        g = df.groupby(pd.RandomGrouper((1,2)))
+        g = df.groupby(pd.RandomPartitioner((1,2)))
         a, b = g.split()
         assert_frame_equal(df, df)