pandas-dev · hayd · May 29, 2014 · May 30, 2014 · jorisvandenbossche · May 31, 2014
diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt
@@ -50,6 +50,8 @@ Known Issues
 Enhancements
 ~~~~~~~~~~~~
 
+- Add a ``sample`` method to ``NDFrame`` for drawing a random sample, with or without replacement (:issue:`2419`)
+
 .. _whatsnew_0141.performance:
 
 Performance

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -1241,6 +1241,45 @@ def take(self, indices, axis=0, convert=True, is_copy=True):
 
         return result
 
+    def sample(self, size, replace=True):
+        """Draw a random sample from the object (cf. numpy.random.choice).
+
+        Parameters
+        ----------
+        size : int
+            Number of elements to draw.
+        replace : bool, default True
+            Whether to sample with replacement.
+
+        Returns
+        -------
+        type of caller
+
+        Notes
+        -----
+        A ValueError is raised when sampling without replacement and `size`
+        is larger than the object being sampled.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3, 4, 5])
+        >>> s.sample(3, replace=False)  # output is random
+        2    3
+        0    1
+        3    4
+        dtype: int64
+        >>> s.sample(3, replace=True)
+        1    2
+        3    4
+        1    2
+        dtype: int64
+        """
+        try:
+            from numpy.random import choice as draw
+        except ImportError:
+            from pandas.stats.misc import choice as draw
+        return self.iloc[draw(len(self), size, replace=replace)]
+
     def xs(self, key, axis=0, level=None, copy=None, drop_level=True):
         """
         Returns a cross-section (row(s) or column(s)) from the Series/DataFrame.

diff --git a/pandas/stats/misc.py b/pandas/stats/misc.py
@@ -297,3 +297,20 @@ def _bucket_labels(series, k):
         mat[v] = i
 
     return mat + 1
+
+
+def choice(arr, size, replace):
+    """Partial implementation of numpy.random.choice (new in numpy 1.7).
+
+    An int `arr` means np.arange(arr), as in numpy; `size` must be a scalar.
+    Raises ValueError if replace is False and size exceeds the population.
+    """
+    arr = np.arange(arr) if np.isscalar(arr) else arr
+    if replace:
+        # uniform floats in [0, 1) scaled to positions in [0, len(arr))
+        return arr[(np.random.sample(size) * len(arr)).astype('int64')]
+    if size > len(arr):
+        raise ValueError("Cannot take a larger sample than "
+                         "population when 'replace=False'")
+    # without replacement: the first `size` entries of a random permutation
+    return arr[np.random.permutation(len(arr))[:size]]
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -8441,6 +8441,12 @@ def test_truncate_copy(self):
         truncated.values[:] = 5.
         self.assertFalse((self.tsframe.values[5:11] == 5).any())
 
+    def test_sample(self):
+        df = DataFrame([[1, 2], [2, 3]], columns=['A', 'B'])
+        res = df.sample(5)  # replace=True by default, so 5 > len(df) is OK
+        self.assertEqual(len(res), 5)
+        self.assertTrue(res.index.isin(df.index).all())
+
     def test_xs(self):
         idx = self.frame.index[5]
         xs = self.frame.xs(idx)

diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py
@@ -547,6 +547,12 @@ def test_xs(self):
         result = self.panel.xs('D', axis=2)
         self.assertIsNotNone(result.is_copy)
 
+    def test_sample(self):
+        p = self.panel
+        res = p.sample(5)  # replace=True by default; draws with repetition
+        self.assertEqual(len(res), 5)
+        self.assertTrue(res.items.isin(p.items).all())  # sampled along items
+
     def test_getitem_fancy_labels(self):
         p = self.panel
 

diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -1608,6 +1608,12 @@ def test_mask(self):
         rs = s.where(cond, np.nan)
         assert_series_equal(rs, s.mask(~cond))
 
+    def test_sample(self):
+        s = Series([1, 2, 2, 3])
+        res = s.sample(5)  # replace=True by default, so 5 > len(s) is OK
+        self.assertEqual(len(res), 5)
+        self.assertTrue(res.index.isin(s.index).all())
+
     def test_drop(self):
 
         # unique