diff --git a/numpy/random/generator.pyx b/numpy/random/generator.pyx
index 29a36787ebbc..ffbfc972e248 100644
--- a/numpy/random/generator.pyx
+++ b/numpy/random/generator.pyx
@@ -368,26 +368,11 @@ cdef class RandomGenerator:
                 [ True,  True]]])
 
         """
-        cdef np.npy_intp n
-        cdef np.ndarray randoms
-        cdef int64_t *randoms_data
-
-        if size is None:
-            with self.lock:
-                return random_positive_int(self._brng)
-
-        randoms = np.empty(size, dtype=np.int64)
-        randoms_data = <int64_t*>np.PyArray_DATA(randoms)
-        n = np.PyArray_SIZE(randoms)
-
-        for i in range(n):
-            with self.lock, nogil:
-                randoms_data[i] = random_positive_int(self._brng)
-        return randoms
+        return self.randint(0, np.iinfo(np.int).max + 1, dtype=np.int, size=size)
 
-    def randint(self, low, high=None, size=None, dtype=int, use_masked=True):
+    def randint(self, low, high=None, size=None, dtype=np.int64, use_masked=True):
         """
-        randint(low, high=None, size=None, dtype='l', use_masked=True)
+        randint(low, high=None, size=None, dtype='int64', use_masked=True)
 
         Return random integers from `low` (inclusive) to `high` (exclusive).
 
@@ -530,9 +515,9 @@ cdef class RandomGenerator:
         return self.randint(0, 4294967296, size=n_uint32, dtype=np.uint32).tobytes()[:length]
 
     @cython.wraparound(True)
-    def choice(self, a, size=None, replace=True, p=None):
+    def choice(self, a, size=None, replace=True, p=None, axis=0):
         """
-        choice(a, size=None, replace=True, p=None)
+        choice(a, size=None, replace=True, p=None, axis=0)
 
         Generates a random sample from a given 1-D array
 
@@ -553,6 +538,9 @@ cdef class RandomGenerator:
             The probabilities associated with each entry in a.
            If not given the sample assumes a uniform distribution over all
            entries in a.
+        axis : int, optional
+            The axis along which the selection is performed. The default, 0,
+            selects by row.
 
         Returns
         -------
@@ -562,11 +550,11 @@ cdef class RandomGenerator:
         Raises
         ------
         ValueError
-            If a is an int and less than zero, if a or p are not 1-dimensional,
-            if a is an array-like of size 0, if p is not a vector of
+            If a is an int and less than zero, if p is not 1-dimensional, if
+            a is array-like with a size 0, if p is not a vector of
             probabilities, if a and p have different lengths, or if
             replace=False and the sample size is greater than the population
-            size
+            size.
 
         See Also
         --------
@@ -607,7 +595,14 @@ cdef class RandomGenerator:
               dtype='<U11')
             if size > pop_size:
                 raise ValueError("Cannot take a larger sample than "
@@ -692,7 +685,39 @@ cdef class RandomGenerator:
                     n_uniq += new.size
                 idx = found
             else:
-                idx = self.permutation(pop_size)[:size]
+                size_i = size
+                pop_size_i = pop_size
+                # This is a heuristic tuning; it should be improvable
+                if pop_size_i > 200 and (size > 200 or size > (10 * pop_size // size)):
+                    # Tail shuffle size elements
+                    idx = np.arange(pop_size, dtype=np.int64)
+                    idx_ptr = np.PyArray_BYTES(idx)
+                    buf_ptr = <char*>&buf
+                    self._shuffle_raw(pop_size_i, max(pop_size_i - size_i, 1),
+                                      8, 8, idx_ptr, buf_ptr)
+                    # Copy to allow potentially large array backing idx to be gc'd
+                    idx = idx[(pop_size - size):].copy()
+                else:
+                    # Floyd's algorithm with precomputed indices
+                    # Worst case, O(n**2) when size is close to pop_size
+                    idx = np.empty(size, dtype=np.int64)
+                    idx_data = <int64_t*>np.PyArray_DATA(idx)
+                    idx_set = set()
+                    loc = 0
+                    # Sample indices with one pass to avoid reacquiring the lock
+                    with self.lock:
+                        for j in range(pop_size_i - size_i, pop_size_i):
+                            idx_data[loc] = random_interval(self._brng, j)
+                            loc += 1
+                    loc = 0
+                    while len(idx_set) < size_i:
+                        for j in range(pop_size_i - size_i, pop_size_i):
+                            if idx_data[loc] not in idx_set:
+                                val = idx_data[loc]
+                            else:
+                                idx_data[loc] = val = j
+                            idx_set.add(val)
+                            loc += 1
 
         if shape is not None:
             idx.shape = shape
@@ -714,7 +739,9 @@ cdef class RandomGenerator:
             res[()] = a[idx]
             return res
 
-        return a[idx]
+        # asarray downcasts on 32-bit platforms, always safe
+        # no-op on 64-bit platforms
+        return a.take(np.asarray(idx, dtype=np.intp), axis=axis)
 
     def uniform(self, low=0.0, high=1.0, size=None):
         """
@@ -3986,9 +4013,9 @@ cdef class RandomGenerator:
                 # the most common case, yielding a ~33% performance improvement.
                 # Note that apparently, only one branch can ever be specialized.
                 if itemsize == sizeof(np.npy_intp):
-                    self._shuffle_raw(n, sizeof(np.npy_intp), stride, x_ptr, buf_ptr)
+                    self._shuffle_raw(n, 1, sizeof(np.npy_intp), stride, x_ptr, buf_ptr)
                 else:
-                    self._shuffle_raw(n, itemsize, stride, x_ptr, buf_ptr)
+                    self._shuffle_raw(n, 1, itemsize, stride, x_ptr, buf_ptr)
         elif isinstance(x, np.ndarray) and x.ndim and x.size:
             buf = np.empty_like(x[0, ...])
             with self.lock:
@@ -4007,10 +4034,29 @@ cdef class RandomGenerator:
                     j = random_interval(self._brng, i)
                     x[i], x[j] = x[j], x[i]
 
-    cdef inline _shuffle_raw(self, np.npy_intp n, np.npy_intp itemsize,
-                             np.npy_intp stride, char* data, char* buf):
+    cdef inline _shuffle_raw(self, np.npy_intp n, np.npy_intp first,
+                             np.npy_intp itemsize, np.npy_intp stride,
+                             char* data, char* buf):
+        """
+        Parameters
+        ----------
+        n
+            Number of elements in data
+        first
+            First observation to shuffle.  Shuffles n-1,
+            n-2, ..., first, so that when first=1 the entire
+            array is shuffled
+        itemsize
+            Size in bytes of item
+        stride
+            Array stride
+        data
+            Location of data
+        buf
+            Location of buffer (itemsize)
+        """
         cdef np.npy_intp i, j
-        for i in reversed(range(1, n)):
+        for i in reversed(range(first, n)):
             j = random_interval(self._brng, i)
             string.memcpy(buf, data + j * stride, itemsize)
             string.memcpy(data + j * stride, data + i * stride, itemsize)
diff --git a/numpy/random/src/distributions/distributions.c b/numpy/random/src/distributions/distributions.c
index 5f49b68be4c7..73396a50b317 100644
--- a/numpy/random/src/distributions/distributions.c
+++ b/numpy/random/src/distributions/distributions.c
@@ -1070,7 +1070,7 @@ int64_t random_zipf(brng_t *brng_state, double a) {
 
     T = pow(1.0 + 1.0 / X, am1);
     if (V * X * (T - 1.0) / (b - 1.0) <= T / b) {
-      return (long)X;
+      return (int64_t)X;
     }
   }
 }
diff --git a/numpy/random/tests/test_against_numpy.py b/numpy/random/tests/test_against_numpy.py
index b930fcbff3ce..3c83932e8bb8 100644
--- a/numpy/random/tests/test_against_numpy.py
+++ b/numpy/random/tests/test_against_numpy.py
@@ -183,6 +183,7 @@ def test_standard_exponential(self):
                                 self.rs.standard_exponential)
         self._is_state_common_legacy()
 
+    @pytest.mark.xfail(reason='Stream broken for simplicity')
     def test_tomaxint(self):
         self._set_common_state()
         self._is_state_common()
@@ -327,6 +328,7 @@ def test_multinomial(self):
                            g(100, np.array(p), size=(7, 23)))
         self._is_state_common()
 
+    @pytest.mark.xfail(reason='Stream broken for performance')
     def test_choice(self):
         self._set_common_state()
         self._is_state_common()
diff --git a/numpy/random/tests/test_generator_mt19937.py b/numpy/random/tests/test_generator_mt19937.py
index 895e7fc6cf4a..d591ca3399e9 100644
--- a/numpy/random/tests/test_generator_mt19937.py
+++ b/numpy/random/tests/test_generator_mt19937.py
@@ -542,25 +542,25 @@ def test_random_sample_unsupported_type(self):
     def test_choice_uniform_replace(self):
         random.brng.seed(self.seed)
         actual = random.choice(4, 4)
-        desired = np.array([2, 3, 2, 3])
+        desired = np.array([2, 3, 2, 3], dtype=np.int64)
         assert_array_equal(actual, desired)
 
     def test_choice_nonuniform_replace(self):
         random.brng.seed(self.seed)
         actual = random.choice(4, 4, p=[0.4, 0.4, 0.1, 0.1])
-        desired = np.array([1, 1, 2, 2])
+        desired = np.array([1, 1, 2, 2], dtype=np.int64)
         assert_array_equal(actual, desired)
 
     def test_choice_uniform_noreplace(self):
         random.brng.seed(self.seed)
         actual = random.choice(4, 3, replace=False)
-        desired = np.array([0, 1, 3])
+        desired = np.array([0, 2, 3], dtype=np.int64)
         assert_array_equal(actual, desired)
 
     def test_choice_nonuniform_noreplace(self):
         random.brng.seed(self.seed)
         actual = random.choice(4, 3, replace=False, p=[0.1, 0.3, 0.5, 0.1])
-        desired = np.array([2, 3, 1])
+        desired = np.array([2, 3, 1], dtype=np.int64)
         assert_array_equal(actual, desired)
 
     def test_choice_noninteger(self):
@@ -569,11 +569,22 @@ def test_choice_noninteger(self):
         desired = np.array(['c', 'd', 'c', 'd'])
         assert_array_equal(actual, desired)
 
+    def test_choice_multidimensional_default_axis(self):
+        random.brng.seed(self.seed)
+        actual = random.choice([[0, 1], [2, 3], [4, 5], [6, 7]], 3)
+        desired = np.array([[4, 5], [6, 7], [4, 5]])
+        assert_array_equal(actual, desired)
+
+    def test_choice_multidimensional_custom_axis(self):
+        random.brng.seed(self.seed)
+        actual = random.choice([[0, 1], [2, 3], [4, 5], [6, 7]], 1, axis=1)
+        desired = np.array([[0], [2], [4], [6]])
+        assert_array_equal(actual, desired)
+
     def test_choice_exceptions(self):
         sample = random.choice
         assert_raises(ValueError, sample, -1, 3)
         assert_raises(ValueError, sample, 3., 3)
-        assert_raises(ValueError, sample, [[1, 2], [3, 4]], 3)
         assert_raises(ValueError, sample, [], 3)
         assert_raises(ValueError, sample, [1, 2, 3, 4], 3,
                       p=[[0.25, 0.25], [0.25, 0.25]])
@@ -639,6 +650,29 @@ def test_choice_nan_probabilities(self):
         p = [None, None, None]
         assert_raises(ValueError, random.choice, a, p=p)
 
+    def test_choice_return_type(self):
+        # gh 9867
+        p = np.ones(4) / 4.
+        actual = random.choice(4, 2)
+        assert actual.dtype == np.int64
+        actual = random.choice(4, 2, replace=False)
+        assert actual.dtype == np.int64
+        actual = random.choice(4, 2, p=p)
+        assert actual.dtype == np.int64
+        actual = random.choice(4, 2, p=p, replace=False)
+        assert actual.dtype == np.int64
+
+    def test_choice_large_sample(self):
+        import hashlib
+
+        choice_hash = '6395868be877d27518c832213c17977c'
+        random.brng.seed(self.seed)
+        actual = random.choice(10000, 5000, replace=False)
+        if sys.byteorder != 'little':
+            actual = actual.byteswap()
+        res = hashlib.md5(actual.view(np.int8)).hexdigest()
+        assert_(choice_hash == res)
+
     def test_bytes(self):
         random.brng.seed(self.seed)
         actual = random.bytes(10)
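
Note on the replace=False path added to choice() above: when the tail-shuffle heuristic is not taken, indices are drawn with Robert Floyd's sampling algorithm. The snippet below is a minimal pure-Python sketch of that idea only; the name floyd_sample and the use of the stdlib random module are illustrative assumptions, not part of the patch, which works on a preallocated int64 buffer under the bit generator's lock.

    import random

    def floyd_sample(pop_size, size, rng=random):
        # Draw `size` distinct indices from range(pop_size) with Floyd's
        # algorithm: expected O(size) work, no O(pop_size) permutation.
        chosen = set()
        for j in range(pop_size - size, pop_size):
            t = rng.randint(0, j)  # inclusive of j, like random_interval(brng, j)
            # If t was already chosen, j itself cannot have been (every earlier
            # pick is below the current j), so substituting j keeps every
            # size-subset equally likely.
            chosen.add(t if t not in chosen else j)
        return chosen

    print(sorted(floyd_sample(10000, 5)))  # five distinct indices below 10000

Whichever branch runs, the selected indices are then applied with a.take(np.asarray(idx, dtype=np.intp), axis=axis), which is what lets the new axis keyword select along a non-default axis (see test_choice_multidimensional_custom_axis above).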