diff --git a/numpy/random/generator.pyx b/numpy/random/generator.pyx
index 29a36787ebbc..ffbfc972e248 100644
--- a/numpy/random/generator.pyx
+++ b/numpy/random/generator.pyx
@@ -368,26 +368,11 @@ cdef class RandomGenerator:
                 [ True,  True]]])
 
         """
-        cdef np.npy_intp n
-        cdef np.ndarray randoms
-        cdef int64_t *randoms_data
-
-        if size is None:
-            with self.lock:
-                return random_positive_int(self._brng)
-
-        randoms = np.empty(size, dtype=np.int64)
-        randoms_data = <int64_t*>np.PyArray_DATA(randoms)
-        n = np.PyArray_SIZE(randoms)
-
-        for i in range(n):
-            with self.lock, nogil:
-                randoms_data[i] = random_positive_int(self._brng)
-        return randoms
+        return self.randint(0, np.iinfo(np.int).max + 1, dtype=np.int, size=size)
 
-    def randint(self, low, high=None, size=None, dtype=int, use_masked=True):
+    def randint(self, low, high=None, size=None, dtype=np.int64, use_masked=True):
         """
-        randint(low, high=None, size=None, dtype='l', use_masked=True)
+        randint(low, high=None, size=None, dtype='int64', use_masked=True)
 
         Return random integers from `low` (inclusive) to `high` (exclusive).
 
@@ -530,9 +515,9 @@ cdef class RandomGenerator:
         return self.randint(0, 4294967296, size=n_uint32, dtype=np.uint32).tobytes()[:length]
 
     @cython.wraparound(True)
-    def choice(self, a, size=None, replace=True, p=None):
+    def choice(self, a, size=None, replace=True, p=None, axis=0):
         """
-        choice(a, size=None, replace=True, p=None)
+        choice(a, size=None, replace=True, p=None, axis=0)
 
         Generates a random sample from a given 1-D array
 
@@ -553,6 +538,9 @@ cdef class RandomGenerator:
             The probabilities associated with each entry in a.
            If not given the sample assumes a uniform distribution over all
            entries in a.
+        axis : int, optional
+            The axis along which the selection is performed. The default, 0,
+            selects by row.
 
         Returns
         -------
@@ -562,11 +550,11 @@ cdef class RandomGenerator:
         Raises
         ------
         ValueError
-            If a is an int and less than zero, if a or p are not 1-dimensional,
-            if a is an array-like of size 0, if p is not a vector of
+            If a is an int and less than zero, if p is not 1-dimensional, if
+            a is array-like with a size 0, if p is not a vector of
             probabilities, if a and p have different lengths, or if
             replace=False and the sample size is greater than the population
-            size
+            size.
 
         See Also
         --------
@@ -607,7 +595,14 @@ cdef class RandomGenerator:
               dtype='<U11')
             if size > pop_size:
                 raise ValueError("Cannot take a larger sample than "
@@ -692,7 +685,39 @@ cdef class RandomGenerator:
                     n_uniq += new.size
                 idx = found
             else:
-                idx = self.permutation(pop_size)[:size]
+                size_i = size
+                pop_size_i = pop_size
+                # This is a heuristic tuning; it should be improvable
+                if pop_size_i > 200 and (size > 200 or size > (10 * pop_size // size)):
+                    # Tail shuffle size elements
+                    idx = np.arange(pop_size, dtype=np.int64)
+                    idx_ptr = np.PyArray_BYTES(idx)
+                    buf_ptr = <char*>&buf
+                    self._shuffle_raw(pop_size_i, max(pop_size_i - size_i, 1),
+                                      8, 8, idx_ptr, buf_ptr)
+                    # Copy to allow potentially large array backing idx to be gc'd
+                    idx = idx[(pop_size - size):].copy()
+                else:
+                    # Floyd's algorithm with precomputed indices
+                    # Worst case, O(n**2) when size is close to pop_size
+                    idx = np.empty(size, dtype=np.int64)
+                    idx_data = <int64_t*>np.PyArray_DATA(idx)
+                    idx_set = set()
+                    loc = 0
+                    # Sample indices with one pass to avoid reacquiring the lock
+                    with self.lock:
+                        for j in range(pop_size_i - size_i, pop_size_i):
+                            idx_data[loc] = random_interval(self._brng, j)
+                            loc += 1
+                    loc = 0
+                    while len(idx_set) < size_i:
+                        for j in range(pop_size_i - size_i, pop_size_i):
+                            if idx_data[loc] not in idx_set:
+                                val = idx_data[loc]
+                            else:
+                                idx_data[loc] = val = j
+                            idx_set.add(val)
+                            loc += 1
 
         if shape is not None:
             idx.shape = shape
@@ -714,7 +739,9 @@ cdef class RandomGenerator:
             res[()] = a[idx]
             return res
 
-        return a[idx]
+        # asarray downcasts on 32-bit platforms, always safe
+        # no-op on 64-bit platforms
+        return a.take(np.asarray(idx, dtype=np.intp), axis=axis)
 
     def uniform(self, low=0.0, high=1.0, size=None):
         """
@@ -3986,9 +4013,9 @@ cdef class RandomGenerator:
                 # the most common case, yielding a ~33% performance improvement.
                 # Note that apparently, only one branch can ever be specialized.
                 if itemsize == sizeof(np.npy_intp):
-                    self._shuffle_raw(n, sizeof(np.npy_intp), stride, x_ptr, buf_ptr)
+                    self._shuffle_raw(n, 1, sizeof(np.npy_intp), stride, x_ptr, buf_ptr)
                 else:
-                    self._shuffle_raw(n, itemsize, stride, x_ptr, buf_ptr)
+                    self._shuffle_raw(n, 1, itemsize, stride, x_ptr, buf_ptr)
         elif isinstance(x, np.ndarray) and x.ndim and x.size:
             buf = np.empty_like(x[0, ...])
             with self.lock:
@@ -4007,10 +4034,29 @@ cdef class RandomGenerator:
                     j = random_interval(self._brng, i)
                     x[i], x[j] = x[j], x[i]
 
-    cdef inline _shuffle_raw(self, np.npy_intp n, np.npy_intp itemsize,
-                             np.npy_intp stride, char* data, char* buf):
+    cdef inline _shuffle_raw(self, np.npy_intp n, np.npy_intp first,
+                             np.npy_intp itemsize, np.npy_intp stride,
+                             char* data, char* buf):
+        """
+        Parameters
+        ----------
+        n
+            Number of elements in data
+        first
+            First observation to shuffle.  Shuffles n-1,
+            n-2, ..., first, so that when first=1 the entire
+            array is shuffled
+        itemsize
+            Size in bytes of item
+        stride
+            Array stride
+        data
+            Location of data
+        buf
+            Location of buffer (itemsize)
+        """
         cdef np.npy_intp i, j
-        for i in reversed(range(1, n)):
+        for i in reversed(range(first, n)):
             j = random_interval(self._brng, i)
             string.memcpy(buf, data + j * stride, itemsize)
             string.memcpy(data + j * stride, data + i * stride, itemsize)
diff --git a/numpy/random/src/distributions/distributions.c b/numpy/random/src/distributions/distributions.c
index 5f49b68be4c7..73396a50b317 100644
--- a/numpy/random/src/distributions/distributions.c
+++ b/numpy/random/src/distributions/distributions.c
@@ -1070,7 +1070,7 @@ int64_t random_zipf(brng_t *brng_state, double a) {
 
     T = pow(1.0 + 1.0 / X, am1);
     if (V * X * (T - 1.0) / (b - 1.0) <= T / b) {
-      return (long)X;
+      return (int64_t)X;
     }
   }
 }
diff --git a/numpy/random/tests/test_against_numpy.py b/numpy/random/tests/test_against_numpy.py
index b930fcbff3ce..3c83932e8bb8 100644
--- a/numpy/random/tests/test_against_numpy.py
+++ b/numpy/random/tests/test_against_numpy.py
@@ -183,6 +183,7 @@ def test_standard_exponential(self):
                                 self.rs.standard_exponential)
         self._is_state_common_legacy()
 
+    @pytest.mark.xfail(reason='Stream broken for simplicity')
     def test_tomaxint(self):
         self._set_common_state()
         self._is_state_common()
@@ -327,6 +328,7 @@ def test_multinomial(self):
                            g(100, np.array(p), size=(7, 23)))
         self._is_state_common()
 
+    @pytest.mark.xfail(reason='Stream broken for performance')
     def test_choice(self):
         self._set_common_state()
         self._is_state_common()
diff --git a/numpy/random/tests/test_generator_mt19937.py b/numpy/random/tests/test_generator_mt19937.py
index 895e7fc6cf4a..d591ca3399e9 100644
--- a/numpy/random/tests/test_generator_mt19937.py
+++ b/numpy/random/tests/test_generator_mt19937.py
@@ -542,25 +542,25 @@ def test_random_sample_unsupported_type(self):
     def test_choice_uniform_replace(self):
         random.brng.seed(self.seed)
         actual = random.choice(4, 4)
-        desired = np.array([2, 3, 2, 3])
+        desired = np.array([2, 3, 2, 3], dtype=np.int64)
         assert_array_equal(actual, desired)
 
     def test_choice_nonuniform_replace(self):
         random.brng.seed(self.seed)
         actual = random.choice(4, 4, p=[0.4, 0.4, 0.1, 0.1])
-        desired = np.array([1, 1, 2, 2])
+        desired = np.array([1, 1, 2, 2], dtype=np.int64)
         assert_array_equal(actual, desired)
 
     def test_choice_uniform_noreplace(self):
         random.brng.seed(self.seed)
         actual = random.choice(4, 3, replace=False)
-        desired = np.array([0, 1, 3])
+        desired = np.array([0, 2, 3], dtype=np.int64)
         assert_array_equal(actual, desired)
 
     def test_choice_nonuniform_noreplace(self):
         random.brng.seed(self.seed)
         actual = random.choice(4, 3, replace=False, p=[0.1, 0.3, 0.5, 0.1])
-        desired = np.array([2, 3, 1])
+        desired = np.array([2, 3, 1], dtype=np.int64)
         assert_array_equal(actual, desired)
 
     def test_choice_noninteger(self):
@@ -569,11 +569,22 @@ def test_choice_noninteger(self):
         desired = np.array(['c', 'd', 'c', 'd'])
         assert_array_equal(actual, desired)
 
+    def test_choice_multidimensional_default_axis(self):
+        random.brng.seed(self.seed)
+        actual = random.choice([[0, 1], [2, 3], [4, 5], [6, 7]], 3)
+        desired = np.array([[4, 5], [6, 7], [4, 5]])
+        assert_array_equal(actual, desired)
+
+    def test_choice_multidimensional_custom_axis(self):
+        random.brng.seed(self.seed)
+        actual = random.choice([[0, 1], [2, 3], [4, 5], [6, 7]], 1, axis=1)
+        desired = np.array([[0], [2], [4], [6]])
+        assert_array_equal(actual, desired)
+
     def test_choice_exceptions(self):
         sample = random.choice
         assert_raises(ValueError, sample, -1, 3)
         assert_raises(ValueError, sample, 3., 3)
-        assert_raises(ValueError, sample, [[1, 2], [3, 4]], 3)
         assert_raises(ValueError, sample, [], 3)
         assert_raises(ValueError, sample, [1, 2, 3, 4], 3,
                       p=[[0.25, 0.25], [0.25, 0.25]])
@@ -639,6 +650,29 @@ def test_choice_nan_probabilities(self):
         p = [None, None, None]
         assert_raises(ValueError, random.choice, a, p=p)
 
+    def test_choice_return_type(self):
+        # gh 9867
+        p = np.ones(4) / 4.
+        actual = random.choice(4, 2)
+        assert actual.dtype == np.int64
+        actual = random.choice(4, 2, replace=False)
+        assert actual.dtype == np.int64
+        actual = random.choice(4, 2, p=p)
+        assert actual.dtype == np.int64
+        actual = random.choice(4, 2, p=p, replace=False)
+        assert actual.dtype == np.int64
+
+    def test_choice_large_sample(self):
+        import hashlib
+
+        choice_hash = '6395868be877d27518c832213c17977c'
+        random.brng.seed(self.seed)
+        actual = random.choice(10000, 5000, replace=False)
+        if sys.byteorder != 'little':
+            actual = actual.byteswap()
+        res = hashlib.md5(actual.view(np.int8)).hexdigest()
+        assert_(choice_hash == res)
+
     def test_bytes(self):
         random.brng.seed(self.seed)
         actual = random.bytes(10)
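
Note on the replace=False path added to choice() above: when the tail-shuffle heuristic is not taken, indices are drawn with Robert Floyd's sampling algorithm. The snippet below is a minimal pure-Python sketch of that idea only; the name floyd_sample and the use of the stdlib random module are illustrative assumptions, not part of the patch, which works on a preallocated int64 buffer under the bit generator's lock.

    import random

    def floyd_sample(pop_size, size, rng=random):
        # Draw `size` distinct indices from range(pop_size) with Floyd's
        # algorithm: expected O(size) work, no O(pop_size) permutation.
        chosen = set()
        for j in range(pop_size - size, pop_size):
            t = rng.randint(0, j)  # inclusive of j, like random_interval(brng, j)
            # If t was already chosen, j itself cannot have been (every earlier
            # pick is below the current j), so substituting j keeps every
            # size-subset equally likely.
            chosen.add(t if t not in chosen else j)
        return chosen

    print(sorted(floyd_sample(10000, 5)))  # five distinct indices below 10000

Whichever branch runs, the selected indices are then applied with a.take(np.asarray(idx, dtype=np.intp), axis=axis), which is what lets the new axis keyword select along a non-default axis (see test_choice_multidimensional_custom_axis above).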