@@ -368,26 +368,11 @@ cdef class RandomGenerator:
368
368
[ True, True]]])
369
369
370
370
"""
371
- cdef np .npy_intp n
372
- cdef np .ndarray randoms
373
- cdef int64_t * randoms_data
374
-
375
- if size is None :
376
- with self .lock :
377
- return random_positive_int (self ._brng )
378
-
379
- randoms = < np .ndarray > np .empty (size , dtype = np .int64 )
380
- randoms_data = < int64_t * > np .PyArray_DATA (randoms )
381
- n = np .PyArray_SIZE (randoms )
382
-
383
- for i in range (n ):
384
- with self .lock , nogil :
385
- randoms_data [i ] = random_positive_int (self ._brng )
386
- return randoms
371
+ return self .randint (0 , np .iinfo (np .int ).max + 1 , dtype = np .int , size = size )
387
372
388
- def randint (self , low , high = None , size = None , dtype = int , use_masked = True ):
373
+ def randint (self , low , high = None , size = None , dtype = np . int64 , use_masked = True ):
389
374
"""
390
- randint(low, high=None, size=None, dtype='l ', use_masked=True)
375
+ randint(low, high=None, size=None, dtype='int64 ', use_masked=True)
391
376
392
377
Return random integers from `low` (inclusive) to `high` (exclusive).
393
378
@@ -530,9 +515,9 @@ cdef class RandomGenerator:
530
515
return self .randint (0 , 4294967296 , size = n_uint32 , dtype = np .uint32 ).tobytes ()[:length ]
531
516
532
517
@cython .wraparound (True )
533
- def choice (self , a , size = None , replace = True , p = None ):
518
+ def choice (self , a , size = None , replace = True , p = None , axis = 0 ):
534
519
"""
535
- choice(a, size=None, replace=True, p=None)
520
+ choice(a, size=None, replace=True, p=None, axis=0)
536
521
537
522
Generates a random sample from a given 1-D array
538
523
@@ -553,6 +538,9 @@ cdef class RandomGenerator:
553
538
The probabilities associated with each entry in a.
554
539
If not given the sample assumes a uniform distribution over all
555
540
entries in a.
541
+ axis : int, optional
542
+ The axis along which the selection is performed. The default, 0,
543
+ selects by row.
556
544
557
545
Returns
558
546
-------
@@ -562,11 +550,11 @@ cdef class RandomGenerator:
562
550
Raises
563
551
------
564
552
ValueError
565
- If a is an int and less than zero, if a or p are not 1-dimensional,
566
- if a is an array-like of size 0, if p is not a vector of
553
+ If a is an int and less than zero, if p is not 1-dimensional, if
554
+ a is array-like with size 0, if p is not a vector of
567
555
probabilities, if a and p have different lengths, or if
568
556
replace=False and the sample size is greater than the population
569
- size
557
+ size.
570
558
571
559
See Also
572
560
--------
@@ -607,7 +595,14 @@ cdef class RandomGenerator:
607
595
dtype='<U11')
608
596
609
597
"""
610
-
598
+ cdef char * idx_ptr
599
+ cdef int64_t buf
600
+ cdef char * buf_ptr
601
+
602
+ cdef set idx_set
603
+ cdef int64_t val , t , loc , size_i , pop_size_i
604
+ cdef int64_t * idx_data
605
+ cdef np .npy_intp j
611
606
# Format and Verify input
612
607
a = np .array (a , copy = False )
613
608
if a .ndim == 0 :
@@ -618,11 +613,9 @@ cdef class RandomGenerator:
618
613
raise ValueError ("a must be 1-dimensional or an integer" )
619
614
if pop_size <= 0 and np .prod (size ) != 0 :
620
615
raise ValueError ("a must be greater than 0 unless no samples are taken" )
621
- elif a .ndim != 1 :
622
- raise ValueError ("a must be 1-dimensional" )
623
616
else :
624
- pop_size = a .shape [0 ]
625
- if pop_size is 0 and np .prod (size ) != 0 :
617
+ pop_size = a .shape [axis ]
618
+ if pop_size == 0 and np .prod (size ) != 0 :
626
619
raise ValueError ("'a' cannot be empty unless no samples are taken" )
627
620
628
621
if p is not None :
@@ -661,9 +654,9 @@ cdef class RandomGenerator:
661
654
cdf /= cdf [- 1 ]
662
655
uniform_samples = self .random_sample (shape )
663
656
idx = cdf .searchsorted (uniform_samples , side = 'right' )
664
- idx = np .array (idx , copy = False ) # searchsorted returns a scalar
657
+ idx = np .array (idx , copy = False , dtype = np . int64 ) # searchsorted returns a scalar
665
658
else :
666
- idx = self .randint (0 , pop_size , size = shape )
659
+ idx = self .randint (0 , pop_size , size = shape , dtype = np . int64 )
667
660
else :
668
661
if size > pop_size :
669
662
raise ValueError ("Cannot take a larger sample than "
@@ -692,7 +685,39 @@ cdef class RandomGenerator:
692
685
n_uniq += new .size
693
686
idx = found
694
687
else :
695
- idx = self .permutation (pop_size )[:size ]
688
+ size_i = size
689
+ pop_size_i = pop_size
690
+ # This cutoff is a heuristic and can likely be tuned further.
691
+ if pop_size_i > 200 and (size > 200 or size > (10 * pop_size // size )):
692
+ # Tail shuffle size elements
693
+ idx = np .arange (pop_size , dtype = np .int64 )
694
+ idx_ptr = np .PyArray_BYTES (< np .ndarray > idx )
695
+ buf_ptr = < char * > & buf
696
+ self ._shuffle_raw (pop_size_i , max (pop_size_i - size_i ,1 ),
697
+ 8 , 8 , idx_ptr , buf_ptr )
698
+ # Copy so the potentially large array backing idx can be garbage-collected
699
+ idx = idx [(pop_size - size ):].copy ()
700
+ else :
701
+ # Floyd's algorithm with precomputed indices
702
+ # Worst case, O(n**2) when size is close to pop_size
703
+ idx = np .empty (size , dtype = np .int64 )
704
+ idx_data = < int64_t * > np .PyArray_DATA (< np .ndarray > idx )
705
+ idx_set = set ()
706
+ loc = 0
707
+ # Sample indices with one pass to avoid reacquiring the lock
708
+ with self .lock :
709
+ for j in range (pop_size_i - size_i , pop_size_i ):
710
+ idx_data [loc ] = random_interval (self ._brng , j )
711
+ loc += 1
712
+ loc = 0
713
+ while len (idx_set ) < size_i :
714
+ for j in range (pop_size_i - size_i , pop_size_i ):
715
+ if idx_data [loc ] not in idx_set :
716
+ val = idx_data [loc ]
717
+ else :
718
+ idx_data [loc ] = val = j
719
+ idx_set .add (val )
720
+ loc += 1
696
721
if shape is not None :
697
722
idx .shape = shape
698
723
@@ -714,7 +739,9 @@ cdef class RandomGenerator:
714
739
res [()] = a [idx ]
715
740
return res
716
741
717
- return a [idx ]
742
+ # asarray downcasts on 32-bit platforms, always safe
743
+ # no-op on 64-bit platforms
744
+ return a .take (np .asarray (idx , dtype = np .intp ), axis = axis )
718
745
719
746
def uniform (self , low = 0.0 , high = 1.0 , size = None ):
720
747
"""
@@ -3986,9 +4013,9 @@ cdef class RandomGenerator:
3986
4013
# the most common case, yielding a ~33% performance improvement.
3987
4014
# Note that apparently, only one branch can ever be specialized.
3988
4015
if itemsize == sizeof (np .npy_intp ):
3989
- self ._shuffle_raw (n , sizeof (np .npy_intp ), stride , x_ptr , buf_ptr )
4016
+ self ._shuffle_raw (n , 1 , sizeof (np .npy_intp ), stride , x_ptr , buf_ptr )
3990
4017
else :
3991
- self ._shuffle_raw (n , itemsize , stride , x_ptr , buf_ptr )
4018
+ self ._shuffle_raw (n , 1 , itemsize , stride , x_ptr , buf_ptr )
3992
4019
elif isinstance (x , np .ndarray ) and x .ndim and x .size :
3993
4020
buf = np .empty_like (x [0 , ...])
3994
4021
with self .lock :
@@ -4007,10 +4034,29 @@ cdef class RandomGenerator:
4007
4034
j = random_interval (self ._brng , i )
4008
4035
x [i ], x [j ] = x [j ], x [i ]
4009
4036
4010
- cdef inline _shuffle_raw (self , np .npy_intp n , np .npy_intp itemsize ,
4011
- np .npy_intp stride , char * data , char * buf ):
4037
+ cdef inline _shuffle_raw (self , np .npy_intp n , np .npy_intp first ,
4038
+ np .npy_intp itemsize , np .npy_intp stride ,
4039
+ char * data , char * buf ):
4040
+ """
4041
+ Parameters
4042
+ ----------
4043
+ n
4044
+ Number of elements in data
4045
+ first
4046
+ First observation to shuffle. Shuffles n-1,
4047
+ n-2, ..., first, so that when first=1 the entire
4048
+ array is shuffled
4049
+ itemsize
4050
+ Size in bytes of item
4051
+ stride
4052
+ Array stride
4053
+ data
4054
+ Location of data
4055
+ buf
4056
+ Location of buffer (itemsize)
4057
+ """
4012
4058
cdef np .npy_intp i , j
4013
- for i in reversed (range (1 , n )):
4059
+ for i in reversed (range (first , n )):
4014
4060
j = random_interval (self ._brng , i )
4015
4061
string .memcpy (buf , data + j * stride , itemsize )
4016
4062
string .memcpy (data + j * stride , data + i * stride , itemsize )
0 commit comments