@@ -482,8 +482,8 @@ gc_maybe_untrack(PyObject *op)
#define BUFFER_LO 8

#if defined(__GNUC__) || defined(__clang__)
- #define PREFETCH_L1(ptr) __builtin_prefetch(ptr, 1, 3)
- #define PREFETCH_L2(ptr) __builtin_prefetch(ptr, 1, 2)
+ #define PREFETCH_L1(ptr) __builtin_prefetch(ptr, 0, 3)
+ #define PREFETCH_L2(ptr) __builtin_prefetch(ptr, 0, 2)
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) && !defined(_M_ARM64EC)
#include <mmintrin.h>
#define PREFETCH_L1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
@@ -497,17 +497,30 @@ gc_maybe_untrack(PyObject *op)
#endif

#ifdef GC_ENABLE_PREFETCH_INSTRUCTIONS
- #define prefetch(ptr) PREFETCH_L2(ptr)
+ #define prefetch(ptr) PREFETCH_L1(ptr)
#else
#define prefetch(ptr)
#endif

- struct gc_mark_args {
+ // a contiguous sequence of PyObject pointers
+ typedef struct {
+     PyObject **start;
+     PyObject **end;
+ } gc_span_t;
+
+ typedef struct {
+     Py_ssize_t size;
+     Py_ssize_t capacity;
+     gc_span_t *stack;
+ } gc_span_stack_t;
+
+ typedef struct {
    Py_ssize_t enqueued;
    Py_ssize_t dequeued;
    _PyObjectStack stack;
+     gc_span_stack_t spans;
    PyObject *buffer[BUFFER_SIZE];
- };
+ } gc_mark_args_t;

// Called when we run out of space in the buffer. The object will be added
// to gc_mark_args.stack instead.
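
Note on the prefetch macro changes above: in GCC/Clang's __builtin_prefetch(addr, rw, locality), the second argument is the read/write hint (0 = prefetch for a read, 1 = for a write) and the third is the temporal-locality hint (0-3, where 3 asks that the line be kept in all cache levels), so the macros now request read prefetches and prefetch(ptr) maps to the L1 variant. The snippet below is an illustrative, standalone sketch of the same "prefetch on push, dereference on pop" ring-buffer pattern that gc_mark_args_t implements; the names and size are placeholders, not part of this patch.

    #include <stddef.h>

    #define RING_SIZE 64   /* placeholder; the real BUFFER_SIZE is defined elsewhere in the file */

    typedef struct {
        size_t enqueued, dequeued;      /* monotonically increasing; difference = items in flight */
        void *ring[RING_SIZE];
    } ring_t;

    static void ring_push(ring_t *r, void *p)
    {
        __builtin_prefetch(p, 0, 3);           /* rw = 0 (read), locality = 3 (keep cached) */
        r->ring[r->enqueued % RING_SIZE] = p;  /* wrap around via modulo, as in the patch */
        r->enqueued++;
    }

    static void *ring_pop(ring_t *r)
    {
        void *p = r->ring[r->dequeued % RING_SIZE];
        r->dequeued++;
        return p;   /* by now the earlier prefetch has had time to pull *p toward the cache */
    }
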
@@ -520,24 +533,45 @@ gc_mark_stack_push(_PyObjectStack *ms, PyObject *op)
    return 0;
}

+ static int
+ gc_mark_span_push(gc_span_stack_t *ss, PyObject **start, PyObject **end)
+ {
+     if (ss->size >= ss->capacity) {
+         if (ss->capacity == 0) {
+             ss->capacity = 256;
+         }
+         else {
+             ss->capacity *= 2;
+         }
+         ss->stack = (gc_span_t *)PyMem_Realloc(ss->stack, ss->capacity * sizeof(gc_span_t));
+         if (ss->stack == NULL) {
+             return -1;
+         }
+     }
+     ss->stack[ss->size].start = start;
+     ss->stack[ss->size].end = end;
+     ss->size++;
+     return 0;
+ }
+
// Called when there is space in the buffer for the object. Add it to the end
// of the buffer and issue the prefetch instruction.
- static inline void
- gc_mark_buffer_push(PyObject *op, struct gc_mark_args *args)
+ static void
+ gc_mark_buffer_push(PyObject *op, gc_mark_args_t *args)
{
#ifdef Py_DEBUG
    Py_ssize_t buf_used = args->enqueued - args->dequeued;
    assert(buf_used < BUFFER_SIZE);
#endif
+     prefetch(op);
    args->buffer[args->enqueued % BUFFER_SIZE] = op;
    args->enqueued++;
-     prefetch(op);
}

// Called when we find an object that needs to be marked alive (either from a
// root or from calling tp_traverse).
static int
- gc_mark_enqueue(PyObject *op, struct gc_mark_args *args)
+ gc_mark_enqueue(PyObject *op, gc_mark_args_t *args)
{
    assert(op != NULL);
    if (args->enqueued - args->dequeued < BUFFER_SIZE) {
@@ -549,6 +583,25 @@ gc_mark_enqueue(PyObject *op, struct gc_mark_args *args)
    }
}

+ static int
+ gc_mark_enqueue_span(PyObject **item, Py_ssize_t size, gc_mark_args_t *args)
+ {
+     Py_ssize_t used = args->enqueued - args->dequeued;
+     Py_ssize_t free = BUFFER_SIZE - used;
+     if (free > size) {
+         for (Py_ssize_t i = 0; i < size; i++) {
+             gc_mark_buffer_push(item[i], args);
+         }
+     }
+     else {
+         PyObject **end = &item[size];
+         if (gc_mark_span_push(&args->spans, item, end) < 0) {
+             return -1;
+         }
+     }
+     return 0;
+ }
+
static bool
gc_clear_alive_bits(const mi_heap_t *heap, const mi_heap_area_t *area,
                    void *block, size_t block_size, void *args)
@@ -570,10 +623,8 @@ gc_mark_traverse_list(PyObject *self, void *args)
    if (list->ob_item == NULL) {
        return 0;
    }
-     for (Py_ssize_t i = 0; i < Py_SIZE(list); i++) {
-         if (gc_mark_enqueue(list->ob_item[i], args) < 0) {
-             return -1;
-         }
+     if (gc_mark_enqueue_span(list->ob_item, PyList_GET_SIZE(list), args) < 0) {
+         return -1;
    }
    return 0;
}
@@ -586,33 +637,30 @@ gc_mark_traverse_tuple(PyObject *self, void *args)
        return 0;
    }
    PyTupleObject *tuple = _PyTuple_CAST(self);
-     for (Py_ssize_t i = Py_SIZE(tuple); --i >= 0; ) {
-         PyObject *item = tuple->ob_item[i];
-         if (item == NULL) {
-             continue;
-         }
-         if (gc_mark_enqueue(tuple->ob_item[i], args) < 0) {
-             return -1;
-         }
+     if (gc_mark_enqueue_span(tuple->ob_item, Py_SIZE(tuple), args) < 0) {
+         return -1;
    }
    return 0;
}

static void
gc_abort_mark_alive(PyInterpreterState *interp,
                    struct collection_state *state,
-                     struct gc_mark_args *args)
+                     gc_mark_args_t *args)
{
    // We failed to allocate memory for "stack" while doing the "mark
    // alive" phase. In that case, free the object stack and make sure
    // that no objects have the alive bit set.
    _PyObjectStack_Clear(&args->stack);
+     if (args->spans.stack != NULL) {
+         PyMem_Free(args->spans.stack);
+     }
    gc_visit_heaps(interp, &gc_clear_alive_bits, &state->base);
}

#ifdef GC_MARK_ALIVE_STACKS
static int
- gc_visit_stackref_mark_alive(struct gc_mark_args *args, _PyStackRef stackref)
+ gc_visit_stackref_mark_alive(gc_mark_args_t *args, _PyStackRef stackref)
{
    if (!PyStackRef_IsNull(stackref)) {
        PyObject *op = PyStackRef_AsPyObjectBorrow(stackref);
@@ -624,7 +672,7 @@ gc_visit_stackref_mark_alive(struct gc_mark_args *args, _PyStackRef stackref)
}

static int
- gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, struct gc_mark_args *args)
+ gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, gc_mark_args_t *args)
{
    _Py_FOR_EACH_TSTATE_BEGIN(interp, p) {
        for (_PyInterpreterFrame *f = p->current_frame; f != NULL; f = f->previous) {
@@ -974,39 +1022,65 @@ move_legacy_finalizer_reachable(struct collection_state *state);
#ifdef GC_ENABLE_MARK_ALIVE

static void
- gc_mark_buffer_prime(struct gc_mark_args *args)
- {
-     for (;;) {
-         Py_ssize_t buf_used = args->enqueued - args->dequeued;
-         if (buf_used >= BUFFER_HI) {
-             // When priming, don't fill the buffer since that would
-             // likely cause the stack to be used shortly after when it
-             // fills. We want to use the buffer as much as possible and
-             // so we only fill to BUFFER_HI, not BUFFER_SIZE.
-             return;
+ gc_prime_from_spans(gc_mark_args_t *args)
+ {
+     Py_ssize_t space = BUFFER_HI - (args->enqueued - args->dequeued);
+     assert(space >= 1);  // needed to make progress
+     gc_span_t entry = args->spans.stack[--args->spans.size];
+     while (entry.start < entry.end) {
+         PyObject *op = *entry.start;
+         if (op != NULL) {
+             if (space > 0) {
+                 gc_mark_buffer_push(op, args);
+                 space--;
+             }
+             else {
+                 // no more space in buffer, push remaining
+                 gc_mark_span_push(&args->spans, entry.start, entry.end);
+                 break;
+             }
        }
-         PyObject *op = _PyObjectStack_Pop(&args->stack);
-         if (op == NULL) {
-             break;
+         entry.start++;
+     }
+ }
+
+ static void
+ gc_prime_buffer(gc_mark_args_t *args)
+ {
+     if (args->spans.size > 0) {
+         gc_prime_from_spans(args);
+     }
+     else {
+         // When priming, don't fill the buffer too full since that would
+         // likely cause the stack to be used shortly after when it
+         // fills. We want to use the buffer as much as possible and so
+         // we only fill to BUFFER_HI, not BUFFER_SIZE.
+         Py_ssize_t space = BUFFER_HI - (args->enqueued - args->dequeued);
+         while (space > 0) {
+             PyObject *op = _PyObjectStack_Pop(&args->stack);
+             if (op == NULL) {
+                 return;
+             }
+             gc_mark_buffer_push(op, args);
+             space--;
        }
-         gc_mark_buffer_push(op, args);
    }
}

static int
- gc_propagate_alive(struct gc_mark_args *args)
+ gc_propagate_alive(gc_mark_args_t *args)
{
    for (;;) {
        Py_ssize_t buf_used = args->enqueued - args->dequeued;
        if (buf_used <= BUFFER_LO) {
            // The mark buffer is getting empty. If it's too empty
            // then there will not be enough delay between issuing
-             // the prefetch vs when the object is actually accessed.
-             // Prime the buffer with object pointers from the stack,
-             // if there are any available.
-             gc_mark_buffer_prime(args);
+             // the prefetch and when the object is actually accessed.
+             // Prime the buffer with object pointers from the stack or
+             // from the spans, if there are any available.
+             gc_prime_buffer(args);
            if (args->enqueued == args->dequeued) {
-                 return 0;  // stack and buffer are both empty
+                 return 0;  // buffer empty, done
            }
        }
        PyObject *op = args->buffer[args->dequeued % BUFFER_SIZE];
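
The BUFFER_LO check above is what preserves the prefetch distance: the buffer is only topped back up (toward BUFFER_HI) once it has drained down to BUFFER_LO entries, so a pointer normally sits in the buffer for several pops between the prefetch issued in gc_mark_buffer_push() and the dereference that follows the pop here. As a standalone illustration of the same idea in its textbook form (not code from this patch), a loop that prefetches a fixed distance ahead of the element it is currently using:

    #include <stddef.h>

    #define DIST 8   /* plays the role of BUFFER_LO: the minimum prefetch lead distance */

    static long sum_pointed_to(long **ptrs, size_t n)
    {
        long total = 0;
        for (size_t i = 0; i < n; i++) {
            if (i + DIST < n) {
                __builtin_prefetch(ptrs[i + DIST], 0, 3);  /* start the load early... */
            }
            total += *ptrs[i];                             /* ...use it DIST iterations later */
        }
        return total;
    }
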
@@ -1065,7 +1139,7 @@ gc_mark_alive_from_roots(PyInterpreterState *interp,
    // Check that all objects don't have alive bit set
    gc_visit_heaps(interp, &validate_alive_bits, &state->base);
#endif
-     struct gc_mark_args mark_args = { 0 };
+     gc_mark_args_t mark_args = { 0 };

#define MARK_ENQUEUE(op) \
    if (op != NULL) { \
@@ -1102,6 +1176,10 @@ gc_mark_alive_from_roots(PyInterpreterState *interp,
        return -1;
    }

+     assert(mark_args.spans.size == 0);
+     if (mark_args.spans.stack != NULL) {
+         PyMem_Free(mark_args.spans.stack);
+     }
    assert(mark_args.stack.head == NULL);

    return 0;