@@ -485,9 +485,10 @@ gc_maybe_untrack(PyObject *op)
// enough time between the enqueue and dequeue so that the needed memory
// for the object, most importantly ob_gc_bits and ob_type words, will
// already be in the CPU cache.
- #define BUFFER_SIZE 256
+ #define BUFFER_SIZE 256  // this must be a power of 2
#define BUFFER_HI 16
#define BUFFER_LO 8
+ #define BUFFER_MASK (BUFFER_SIZE - 1)

// Prefetch instructions will fetch the line of data from memory that
// contains the byte specified with the source operand to a location in
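Because BUFFER_SIZE is a power of two, BUFFER_MASK can turn the free-running enqueue/dequeue counters into array indexes: the counters simply grow (and may wrap) as unsigned integers, `counter & BUFFER_MASK` selects the slot, and `in - out` is always the queue length. A minimal standalone sketch of this indexing scheme (illustration only, not part of the patch):

```c
#include <assert.h>
#include <limits.h>
#include <stdio.h>

#define BUFFER_SIZE 256                    /* must be a power of 2 */
#define BUFFER_MASK (BUFFER_SIZE - 1)

int main(void)
{
    int buffer[BUFFER_SIZE];
    /* start near UINT_MAX to show that counter wraparound is harmless */
    unsigned int in = UINT_MAX - 3;
    unsigned int out = in;

    for (int i = 0; i < 1000; i++) {
        assert(in - out < BUFFER_SIZE);    /* push only when not full */
        buffer[in & BUFFER_MASK] = i;      /* enqueue */
        in++;

        assert(in != out);                 /* pop only when not empty */
        int v = buffer[out & BUFFER_MASK]; /* dequeue */
        out++;
        assert(v == i);
    }
    /* len == in - out holds even after the counters wrap past UINT_MAX,
       because BUFFER_SIZE divides 2^32 exactly. */
    printf("in=%u out=%u len=%u\n", in, out, in - out);
    return 0;
}
```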
@@ -554,15 +555,63 @@ typedef struct {
} gc_span_stack_t;

typedef struct {
-     Py_ssize_t enqueued;
-     Py_ssize_t dequeued;
+     unsigned int in;
+     unsigned int out;
    _PyObjectStack stack;
    gc_span_stack_t spans;
    PyObject *buffer[BUFFER_SIZE];
+     bool use_prefetch;
} gc_mark_args_t;

- // Called when we run out of space in the buffer. The object will be added
- // to gc_mark_args.stack instead.
+
+ // Returns number of entries in buffer
+ static inline unsigned int
+ gc_mark_buffer_len(gc_mark_args_t *args)
+ {
+     return args->in - args->out;
+ }
+
+ // Returns number of free entry slots in buffer
+ static inline unsigned int
+ gc_mark_buffer_avail(gc_mark_args_t *args)
+ {
+     return BUFFER_SIZE - gc_mark_buffer_len(args);
+ }
+
+ static inline bool
+ gc_mark_buffer_is_empty(gc_mark_args_t *args)
+ {
+     return args->in == args->out;
+ }
+
+ static inline bool
+ gc_mark_buffer_is_full(gc_mark_args_t *args)
+ {
+     return gc_mark_buffer_len(args) == BUFFER_SIZE;
+ }
+
+ static inline PyObject *
+ gc_mark_buffer_pop(gc_mark_args_t *args)
+ {
+     assert(!gc_mark_buffer_is_empty(args));
+     PyObject *op = args->buffer[args->out & BUFFER_MASK];
+     args->out++;
+     return op;
+ }
+
+ // Called when there is space in the buffer for the object. Issue the
+ // prefetch instruction and add it to the end of the buffer.
+ static inline void
+ gc_mark_buffer_push(PyObject *op, gc_mark_args_t *args)
+ {
+     assert(!gc_mark_buffer_is_full(args));
+     prefetch(op);
+     args->buffer[args->in & BUFFER_MASK] = op;
+     args->in++;
+ }
+
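gc_mark_buffer_push() issues prefetch(op) at enqueue time so that by the time the object is popped and traversed, its memory is already on its way into the cache. The prefetch() helper itself is defined elsewhere in gc_free_threading.c and is not part of this hunk; as an assumption of roughly what such a helper looks like (not the file's actual definition), a portable sketch could be:

```c
// Hypothetical portable prefetch helper; the real CPython definition is
// outside this diff. GCC and Clang provide __builtin_prefetch(addr, rw,
// locality); compilers without it simply get a no-op.
static inline void
prefetch(const void *ptr)
{
#if defined(__GNUC__) || defined(__clang__)
    __builtin_prefetch(ptr, 0, 3);   // read access, keep in all cache levels
#else
    (void)ptr;                       // no prefetch available: do nothing
#endif
}
```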
+ // Called when we run out of space in the buffer or if the prefetching
+ // is disabled. The object will be pushed on the gc_mark_args.stack.
static int
gc_mark_stack_push(_PyObjectStack *ms, PyObject *op)
{
@@ -575,6 +624,9 @@ gc_mark_stack_push(_PyObjectStack *ms, PyObject *op)
static int
gc_mark_span_push(gc_span_stack_t *ss, PyObject **start, PyObject **end)
{
+     if (start == end) {
+         return 0;
+     }
    if (ss->size >= ss->capacity) {
        if (ss->capacity == 0) {
            ss->capacity = 256;
@@ -594,27 +646,36 @@ gc_mark_span_push(gc_span_stack_t *ss, PyObject **start, PyObject **end)
    return 0;
}

- // Called when there is space in the buffer for the object. Add it to the end
- // of the buffer and issue the prefetch instruction.
- static void
- gc_mark_buffer_push(PyObject *op, gc_mark_args_t *args)
+ static int
+ gc_mark_enqueue_no_buffer(PyObject *op, gc_mark_args_t *args)
{
- #ifdef Py_DEBUG
-     Py_ssize_t buf_used = args->enqueued - args->dequeued;
-     assert(buf_used < BUFFER_SIZE);
- #endif
-     prefetch(op);
-     args->buffer[args->enqueued % BUFFER_SIZE] = op;
-     args->enqueued++;
+     if (op == NULL) {
+         return 0;
+     }
+     if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) {
+         return 0;
+     }
+     if (gc_is_alive(op)) {
+         return 0;  // already visited this object
+     }
+     if (gc_maybe_untrack(op)) {
+         return 0;  // was untracked, don't visit it
+     }
+
+     // Need to call tp_traverse on this object. Add to stack and mark it
+     // alive so we don't traverse it a second time.
+     gc_set_alive(op);
+     if (_PyObjectStack_Push(&args->stack, op) < 0) {
+         return -1;
+     }
+     return 0;
}

- // Called when we find an object that needs to be marked alive (either from a
- // root or from calling tp_traverse).
static int
- gc_mark_enqueue(PyObject *op, gc_mark_args_t *args)
+ gc_mark_enqueue_buffer(PyObject *op, gc_mark_args_t *args)
{
    assert(op != NULL);
-     if (args->enqueued - args->dequeued < BUFFER_SIZE) {
+     if (!gc_mark_buffer_is_full(args)) {
        gc_mark_buffer_push(op, args);
        return 0;
    }
@@ -623,12 +684,31 @@ gc_mark_enqueue(PyObject *op, gc_mark_args_t *args)
    }
}

+ // Called when we find an object that needs to be marked alive (either from a
+ // root or from calling tp_traverse).
+ static int
+ gc_mark_enqueue(PyObject *op, gc_mark_args_t *args)
+ {
+     if (args->use_prefetch) {
+         return gc_mark_enqueue_buffer(op, args);
+     }
+     else {
+         return gc_mark_enqueue_no_buffer(op, args);
+     }
+ }
+
+ // Called when we have a contiguous sequence of PyObject pointers, either
+ // a tuple or list object. This will add the items to the buffer if there
+ // is space for them all; otherwise it pushes a new "span" on the span stack.
+ // Using spans has the advantage of not creating a deep _PyObjectStack stack
+ // when dealing with long sequences. Those sequences will be processed in
+ // smaller chunks by the gc_prime_from_spans() function.
static int
gc_mark_enqueue_span(PyObject **item, Py_ssize_t size, gc_mark_args_t *args)
{
-     Py_ssize_t used = args->enqueued - args->dequeued;
+     Py_ssize_t used = gc_mark_buffer_len(args);
    Py_ssize_t free = BUFFER_SIZE - used;
-     if (free > size) {
+     if (free >= size) {
        for (Py_ssize_t i = 0; i < size; i++) {
            PyObject *op = item[i];
            if (op == NULL) {
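A "span" is just a (start, end) pointer pair into an object's contiguous item array, so enqueueing a large tuple or list costs one span-stack entry rather than one object-stack entry per item; gc_prime_from_spans() later drains each span into the prefetch buffer a few objects at a time. The gc_span_t definition is outside these hunks; a sketch consistent with how entry.start and entry.end are used below (assumed, for illustration only):

```c
// Assumed shape of gc_span_t, matching its use in gc_prime_from_spans().
typedef struct _object PyObject;   /* stand-in for the CPython definition */

typedef struct {
    PyObject **start;   /* next array slot to visit */
    PyObject **end;     /* one past the last slot */
} gc_span_t;

/* Example: a 10,000-item tuple handled by gc_mark_enqueue_span() becomes a
 * single gc_span_t on the span stack instead of thousands of individual
 * stack entries; it is then consumed at most BUFFER_HI objects at a time. */
```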
@@ -694,9 +774,9 @@ gc_abort_mark_alive(PyInterpreterState *interp,
                    struct collection_state *state,
                    gc_mark_args_t *args)
{
-     // We failed to allocate memory for "stack" while doing the "mark
-     // alive" phase. In that case, free the object stack and make sure
-     // that no objects have the alive bit set.
+     // We failed to allocate memory while doing the "mark alive" phase.
+     // In that case, free the memory used for marking state and make
+     // sure that no objects have the alive bit set.
    _PyObjectStack_Clear(&args->stack);
    if (args->spans.stack != NULL) {
        PyMem_Free(args->spans.stack);
@@ -1089,24 +1169,26 @@ move_legacy_finalizer_reachable(struct collection_state *state);
static void
gc_prime_from_spans(gc_mark_args_t *args)
{
-     Py_ssize_t space = BUFFER_HI - (args->enqueued - args->dequeued);
-     assert(space >= 1);  // needed to make progress
+     Py_ssize_t space = BUFFER_HI - gc_mark_buffer_len(args);
+     // there should always be at least this amount of space
+     assert(space <= gc_mark_buffer_avail(args));
+     assert(space > 0);
    gc_span_t entry = args->spans.stack[--args->spans.size];
-     while (entry.start < entry.end) {
+     // spans on the stack should always have one or more elements
+     assert(entry.start < entry.end);
+     do {
        PyObject *op = *entry.start;
+         entry.start++;
        if (op != NULL) {
-             if (space > 0) {
-                 gc_mark_buffer_push(op, args);
-                 space--;
-             }
-             else {
-                 // no more space in buffer, push remaining
+             gc_mark_buffer_push(op, args);
+             space--;
+             if (space == 0) {
+                 // buffer is as full as we want and we are not done with the span
                gc_mark_span_push(&args->spans, entry.start, entry.end);
-                 break;
+                 return;
            }
        }
-         entry.start++;
-     }
+     } while (entry.start < entry.end);
}

static void
@@ -1120,36 +1202,36 @@ gc_prime_buffer(gc_mark_args_t *args)
        // likely cause the stack to be used shortly after when it
        // fills. We want to use the buffer as much as possible and so
        // we only fill to BUFFER_HI, not BUFFER_SIZE.
-         Py_ssize_t space = BUFFER_HI - (args->enqueued - args->dequeued);
-         while (space > 0) {
+         Py_ssize_t space = BUFFER_HI - gc_mark_buffer_len(args);
+         assert(space > 0);
+         do {
            PyObject *op = _PyObjectStack_Pop(&args->stack);
            if (op == NULL) {
                return;
            }
            gc_mark_buffer_push(op, args);
            space--;
-         }
+         } while (space > 0);
    }
}

static int
- gc_propagate_alive(gc_mark_args_t *args)
+ gc_propagate_alive_prefetch(gc_mark_args_t *args)
{
    for (;;) {
-         Py_ssize_t buf_used = args->enqueued - args->dequeued;
+         Py_ssize_t buf_used = gc_mark_buffer_len(args);
        if (buf_used <= BUFFER_LO) {
            // The mark buffer is getting empty. If it's too empty
            // then there will not be enough delay between issuing
            // the prefetch and when the object is actually accessed.
            // Prime the buffer with object pointers from the stack or
            // from the spans, if there are any available.
            gc_prime_buffer(args);
-             if (args->enqueued == args->dequeued) {
-                 return 0;  // buffer empty, done
+             if (gc_mark_buffer_is_empty(args)) {
+                 return 0;
            }
        }
-         PyObject *op = args->buffer[args->dequeued % BUFFER_SIZE];
-         args->dequeued++;
+         PyObject *op = gc_mark_buffer_pop(args);

        if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) {
            continue;
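BUFFER_LO and BUFFER_HI act as low/high watermarks: gc_propagate_alive_prefetch() only refills once the buffer drains to BUFFER_LO, and priming stops at BUFFER_HI rather than BUFFER_SIZE so that pointers discovered by tp_traverse still fit in the buffer instead of spilling onto the stack. A toy standalone model of that refill policy (illustrative only, not the CPython code):

```c
#include <stdio.h>

#define BUFFER_HI 16
#define BUFFER_LO 8

int main(void)
{
    int backlog = 1000;   /* objects waiting on the stack/spans */
    int len = 0;          /* current prefetch-buffer occupancy */
    int refills = 0;

    while (len > 0 || backlog > 0) {
        if (len <= BUFFER_LO && backlog > 0) {
            /* prime: top up only to BUFFER_HI, keeping headroom for
               objects discovered while traversing */
            int take = BUFFER_HI - len;
            if (take > backlog) {
                take = backlog;
            }
            backlog -= take;
            len += take;
            refills++;
        }
        len--;            /* pop one object and "traverse" it */
    }
    printf("drained the backlog in %d refills\n", refills);
    return 0;
}
```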
@@ -1174,12 +1256,35 @@ gc_propagate_alive(gc_mark_args_t *args)
                return -1;
            }
        }
-         else if (traverse(op, (visitproc)&gc_mark_enqueue, args) < 0) {
+         else if (traverse(op, (visitproc)&gc_mark_enqueue_buffer, args) < 0) {
            return -1;
        }
    }
}

+ static int
+ gc_propagate_alive(gc_mark_args_t *args)
+ {
+     if (args->use_prefetch) {
+         return gc_propagate_alive_prefetch(args);
+     }
+     else {
+         for (;;) {
+             PyObject *op = _PyObjectStack_Pop(&args->stack);
+             if (op == NULL) {
+                 break;
+             }
+             assert(_PyObject_GC_IS_TRACKED(op));
+             assert(gc_is_alive(op));
+             traverseproc traverse = Py_TYPE(op)->tp_traverse;
+             if (traverse(op, (visitproc)&gc_mark_enqueue_no_buffer, args) < 0) {
+                 return -1;
+             }
+         }
+         return 0;
+     }
+ }
+
// Using tp_traverse, mark everything reachable from known root objects
// (which must be non-garbage) as alive (_PyGC_BITS_ALIVE is set). In
// most programs, this marks nearly all objects that are not actually
@@ -1206,6 +1311,14 @@ gc_mark_alive_from_roots(PyInterpreterState *interp,
#endif
    gc_mark_args_t mark_args = { 0 };

+     // Using prefetch instructions is only a win if the set of objects being
+     // examined by the GC does not fit into CPU caches. Otherwise, using the
+     // buffer and prefetch instructions is just overhead. Using the long-lived
+     // object count seems a good estimate of whether things will fit in the
+     // cache. On 64-bit platforms, the minimum object size is 32 bytes. A 4MB
+     // L2 cache would hold about 130k objects.
+     mark_args.use_prefetch = interp->gc.long_lived_total > 200000;
+
#define MARK_ENQUEUE(op) \
    if (op != NULL) { \
        if (gc_mark_enqueue(op, &mark_args) < 0) { \