Commit abfc49a

Use prefetching conditionally.

Using the prefetch buffer only helps if there are enough objects to examine. Use the long-lived object count to decide if it is worth enabling. If not, fall back to the previous method of marking objects alive (dereference object pointers as we encounter them). Improve the code by adding some helper functions and comments, and by general tidying. The buffer indexing logic now uses a bit mask rather than the % operator. Also includes some other small optimizations that help only a little.
1 parent 15ada78 commit abfc49a

File tree

2 files changed: +165 -51 lines changed
Lines changed: 5 additions & 4 deletions
@@ -1,4 +1,5 @@
-The free-threaded version of the cyclic garbage collector has been optimized
-to use CPU prefetch instructions during the collection. On large object
-graphs, this can reduce collection times by making it more likely that data
-is in the CPU cache when its needed.
+The free-threaded version of the cyclic garbage collector has been optimized to
+conditionally use CPU prefetch instructions during the collection. This can
+reduce collection times by making it more likely that data is in the CPU cache
+when it is needed. The prefetch instructions are enabled if the number of
+long-lived objects (objects surviving a full collection) exceeds a threshold.
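
As background for the NEWS entry above: explicit prefetching is normally issued through a compiler builtin such as GCC/Clang's __builtin_prefetch. The following is a minimal, self-contained sketch of the general technique only; the array, the PREFETCH_DIST value, and process() are illustrative assumptions, not the GC's actual code.

    #include <stddef.h>
    #include <stdio.h>

    #define PREFETCH_DIST 8   // assumed prefetch distance; tune per workload

    static long sum;

    static void process(int *item) { sum += *item; }

    static void
    process_all(int **items, size_t n)
    {
        for (size_t i = 0; i < n; i++) {
            if (i + PREFETCH_DIST < n) {
                // Ask the CPU to start loading a future element's cache line
                // now, so it is likely resident when dereferenced below.
                __builtin_prefetch(items[i + PREFETCH_DIST]);
            }
            process(items[i]);
        }
    }

    int main(void)
    {
        int vals[64];
        int *ptrs[64];
        for (int i = 0; i < 64; i++) { vals[i] = i; ptrs[i] = &vals[i]; }
        process_all(ptrs, 64);
        printf("sum = %ld\n", sum);   // 0+1+...+63 = 2016
        return 0;
    }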

Python/gc_free_threading.c

Lines changed: 160 additions & 47 deletions
@@ -485,9 +485,10 @@ gc_maybe_untrack(PyObject *op)
 // enough time between the enqueue and dequeue so that the needed memory
 // for the object, most importantly ob_gc_bits and ob_type words, will
 // already be in the CPU cache.
-#define BUFFER_SIZE 256
+#define BUFFER_SIZE 256 // this must be a power of 2
 #define BUFFER_HI 16
 #define BUFFER_LO 8
+#define BUFFER_MASK (BUFFER_SIZE - 1)

 // Prefetch instructions will fetch the line of data from memory that
 // contains the byte specified with the source operand to a location in
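
The BUFFER_MASK definition relies on a standard identity: for unsigned integers and a power-of-two size N, i % N equals i & (N - 1), and the AND form avoids a division. A self-contained check of the idea (the SIZE/MASK names here are illustrative, not from CPython):

    #include <assert.h>

    #define SIZE 256               // must be a power of 2
    #define MASK (SIZE - 1)        // 0xFF

    int main(void)
    {
        for (unsigned int i = 0; i < 100000; i++) {
            // For power-of-two sizes the two index computations agree.
            assert(i % SIZE == (i & MASK));
        }
        return 0;
    }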
@@ -554,15 +555,63 @@ typedef struct {
 } gc_span_stack_t;

 typedef struct {
-    Py_ssize_t enqueued;
-    Py_ssize_t dequeued;
+    unsigned int in;
+    unsigned int out;
     _PyObjectStack stack;
     gc_span_stack_t spans;
     PyObject *buffer[BUFFER_SIZE];
+    bool use_prefetch;
 } gc_mark_args_t;

-// Called when we run out of space in the buffer. The object will be added
-// to gc_mark_args.stack instead.
+
+// Returns number of entries in buffer
+static inline unsigned int
+gc_mark_buffer_len(gc_mark_args_t *args)
+{
+    return args->in - args->out;
+}
+
+// Returns number of free entry slots in buffer
+static inline unsigned int
+gc_mark_buffer_avail(gc_mark_args_t *args)
+{
+    return BUFFER_SIZE - gc_mark_buffer_len(args);
+}
+
+static inline bool
+gc_mark_buffer_is_empty(gc_mark_args_t *args)
+{
+    return args->in == args->out;
+}
+
+static inline bool
+gc_mark_buffer_is_full(gc_mark_args_t *args)
+{
+    return gc_mark_buffer_len(args) == BUFFER_SIZE;
+}
+
+static inline PyObject *
+gc_mark_buffer_pop(gc_mark_args_t *args)
+{
+    assert(!gc_mark_buffer_is_empty(args));
+    PyObject *op = args->buffer[args->out & BUFFER_MASK];
+    args->out++;
+    return op;
+}
+
+// Called when there is space in the buffer for the object. Issue the
+// prefetch instruction and add it to the end of the buffer.
+static inline void
+gc_mark_buffer_push(PyObject *op, gc_mark_args_t *args)
+{
+    assert(!gc_mark_buffer_is_full(args));
+    prefetch(op);
+    args->buffer[args->in & BUFFER_MASK] = op;
+    args->in++;
+}
+
+// Called when we run out of space in the buffer or if the prefetching
+// is disabled. The object will be pushed on the gc_mark_args.stack.
 static int
 gc_mark_stack_push(_PyObjectStack *ms, PyObject *op)
 {
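
The helpers above implement a power-of-two ring buffer with free-running unsigned indices: in and out only ever increment, and unsigned wraparound is harmless because in - out stays correct modulo 2^32 as long as the buffer never holds more than BUFFER_SIZE items. A standalone sketch of the same pattern (toy ring_t type and sizes, not CPython's):

    #include <assert.h>
    #include <limits.h>
    #include <stdbool.h>

    #define SIZE 8                 // power of 2
    #define MASK (SIZE - 1)

    typedef struct {
        unsigned int in, out;      // free-running; never reset or wrapped
        int slots[SIZE];
    } ring_t;

    static unsigned int ring_len(const ring_t *r) { return r->in - r->out; }
    static bool ring_full(const ring_t *r) { return ring_len(r) == SIZE; }
    static bool ring_empty(const ring_t *r) { return r->in == r->out; }

    static void ring_push(ring_t *r, int v)
    {
        assert(!ring_full(r));
        r->slots[r->in & MASK] = v;   // mask, not %, selects the slot
        r->in++;
    }

    static int ring_pop(ring_t *r)
    {
        assert(!ring_empty(r));
        int v = r->slots[r->out & MASK];
        r->out++;
        return v;
    }

    int main(void)
    {
        // Start near the wraparound point to show the arithmetic still
        // works when the counters overflow.
        ring_t r = { .in = UINT_MAX - 2, .out = UINT_MAX - 2 };
        for (int v = 0; v < 6; v++) {
            ring_push(&r, v);
        }
        assert(ring_len(&r) == 6);    // correct despite counter overflow
        for (int v = 0; v < 6; v++) {
            assert(ring_pop(&r) == v);
        }
        return 0;
    }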
@@ -575,6 +624,9 @@ gc_mark_stack_push(_PyObjectStack *ms, PyObject *op)
 static int
 gc_mark_span_push(gc_span_stack_t *ss, PyObject **start, PyObject **end)
 {
+    if (start == end) {
+        return 0;
+    }
     if (ss->size >= ss->capacity) {
         if (ss->capacity == 0) {
             ss->capacity = 256;
@@ -594,27 +646,36 @@ gc_mark_span_push(gc_span_stack_t *ss, PyObject **start, PyObject **end)
     return 0;
 }

-// Called when there is space in the buffer for the object. Add it to the end
-// of the buffer and issue the prefetch instruction.
-static void
-gc_mark_buffer_push(PyObject *op, gc_mark_args_t *args)
+static int
+gc_mark_enqueue_no_buffer(PyObject *op, gc_mark_args_t *args)
 {
-#ifdef Py_DEBUG
-    Py_ssize_t buf_used = args->enqueued - args->dequeued;
-    assert(buf_used < BUFFER_SIZE);
-#endif
-    prefetch(op);
-    args->buffer[args->enqueued % BUFFER_SIZE] = op;
-    args->enqueued++;
+    if (op == NULL) {
+        return 0;
+    }
+    if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) {
+        return 0;
+    }
+    if (gc_is_alive(op)) {
+        return 0; // already visited this object
+    }
+    if (gc_maybe_untrack(op)) {
+        return 0; // was untracked, don't visit it
+    }
+
+    // Need to call tp_traverse on this object. Add to stack and mark it
+    // alive so we don't traverse it a second time.
+    gc_set_alive(op);
+    if (_PyObjectStack_Push(&args->stack, op) < 0) {
+        return -1;
+    }
+    return 0;
 }

-// Called when we find an object that needs to be marked alive (either from a
-// root or from calling tp_traverse).
 static int
-gc_mark_enqueue(PyObject *op, gc_mark_args_t *args)
+gc_mark_enqueue_buffer(PyObject *op, gc_mark_args_t *args)
 {
     assert(op != NULL);
-    if (args->enqueued - args->dequeued < BUFFER_SIZE) {
+    if (!gc_mark_buffer_is_full(args)) {
         gc_mark_buffer_push(op, args);
         return 0;
     }
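
gc_mark_enqueue_no_buffer above is the classic iterative marking pattern: filter out objects that need no work, set the alive bit before pushing so nothing is traversed twice, and use an explicit stack instead of recursion. A toy, self-contained version of that shape, with a hypothetical node_t graph standing in for PyObject:

    #include <assert.h>
    #include <stdbool.h>
    #include <stddef.h>

    typedef struct node {
        bool alive;                 // plays the role of _PyGC_BITS_ALIVE
        size_t n_children;
        struct node **children;
    } node_t;

    // Iterative mark: set the bit *before* pushing, so every node is
    // pushed (and traversed) at most once, even in cyclic graphs.
    static void
    mark_all(node_t *root, node_t **stack, size_t cap)
    {
        size_t sp = 0;
        if (root == NULL || root->alive) {
            return;
        }
        root->alive = true;
        stack[sp++] = root;
        while (sp > 0) {
            node_t *n = stack[--sp];
            for (size_t i = 0; i < n->n_children; i++) {
                node_t *c = n->children[i];
                if (c != NULL && !c->alive) {
                    c->alive = true;
                    assert(sp < cap);
                    stack[sp++] = c;
                }
            }
        }
    }

    int main(void)
    {
        node_t a = {0}, b = {0}, c = {0};
        node_t *a_children[] = { &b, &c };
        a.children = a_children; a.n_children = 2;
        node_t *b_children[] = { &a };   // cycle back to a
        b.children = b_children; b.n_children = 1;
        node_t *stack[16];
        mark_all(&a, stack, 16);
        assert(a.alive && b.alive && c.alive);
        return 0;
    }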
@@ -623,12 +684,31 @@ gc_mark_enqueue(PyObject *op, gc_mark_args_t *args)
     }
 }

+// Called when we find an object that needs to be marked alive (either from a
+// root or from calling tp_traverse).
+static int
+gc_mark_enqueue(PyObject *op, gc_mark_args_t *args)
+{
+    if (args->use_prefetch) {
+        return gc_mark_enqueue_buffer(op, args);
+    }
+    else {
+        return gc_mark_enqueue_no_buffer(op, args);
+    }
+}
+
+// Called when we have a contiguous sequence of PyObject pointers, either
+// a tuple or list object. This will add the items to the buffer if there
+// is space for them all, otherwise push a new "span" on the span stack. Using
+// spans has the advantage of not creating a deep _PyObjectStack stack when
+// dealing with long sequences. Those sequences will be processed in smaller
+// chunks by the gc_prime_from_spans() function.
 static int
 gc_mark_enqueue_span(PyObject **item, Py_ssize_t size, gc_mark_args_t *args)
 {
-    Py_ssize_t used = args->enqueued - args->dequeued;
+    Py_ssize_t used = gc_mark_buffer_len(args);
     Py_ssize_t free = BUFFER_SIZE - used;
-    if (free > size) {
+    if (free >= size) {
         for (Py_ssize_t i = 0; i < size; i++) {
             PyObject *op = item[i];
             if (op == NULL) {
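
The span-stack comment above is the key idea: a long tuple or list contributes one (start, end) pointer pair rather than one stack slot per element, and the pair is drained in small chunks later. A minimal sketch of that chunked draining (illustrative types; the budget of 16 loosely mirrors BUFFER_HI, not an exact CPython constant):

    #include <assert.h>
    #include <stddef.h>

    typedef struct {
        void **start;
        void **end;
    } span_t;

    static size_t visited;
    static void visit(void *p) { (void)p; visited++; }

    // Drain at most `budget` non-NULL items from the span. If items
    // remain, the caller keeps the advanced span around for later, which
    // mirrors how gc_prime_from_spans() re-pushes a partial span.
    static span_t
    drain_span(span_t s, size_t budget)
    {
        while (s.start < s.end && budget > 0) {
            if (*s.start != NULL) {
                visit(*s.start);
                budget--;
            }
            s.start++;
        }
        return s;
    }

    int main(void)
    {
        int x;
        void *items[100];
        for (int i = 0; i < 100; i++) {
            items[i] = &x;   // stand-ins for PyObject pointers
        }
        span_t s = { items, items + 100 };
        while (s.start < s.end) {
            s = drain_span(s, 16);   // process the "list" 16 at a time
        }
        assert(visited == 100);
        return 0;
    }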
@@ -694,9 +774,9 @@ gc_abort_mark_alive(PyInterpreterState *interp,
                     struct collection_state *state,
                     gc_mark_args_t *args)
 {
-    // We failed to allocate memory for "stack" while doing the "mark
-    // alive" phase. In that case, free the object stack and make sure
-    // that no objects have the alive bit set.
+    // We failed to allocate memory while doing the "mark alive" phase.
+    // In that case, free the memory used for marking state and make
+    // sure that no objects have the alive bit set.
     _PyObjectStack_Clear(&args->stack);
     if (args->spans.stack != NULL) {
         PyMem_Free(args->spans.stack);
@@ -1089,24 +1169,26 @@ move_legacy_finalizer_reachable(struct collection_state *state);
 static void
 gc_prime_from_spans(gc_mark_args_t *args)
 {
-    Py_ssize_t space = BUFFER_HI - (args->enqueued - args->dequeued);
-    assert(space >= 1); // needed to make progress
+    Py_ssize_t space = BUFFER_HI - gc_mark_buffer_len(args);
+    // there should always be at least this amount of space
+    assert(space <= gc_mark_buffer_avail(args));
+    assert(space > 0);
     gc_span_t entry = args->spans.stack[--args->spans.size];
-    while (entry.start < entry.end) {
+    // spans on the stack should always have one or more elements
+    assert(entry.start < entry.end);
+    do {
         PyObject *op = *entry.start;
+        entry.start++;
         if (op != NULL) {
-            if (space > 0) {
-                gc_mark_buffer_push(op, args);
-                space--;
-            }
-            else {
-                // no more space in buffer, push remaining
+            gc_mark_buffer_push(op, args);
+            space--;
+            if (space == 0) {
+                // buffer is as full as we want and not done with span
                 gc_mark_span_push(&args->spans, entry.start, entry.end);
-                break;
+                return;
             }
         }
-        entry.start++;
-    }
+    } while (entry.start < entry.end);
 }

 static void
@@ -1120,36 +1202,36 @@ gc_prime_buffer(gc_mark_args_t *args)
         // likely cause the stack to be used shortly after when it
         // fills. We want to use the buffer as much as possible and so
         // we only fill to BUFFER_HI, not BUFFER_SIZE.
-        Py_ssize_t space = BUFFER_HI - (args->enqueued - args->dequeued);
-        while (space > 0) {
+        Py_ssize_t space = BUFFER_HI - gc_mark_buffer_len(args);
+        assert(space > 0);
+        do {
             PyObject *op = _PyObjectStack_Pop(&args->stack);
             if (op == NULL) {
                 return;
             }
             gc_mark_buffer_push(op, args);
             space--;
-        }
+        } while (space > 0);
     }
 }

 static int
-gc_propagate_alive(gc_mark_args_t *args)
+gc_propagate_alive_prefetch(gc_mark_args_t *args)
 {
     for (;;) {
-        Py_ssize_t buf_used = args->enqueued - args->dequeued;
+        Py_ssize_t buf_used = gc_mark_buffer_len(args);
         if (buf_used <= BUFFER_LO) {
             // The mark buffer is getting empty. If it's too empty
             // then there will not be enough delay between issuing
             // the prefetch and when the object is actually accessed.
             // Prime the buffer with object pointers from the stack or
             // from the spans, if there are any available.
             gc_prime_buffer(args);
-            if (args->enqueued == args->dequeued) {
-                return 0; // buffer empty, done
+            if (gc_mark_buffer_is_empty(args)) {
+                return 0;
             }
         }
-        PyObject *op = args->buffer[args->dequeued % BUFFER_SIZE];
-        args->dequeued++;
+        PyObject *op = gc_mark_buffer_pop(args);

         if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) {
             continue;
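
BUFFER_LO and BUFFER_HI act as low and high watermarks: the loop above consumes until the buffer drops to BUFFER_LO, then refills toward BUFFER_HI, so each popped pointer was prefetched several pops earlier. A toy simulation of that refill policy, counters only, with no real objects or prefetching:

    #include <stdio.h>

    #define HI 16
    #define LO 8

    int main(void)
    {
        int pending = 100;   // items not yet entered into the window
        int window = 0;      // items prefetched but not yet consumed
        int consumed = 0;
        while (window > 0 || pending > 0) {
            if (window <= LO && pending > 0) {
                // refill: bring the in-flight window back up to HI,
                // standing in for gc_prime_buffer()
                while (window < HI && pending > 0) {
                    window++;   // stands in for gc_mark_buffer_push()
                    pending--;
                }
            }
            window--;           // stands in for gc_mark_buffer_pop()
            consumed++;
        }
        printf("consumed %d items\n", consumed);  // consumed 100 items
        return 0;
    }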
@@ -1174,12 +1256,35 @@ gc_propagate_alive(gc_mark_args_t *args)
                 return -1;
             }
         }
-        else if (traverse(op, (visitproc)&gc_mark_enqueue, args) < 0) {
+        else if (traverse(op, (visitproc)&gc_mark_enqueue_buffer, args) < 0) {
             return -1;
         }
     }
 }

+static int
+gc_propagate_alive(gc_mark_args_t *args)
+{
+    if (args->use_prefetch) {
+        return gc_propagate_alive_prefetch(args);
+    }
+    else {
+        for (;;) {
+            PyObject *op = _PyObjectStack_Pop(&args->stack);
+            if (op == NULL) {
+                break;
+            }
+            assert(_PyObject_GC_IS_TRACKED(op));
+            assert(gc_is_alive(op));
+            traverseproc traverse = Py_TYPE(op)->tp_traverse;
+            if (traverse(op, (visitproc)&gc_mark_enqueue_no_buffer, args) < 0) {
+                return -1;
+            }
+        }
+        return 0;
+    }
+}
+
 // Using tp_traverse, mark everything reachable from known root objects
 // (which must be non-garbage) as alive (_PyGC_BITS_ALIVE is set). In
 // most programs, this marks nearly all objects that are not actually
@@ -1206,6 +1311,14 @@ gc_mark_alive_from_roots(PyInterpreterState *interp,
 #endif
     gc_mark_args_t mark_args = { 0 };

+    // Using prefetch instructions is only a win if the set of objects being
+    // examined by the GC does not fit into CPU caches. Otherwise, using the
+    // buffer and prefetch instructions is just overhead. Using the long lived
+    // object count seems a good estimate of whether things will fit in the cache.
+    // On 64-bit platforms, the minimum object size is 32 bytes. A 4MB L2 cache
+    // would hold about 130k objects.
+    mark_args.use_prefetch = interp->gc.long_lived_total > 200000;
+
 #define MARK_ENQUEUE(op) \
     if (op != NULL ) { \
         if (gc_mark_enqueue(op, &mark_args) < 0) { \
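
The 200,000-object threshold in the new code above follows from the comment's cache arithmetic: at a 32-byte minimum object size, an assumed 4 MiB L2 cache holds 4 * 1024 * 1024 / 32 = 131,072 objects, so 200,000 long-lived objects safely exceeds what stays cache-resident. A quick check of that arithmetic:

    #include <stdio.h>

    int main(void)
    {
        long cache_bytes = 4L * 1024 * 1024;   // assumed 4 MiB L2 cache
        long min_obj_size = 32;                // minimum object, 64-bit
        printf("objects fitting in cache: %ld\n",
               cache_bytes / min_obj_size);
        // prints 131072 -- roughly the "130k objects" in the comment
        return 0;
    }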
