Commit 1dd4bb6

Use prefetch in GC mark alive phase.
1 parent: 3829104

2 files changed: +197 −57 lines

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+The free-threaded version of the cyclic garbage collector has been optimized
+to use CPU prefetch instructions during the collection. On large object
+graphs, this can reduce collection times by making it more likely that data
+is in the CPU cache when it is needed.
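To illustrate the general technique behind this news entry, here is a minimal standalone sketch (not the CPython code: the node type, the prefetch distance, and the helper names are invented for this example) of issuing __builtin_prefetch a little ahead of the point where a pointer is dereferenced, so the cache line is likely resident by the time it is needed. GCC and Clang provide __builtin_prefetch(addr, rw, locality).

/* sketch-prefetch.c -- standalone illustration of software prefetching
 * while walking an array of heap objects (GCC/Clang only).  All names
 * and constants here are invented for this sketch. */
#include <stdio.h>
#include <stdlib.h>

#define PREFETCH_DISTANCE 16   /* how far ahead to issue the hint */

typedef struct {
    long value;
} node_t;

static long
sum_nodes(node_t **nodes, size_t n)
{
    long total = 0;
    for (size_t i = 0; i < n; i++) {
        if (i + PREFETCH_DISTANCE < n) {
            /* Read prefetch, high temporal locality: ask the CPU to pull
             * the node we will touch a few iterations from now into cache. */
            __builtin_prefetch(nodes[i + PREFETCH_DISTANCE], 0, 3);
        }
        total += nodes[i]->value;   /* by now this access is likely cached */
    }
    return total;
}

int
main(void)
{
    size_t n = 1000000;
    node_t **nodes = malloc(n * sizeof(node_t *));
    if (nodes == NULL) {
        return 1;
    }
    for (size_t i = 0; i < n; i++) {
        nodes[i] = malloc(sizeof(node_t));
        if (nodes[i] == NULL) {
            return 1;
        }
        nodes[i]->value = (long)i;
    }
    printf("sum = %ld\n", sum_nodes(nodes, n));
    for (size_t i = 0; i < n; i++) {
        free(nodes[i]);
    }
    free(nodes);
    return 0;
}

The commit's prefetch(ptr) macro uses __builtin_prefetch(ptr, 1, 3), i.e. a write prefetch with high temporal locality, since the marking pass goes on to update ob_gc_bits.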

Python/gc_free_threading.c

Lines changed: 193 additions & 57 deletions
@@ -21,6 +21,9 @@
 // enable the "mark alive" pass of GC
 #define GC_ENABLE_MARK_ALIVE 1
 
+// if true, enable the use of "prefetch" CPU instructions
+#define GC_ENABLE_PREFETCH_INSTRUCTIONS 1
+
 // include additional roots in "mark alive" pass
 #define GC_MARK_ALIVE_EXTRA_ROOTS 1
 
@@ -464,29 +467,75 @@ gc_maybe_untrack(PyObject *op)
 }
 
 #ifdef GC_ENABLE_MARK_ALIVE
+
+// prefetch buffer and stack //////////////////////////////////
+
+// The buffer is a circular FIFO queue of PyObject pointers. We take
+// care to not dereference these pointers until they are taken out of
+// the buffer. A prefetch CPU instruction is issued when a pointer is
+// put into the buffer. If all is working as expected, there will be
+// enough time between the enqueue and dequeue so that the needed memory
+// for the object, most importantly ob_gc_bits and ob_type words, will
+// already be in the CPU cache.
+#define BUFFER_SIZE 256
+#define BUFFER_HI 16
+#define BUFFER_LO 8
+
+#if !(defined(__GNUC__) || defined(__clang__))
+#undef GC_ENABLE_PREFETCH_INSTRUCTIONS
+#endif
+
+#ifdef GC_ENABLE_PREFETCH_INSTRUCTIONS
+#define prefetch(ptr) __builtin_prefetch(ptr, 1, 3)
+#else
+#define prefetch(ptr)
+#endif
+
+struct gc_mark_args {
+    Py_ssize_t enqueued;
+    Py_ssize_t dequeued;
+    _PyObjectStack stack;
+    PyObject *buffer[BUFFER_SIZE];
+};
+
+// Called when we run out of space in the buffer. The object will be added
+// to gc_mark_args.stack instead.
 static int
-mark_alive_stack_push(PyObject *op, _PyObjectStack *stack)
+gc_mark_stack_push(_PyObjectStack *ms, PyObject *op)
 {
-    if (op == NULL) {
-        return 0;
+    if (_PyObjectStack_Push(ms, op) < 0) {
+        return -1;
     }
-    if (!_PyObject_GC_IS_TRACKED(op)) {
+    return 0;
+}
+
+// Called when there is space in the buffer for the object. Add it to the end
+// of the buffer and issue the prefetch instruction.
+static inline void
+gc_mark_buffer_push(PyObject *op, struct gc_mark_args *args)
+{
+#if Py_DEBUG
+    Py_ssize_t buf_used = args->enqueued - args->dequeued;
+    assert(buf_used < BUFFER_SIZE);
+#endif
+    args->buffer[args->enqueued % BUFFER_SIZE] = op;
+    args->enqueued++;
+    prefetch(op);
+}
+
+// Called when we find an object that needs to be marked alive (either from a
+// root or from calling tp_traverse).
+static int
+gc_mark_enqueue(PyObject *op, struct gc_mark_args *args)
+{
+    assert(op != NULL);
+    if (args->enqueued - args->dequeued < BUFFER_SIZE) {
+        gc_mark_buffer_push(op, args);
         return 0;
     }
-    if (gc_is_alive(op)) {
-        return 0; // already visited this object
-    }
-    if (gc_maybe_untrack(op)) {
-        return 0; // was untracked, don't visit it
-    }
-
-    // Need to call tp_traverse on this object. Add to stack and mark it
-    // alive so we don't traverse it a second time.
-    gc_set_alive(op);
-    if (_PyObjectStack_Push(stack, op) < 0) {
-        return -1;
+    else {
+        return gc_mark_stack_push(&args->stack, op);
     }
-    return 0;
 }
 
 static bool
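The comment block above describes the prefetch queue as a circular FIFO indexed by two running counters. Below is a minimal standalone sketch of that counter-based ring buffer (invented names, an int payload, and a tiny capacity; the real buffer holds PyObject pointers, has 256 entries, and spills to an _PyObjectStack when full):

/* Sketch of a fixed-size FIFO ring buffer driven by two monotonically
 * increasing counters.  The capacity is a power of two so the modulo
 * can reduce to a cheap mask. */
#include <assert.h>
#include <stdio.h>

#define CAPACITY 8   /* small value for the sketch; the GC buffer uses 256 */

struct ring {
    size_t enqueued;   /* total number of pushes so far */
    size_t dequeued;   /* total number of pops so far */
    int items[CAPACITY];
};

static int
ring_is_full(const struct ring *r)
{
    return r->enqueued - r->dequeued == CAPACITY;
}

static int
ring_is_empty(const struct ring *r)
{
    return r->enqueued == r->dequeued;
}

static void
ring_push(struct ring *r, int v)
{
    assert(!ring_is_full(r));
    r->items[r->enqueued % CAPACITY] = v;
    r->enqueued++;   /* the GC version issues prefetch(op) at this point */
}

static int
ring_pop(struct ring *r)
{
    assert(!ring_is_empty(r));
    int v = r->items[r->dequeued % CAPACITY];
    r->dequeued++;
    return v;
}

int
main(void)
{
    struct ring r = {0};
    for (int i = 0; i < 5; i++) {
        ring_push(&r, i);
    }
    while (!ring_is_empty(&r)) {
        printf("%d ", ring_pop(&r));
    }
    printf("\n");
    return 0;
}

Because the counters only ever increase, the fill level is a single subtraction, and the power-of-two capacity lets the compiler turn the modulo into a mask.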
@@ -503,36 +552,68 @@ gc_clear_alive_bits(const mi_heap_t *heap, const mi_heap_area_t *area,
     return true;
 }
 
+static int
+gc_mark_traverse_list(PyObject *self, void *args)
+{
+    PyListObject *list = (PyListObject *)self;
+    if (list->ob_item == NULL) {
+        return 0;
+    }
+    for (Py_ssize_t i = 0; i < Py_SIZE(list); i++) {
+        if (gc_mark_enqueue(list->ob_item[i], args) < 0) {
+            return -1;
+        }
+    }
+    return 0;
+}
+
+static int
+gc_mark_traverse_tuple(PyObject *self, void *args)
+{
+    _PyTuple_MaybeUntrack(self);
+    if (!gc_has_bit(self, _PyGC_BITS_TRACKED)) {
+        return 0;
+    }
+    PyTupleObject *tuple = _PyTuple_CAST(self);
+    for (Py_ssize_t i = Py_SIZE(tuple); --i >= 0; ) {
+        PyObject *item = tuple->ob_item[i];
+        if (item == NULL) {
+            continue;
+        }
+        if (gc_mark_enqueue(tuple->ob_item[i], args) < 0) {
+            return -1;
+        }
+    }
+    return 0;
+}
+
 static void
 gc_abort_mark_alive(PyInterpreterState *interp,
                     struct collection_state *state,
-                    _PyObjectStack *stack)
+                    struct gc_mark_args *args)
 {
     // We failed to allocate memory for "stack" while doing the "mark
     // alive" phase. In that case, free the object stack and make sure
     // that no objects have the alive bit set.
-    _PyObjectStack_Clear(stack);
+    _PyObjectStack_Clear(&args->stack);
     gc_visit_heaps(interp, &gc_clear_alive_bits, &state->base);
 }
 
 #ifdef GC_MARK_ALIVE_STACKS
 static int
-gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref)
+gc_visit_stackref_mark_alive(struct gc_mark_args *args, _PyStackRef stackref)
 {
-    // Note: we MUST check that it is deferred before checking the rest.
-    // Otherwise we might read into invalid memory due to non-deferred references
-    // being dead already.
-    if (PyStackRef_IsDeferred(stackref) && !PyStackRef_IsNull(stackref)) {
+    if (!PyStackRef_IsNull(stackref)) {
         PyObject *op = PyStackRef_AsPyObjectBorrow(stackref);
-        if (mark_alive_stack_push(op, stack) < 0) {
+        if (gc_mark_enqueue(op, args) < 0) {
            return -1;
        }
    }
    return 0;
 }
 
 static int
-gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *stack)
+gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, struct gc_mark_args *args)
 {
     _Py_FOR_EACH_TSTATE_BEGIN(interp, p) {
         for (_PyInterpreterFrame *f = p->current_frame; f != NULL; f = f->previous) {
@@ -542,12 +623,12 @@ gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *st
             }
 
             PyCodeObject *co = (PyCodeObject *)executable;
-            int max_stack = co->co_nlocalsplus + co->co_stacksize;
-            if (gc_visit_stackref_mark_alive(stack, f->f_executable) < 0) {
+            int max_stack = co->co_nlocals;
+            if (gc_visit_stackref_mark_alive(args, f->f_executable) < 0) {
                 return -1;
             }
             for (int i = 0; i < max_stack; i++) {
-                if (gc_visit_stackref_mark_alive(stack, f->localsplus[i]) < 0) {
+                if (gc_visit_stackref_mark_alive(args, f->localsplus[i]) < 0) {
                     return -1;
                 }
             }
@@ -880,22 +961,73 @@ static int
 move_legacy_finalizer_reachable(struct collection_state *state);
 
 #ifdef GC_ENABLE_MARK_ALIVE
-static int
-propagate_alive_bits(_PyObjectStack *stack)
+
+static void
+gc_mark_buffer_prime(struct gc_mark_args *args)
 {
     for (;;) {
-        PyObject *op = _PyObjectStack_Pop(stack);
+        Py_ssize_t buf_used = args->enqueued - args->dequeued;
+        if (buf_used >= BUFFER_HI) {
+            // When priming, don't fill the buffer since that would
+            // likely cause the stack to be used shortly after when it
+            // fills. We want to use the buffer as much as possible and
+            // so we only fill to BUFFER_HI, not BUFFER_SIZE.
+            return;
+        }
+        PyObject *op = _PyObjectStack_Pop(&args->stack);
         if (op == NULL) {
             break;
         }
-        assert(_PyObject_GC_IS_TRACKED(op));
-        assert(gc_is_alive(op));
+        gc_mark_buffer_push(op, args);
+    }
+}
+
+static int
+gc_propagate_alive(struct gc_mark_args *args)
+{
+    for (;;) {
+        Py_ssize_t buf_used = args->enqueued - args->dequeued;
+        if (buf_used <= BUFFER_LO) {
+            // The mark buffer is getting empty. If it's too empty
+            // then there will not be enough delay between issuing
+            // the prefetch vs when the object is actually accessed.
+            // Prime the buffer with object pointers from the stack,
+            // if there are any available.
+            gc_mark_buffer_prime(args);
+            if (args->enqueued == args->dequeued) {
+                return 0; // stack and buffer are both empty
+            }
+        }
+        PyObject *op = args->buffer[args->dequeued % BUFFER_SIZE];
+        args->dequeued++;
+
+        if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) {
+            continue;
+        }
+
+        if (gc_is_alive(op)) {
+            continue; // already visited this object
+        }
+
+        // Need to call tp_traverse on this object. Mark it alive so we
+        // don't traverse it a second time.
+        gc_set_alive(op);
+
         traverseproc traverse = Py_TYPE(op)->tp_traverse;
-        if (traverse(op, (visitproc)&mark_alive_stack_push, stack) < 0) {
+        if (traverse == PyList_Type.tp_traverse) {
+            if (gc_mark_traverse_list(op, args) < 0) {
+                return -1;
+            }
+        }
+        else if (traverse == PyTuple_Type.tp_traverse) {
+            if (gc_mark_traverse_tuple(op, args) < 0) {
+                return -1;
+            }
+        }
+        else if (traverse(op, (visitproc)&gc_mark_enqueue, args) < 0) {
             return -1;
         }
     }
-    return 0;
 }
 
 // Using tp_traverse, mark everything reachable from known root objects
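gc_propagate_alive and gc_mark_buffer_prime above implement a low/high watermark policy: refill the ring from the overflow stack only when it drops to BUFFER_LO, and only up to BUFFER_HI, so that freshly discovered objects keep landing in the (prefetched) buffer rather than on the stack. A compact standalone sketch of that control flow, with invented names and plain integers standing in for objects:

/* Sketch of the buffer-priming policy: refill the ring from the overflow
 * stack only when it drops to LO, and only up to HI.  The constants and
 * the fixed-size stack are invented for this sketch. */
#include <stdio.h>

#define CAPACITY 16
#define LO 2
#define HI 8

struct work {
    size_t enqueued, dequeued;   /* ring counters */
    int ring[CAPACITY];
    int stack[1024];             /* stand-in for the unbounded _PyObjectStack */
    int stack_top;
};

static void
prime(struct work *w)
{
    while (w->enqueued - w->dequeued < HI && w->stack_top > 0) {
        w->ring[w->enqueued % CAPACITY] = w->stack[--w->stack_top];
        w->enqueued++;           /* the real code issues prefetch() here */
    }
}

static void
drain(struct work *w)
{
    for (;;) {
        if (w->enqueued - w->dequeued <= LO) {
            prime(w);
            if (w->enqueued == w->dequeued) {
                return;          /* ring and stack are both empty */
            }
        }
        int item = w->ring[w->dequeued % CAPACITY];
        w->dequeued++;
        printf("processing %d\n", item);   /* tp_traverse would run here */
    }
}

int
main(void)
{
    struct work w = {0};
    for (int i = 0; i < 20; i++) {   /* seed some work on the stack */
        w.stack[w.stack_top++] = i;
    }
    drain(&w);
    return 0;
}

In the commit, BUFFER_HI (16) and BUFFER_LO (8) play the same roles: priming stops at 16 queued entries, and re-priming happens once 8 or fewer remain, keeping enough distance between each prefetch and the later dereference.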
@@ -915,48 +1047,52 @@ propagate_alive_bits(_PyObjectStack *stack)
 //
 // Returns -1 on failure (out of memory).
 static int
-mark_alive_from_roots(PyInterpreterState *interp,
-                      struct collection_state *state)
+gc_mark_alive_from_roots(PyInterpreterState *interp,
+                         struct collection_state *state)
 {
 #ifdef GC_DEBUG
     // Check that all objects don't have alive bit set
     gc_visit_heaps(interp, &validate_alive_bits, &state->base);
 #endif
-    _PyObjectStack stack = { NULL };
-
-#define STACK_PUSH(op) \
-    if (mark_alive_stack_push(op, &stack) < 0) { \
-        gc_abort_mark_alive(interp, state, &stack); \
-        return -1; \
+    struct gc_mark_args mark_args = { 0 };
+
+#define MARK_ENQUEUE(op) \
+    if (op != NULL ) { \
+        if (gc_mark_enqueue(op, &mark_args) < 0) { \
+            gc_abort_mark_alive(interp, state, &mark_args); \
+            return -1; \
+        } \
     }
-    STACK_PUSH(interp->sysdict);
+    MARK_ENQUEUE(interp->sysdict);
 #ifdef GC_MARK_ALIVE_EXTRA_ROOTS
-    STACK_PUSH(interp->builtins);
-    STACK_PUSH(interp->dict);
+    MARK_ENQUEUE(interp->builtins);
+    MARK_ENQUEUE(interp->dict);
     struct types_state *types = &interp->types;
     for (int i = 0; i < _Py_MAX_MANAGED_STATIC_BUILTIN_TYPES; i++) {
-        STACK_PUSH(types->builtins.initialized[i].tp_dict);
-        STACK_PUSH(types->builtins.initialized[i].tp_subclasses);
+        MARK_ENQUEUE(types->builtins.initialized[i].tp_dict);
+        MARK_ENQUEUE(types->builtins.initialized[i].tp_subclasses);
     }
     for (int i = 0; i < _Py_MAX_MANAGED_STATIC_EXT_TYPES; i++) {
-        STACK_PUSH(types->for_extensions.initialized[i].tp_dict);
-        STACK_PUSH(types->for_extensions.initialized[i].tp_subclasses);
+        MARK_ENQUEUE(types->for_extensions.initialized[i].tp_dict);
+        MARK_ENQUEUE(types->for_extensions.initialized[i].tp_subclasses);
     }
 #endif
 #ifdef GC_MARK_ALIVE_STACKS
-    if (gc_visit_thread_stacks_mark_alive(interp, &stack) < 0) {
-        gc_abort_mark_alive(interp, state, &stack);
+    if (gc_visit_thread_stacks_mark_alive(interp, &mark_args) < 0) {
+        gc_abort_mark_alive(interp, state, &mark_args);
         return -1;
     }
 #endif
-#undef STACK_PUSH
+#undef MARK_ENQUEUE
 
     // Use tp_traverse to find everything reachable from roots.
-    if (propagate_alive_bits(&stack) < 0) {
-        gc_abort_mark_alive(interp, state, &stack);
+    if (gc_propagate_alive(&mark_args) < 0) {
+        gc_abort_mark_alive(interp, state, &mark_args);
         return -1;
     }
 
+    assert(mark_args.stack.head == NULL);
+
     return 0;
 }
 #endif // GC_ENABLE_MARK_ALIVE
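One more detail from gc_propagate_alive earlier in the diff that is easy to miss: instead of always traversing through the generic visitproc callback, the commit compares an object's tp_traverse slot against PyList_Type.tp_traverse and PyTuple_Type.tp_traverse and, on a match, walks the items directly. A minimal sketch of that slot-comparison dispatch, with invented types and names:

/* Sketch of "compare the traverse slot, call a specialized loop".
 * Types, names, and the visit callback are invented for illustration. */
#include <stdio.h>

typedef int (*visitproc)(int item, void *arg);
typedef int (*traverseproc)(void *self, visitproc visit, void *arg);

struct fake_list { int n; int items[4]; };

/* Generic traversal: one indirect call per item. */
static int
list_traverse(void *self, visitproc visit, void *arg)
{
    struct fake_list *list = self;
    for (int i = 0; i < list->n; i++) {
        int r = visit(list->items[i], arg);
        if (r != 0) {
            return r;
        }
    }
    return 0;
}

static int
count_visit(int item, void *arg)
{
    (void)item;
    (*(int *)arg)++;
    return 0;
}

/* Specialized traversal: same effect, no per-item indirect call. */
static int
count_list_fast(struct fake_list *list, int *counter)
{
    *counter += list->n;
    return 0;
}

static int
process(void *obj, traverseproc traverse, int *counter)
{
    if (traverse == list_traverse) {
        /* Known type: inline the loop instead of paying for the callback. */
        return count_list_fast(obj, counter);
    }
    return traverse(obj, count_visit, counter);
}

int
main(void)
{
    struct fake_list l = { 3, { 10, 20, 30 } };
    int counter = 0;
    process(&l, list_traverse, &counter);
    printf("visited %d items\n", counter);
    return 0;
}

The real fast paths additionally re-check the tracked bit and, for tuples, give _PyTuple_MaybeUntrack a chance to untrack the object before its items are enqueued.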
@@ -1531,7 +1667,7 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state,
     if (!state->gcstate->freeze_active) {
         // Mark objects reachable from known roots as "alive". These will
         // be ignored for rest of the GC pass.
-        int err = mark_alive_from_roots(interp, state);
+        int err = gc_mark_alive_from_roots(interp, state);
         if (err < 0) {
             _PyEval_StartTheWorld(interp);
             PyErr_NoMemory();
