
Commit 7f51104

Implement "spans" for marking phase.
When traversing a list or tuple, use a "span" if the buffer can't hold all the items from the collection. This reduces the size of the object stack needed when large collections are encountered. It also helps keep the buffer size optimal for prefetching.
1 parent 9fbfd43 commit 7f51104
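
As a reading aid before the diff: a minimal standalone sketch (not the CPython code below) of the span idea the commit message describes. If a collection's item array fits in the prefetch buffer, the items are copied in; otherwise the traversal records a single {start, end} pointer pair to drain later. The buffer size, type names, and fixed-capacity span stack here are illustrative assumptions, not the identifiers used in the diff.

/* sketch.c -- illustrative only; names and sizes are not CPython's */
#include <stdio.h>

#define BUF_SIZE 8                /* stand-in for the real BUFFER_SIZE */

typedef struct { void **start; void **end; } span_t;

typedef struct {
    size_t enqueued, dequeued;    /* monotonic counters; difference = items buffered */
    void *buffer[BUF_SIZE];       /* ring buffer, indexed modulo BUF_SIZE */
    span_t spans[16];             /* fixed-capacity span stack, for simplicity */
    size_t nspans;
} mark_state_t;

static void buffer_push(mark_state_t *st, void *op)
{
    /* the real code issues a prefetch instruction here */
    st->buffer[st->enqueued % BUF_SIZE] = op;
    st->enqueued++;
}

/* Enqueue `size` items: copy them when the buffer has room, otherwise
 * defer the whole range as one {start, end} span. */
static void enqueue_span(mark_state_t *st, void **items, size_t size)
{
    size_t used = st->enqueued - st->dequeued;
    size_t free_slots = BUF_SIZE - used;
    if (free_slots > size) {
        for (size_t i = 0; i < size; i++) {
            buffer_push(st, items[i]);
        }
    }
    else {
        st->spans[st->nspans].start = items;
        st->spans[st->nspans].end = items + size;
        st->nspans++;
    }
}

int main(void)
{
    mark_state_t st = {0};
    void *small[4] = {0}, *large[100] = {0};

    enqueue_span(&st, small, 4);    /* fits: items copied into the buffer */
    enqueue_span(&st, large, 100);  /* too big: deferred as a single span */

    printf("buffered=%zu spans=%zu\n", st.enqueued - st.dequeued, st.nspans);
    return 0;
}

Deferring a range this way costs two pointers regardless of the collection's size, which is what keeps the auxiliary stack small when a huge list or tuple shows up.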

File tree

1 file changed: +123 -45 lines changed


Python/gc_free_threading.c

Lines changed: 123 additions & 45 deletions
@@ -482,8 +482,8 @@ gc_maybe_untrack(PyObject *op)
 #define BUFFER_LO 8
 
 #if defined(__GNUC__) || defined(__clang__)
-#define PREFETCH_L1(ptr) __builtin_prefetch(ptr, 1, 3)
-#define PREFETCH_L2(ptr) __builtin_prefetch(ptr, 1, 2)
+#define PREFETCH_L1(ptr) __builtin_prefetch(ptr, 0, 3)
+#define PREFETCH_L2(ptr) __builtin_prefetch(ptr, 0, 2)
 #elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) && !defined(_M_ARM64EC)
 #include <mmintrin.h>
 #define PREFETCH_L1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
@@ -497,17 +497,30 @@ gc_maybe_untrack(PyObject *op)
 #endif
 
 #ifdef GC_ENABLE_PREFETCH_INSTRUCTIONS
-#define prefetch(ptr) PREFETCH_L2(ptr)
+#define prefetch(ptr) PREFETCH_L1(ptr)
 #else
 #define prefetch(ptr)
 #endif
 
-struct gc_mark_args {
+// a contiguous sequence of PyObject pointers
+typedef struct {
+    PyObject **start;
+    PyObject **end;
+} gc_span_t;
+
+typedef struct {
+    Py_ssize_t size;
+    Py_ssize_t capacity;
+    gc_span_t *stack;
+} gc_span_stack_t;
+
+typedef struct {
     Py_ssize_t enqueued;
     Py_ssize_t dequeued;
     _PyObjectStack stack;
+    gc_span_stack_t spans;
     PyObject *buffer[BUFFER_SIZE];
-};
+} gc_mark_args_t;
 
 // Called when we run out of space in the buffer. The object will be added
 // to gc_mark_args.stack instead.
@@ -520,24 +533,45 @@ gc_mark_stack_push(_PyObjectStack *ms, PyObject *op)
     return 0;
 }
 
+static int
+gc_mark_span_push(gc_span_stack_t *ss, PyObject **start, PyObject **end)
+{
+    if (ss->size >= ss->capacity) {
+        if (ss->capacity == 0) {
+            ss->capacity = 256;
+        }
+        else {
+            ss->capacity *= 2;
+        }
+        ss->stack = (gc_span_t *)PyMem_Realloc(ss->stack, ss->capacity * sizeof(gc_span_t));
+        if (ss->stack == NULL) {
+            return -1;
+        }
+    }
+    ss->stack[ss->size].start = start;
+    ss->stack[ss->size].end = end;
+    ss->size++;
+    return 0;
+}
+
 // Called when there is space in the buffer for the object. Add it to the end
 // of the buffer and issue the prefetch instruction.
-static inline void
-gc_mark_buffer_push(PyObject *op, struct gc_mark_args *args)
+static void
+gc_mark_buffer_push(PyObject *op, gc_mark_args_t *args)
 {
 #ifdef Py_DEBUG
     Py_ssize_t buf_used = args->enqueued - args->dequeued;
     assert(buf_used < BUFFER_SIZE);
 #endif
+    prefetch(op);
     args->buffer[args->enqueued % BUFFER_SIZE] = op;
     args->enqueued++;
-    prefetch(op);
 }
 
 // Called when we find an object that needs to be marked alive (either from a
 // root or from calling tp_traverse).
 static int
-gc_mark_enqueue(PyObject *op, struct gc_mark_args *args)
+gc_mark_enqueue(PyObject *op, gc_mark_args_t *args)
 {
     assert(op != NULL);
     if (args->enqueued - args->dequeued < BUFFER_SIZE) {
@@ -549,6 +583,25 @@ gc_mark_enqueue(PyObject *op, struct gc_mark_args *args)
     }
 }
 
+static int
+gc_mark_enqueue_span(PyObject **item, Py_ssize_t size, gc_mark_args_t *args)
+{
+    Py_ssize_t used = args->enqueued - args->dequeued;
+    Py_ssize_t free = BUFFER_SIZE - used;
+    if (free > size) {
+        for (Py_ssize_t i = 0; i < size; i++) {
+            gc_mark_buffer_push(item[i], args);
+        }
+    }
+    else {
+        PyObject **end = &item[size];
+        if (gc_mark_span_push(&args->spans, item, end) < 0) {
+            return -1;
+        }
+    }
+    return 0;
+}
+
 static bool
 gc_clear_alive_bits(const mi_heap_t *heap, const mi_heap_area_t *area,
                     void *block, size_t block_size, void *args)
@@ -570,10 +623,8 @@ gc_mark_traverse_list(PyObject *self, void *args)
     if (list->ob_item == NULL) {
         return 0;
     }
-    for (Py_ssize_t i = 0; i < Py_SIZE(list); i++) {
-        if (gc_mark_enqueue(list->ob_item[i], args) < 0) {
-            return -1;
-        }
+    if (gc_mark_enqueue_span(list->ob_item, PyList_GET_SIZE(list), args) < 0) {
+        return -1;
     }
     return 0;
 }
@@ -586,33 +637,30 @@ gc_mark_traverse_tuple(PyObject *self, void *args)
         return 0;
     }
     PyTupleObject *tuple = _PyTuple_CAST(self);
-    for (Py_ssize_t i = Py_SIZE(tuple); --i >= 0; ) {
-        PyObject *item = tuple->ob_item[i];
-        if (item == NULL) {
-            continue;
-        }
-        if (gc_mark_enqueue(tuple->ob_item[i], args) < 0) {
-            return -1;
-        }
+    if (gc_mark_enqueue_span(tuple->ob_item, Py_SIZE(tuple), args) < 0) {
+        return -1;
     }
     return 0;
 }
 
 static void
 gc_abort_mark_alive(PyInterpreterState *interp,
                     struct collection_state *state,
-                    struct gc_mark_args *args)
+                    gc_mark_args_t *args)
 {
     // We failed to allocate memory for "stack" while doing the "mark
     // alive" phase. In that case, free the object stack and make sure
     // that no objects have the alive bit set.
     _PyObjectStack_Clear(&args->stack);
+    if (args->spans.stack != NULL) {
+        PyMem_Free(args->spans.stack);
+    }
     gc_visit_heaps(interp, &gc_clear_alive_bits, &state->base);
 }
 
 #ifdef GC_MARK_ALIVE_STACKS
 static int
-gc_visit_stackref_mark_alive(struct gc_mark_args *args, _PyStackRef stackref)
+gc_visit_stackref_mark_alive(gc_mark_args_t *args, _PyStackRef stackref)
 {
     if (!PyStackRef_IsNull(stackref)) {
         PyObject *op = PyStackRef_AsPyObjectBorrow(stackref);
@@ -624,7 +672,7 @@ gc_visit_stackref_mark_alive(struct gc_mark_args *args, _PyStackRef stackref)
 }
 
 static int
-gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, struct gc_mark_args *args)
+gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, gc_mark_args_t *args)
 {
     _Py_FOR_EACH_TSTATE_BEGIN(interp, p) {
         for (_PyInterpreterFrame *f = p->current_frame; f != NULL; f = f->previous) {
@@ -974,39 +1022,65 @@ move_legacy_finalizer_reachable(struct collection_state *state);
 #ifdef GC_ENABLE_MARK_ALIVE
 
 static void
-gc_mark_buffer_prime(struct gc_mark_args *args)
-{
-    for (;;) {
-        Py_ssize_t buf_used = args->enqueued - args->dequeued;
-        if (buf_used >= BUFFER_HI) {
-            // When priming, don't fill the buffer since that would
-            // likely cause the stack to be used shortly after when it
-            // fills. We want to use the buffer as much as possible and
-            // so we only fill to BUFFER_HI, not BUFFER_SIZE.
-            return;
+gc_prime_from_spans(gc_mark_args_t *args)
+{
+    Py_ssize_t space = BUFFER_HI - (args->enqueued - args->dequeued);
+    assert(space >= 1);  // needed to make progress
+    gc_span_t entry = args->spans.stack[--args->spans.size];
+    while (entry.start < entry.end) {
+        PyObject *op = *entry.start;
+        if (op != NULL) {
+            if (space > 0) {
+                gc_mark_buffer_push(op, args);
+                space--;
+            }
+            else {
+                // no more space in buffer, push remaining
+                gc_mark_span_push(&args->spans, entry.start, entry.end);
+                break;
+            }
         }
-        PyObject *op = _PyObjectStack_Pop(&args->stack);
-        if (op == NULL) {
-            break;
+        entry.start++;
+    }
+}
+
+static void
+gc_prime_buffer(gc_mark_args_t *args)
+{
+    if (args->spans.size > 0) {
+        gc_prime_from_spans(args);
+    }
+    else {
+        // When priming, don't fill the buffer too full since that would
+        // likely cause the stack to be used shortly after when it
+        // fills. We want to use the buffer as much as possible and so
+        // we only fill to BUFFER_HI, not BUFFER_SIZE.
+        Py_ssize_t space = BUFFER_HI - (args->enqueued - args->dequeued);
+        while (space > 0) {
+            PyObject *op = _PyObjectStack_Pop(&args->stack);
+            if (op == NULL) {
+                return;
+            }
+            gc_mark_buffer_push(op, args);
+            space--;
         }
-        gc_mark_buffer_push(op, args);
     }
 }
 
 static int
-gc_propagate_alive(struct gc_mark_args *args)
+gc_propagate_alive(gc_mark_args_t *args)
 {
     for (;;) {
         Py_ssize_t buf_used = args->enqueued - args->dequeued;
         if (buf_used <= BUFFER_LO) {
             // The mark buffer is getting empty. If it's too empty
             // then there will not be enough delay between issuing
-            // the prefetch vs when the object is actually accessed.
-            // Prime the buffer with object pointers from the stack,
-            // if there are any available.
-            gc_mark_buffer_prime(args);
+            // the prefetch and when the object is actually accessed.
+            // Prime the buffer with object pointers from the stack or
+            // from the spans, if there are any available.
+            gc_prime_buffer(args);
             if (args->enqueued == args->dequeued) {
-                return 0; // stack and buffer are both empty
+                return 0;  // buffer empty, done
             }
         }
         PyObject *op = args->buffer[args->dequeued % BUFFER_SIZE];
@@ -1065,7 +1139,7 @@ gc_mark_alive_from_roots(PyInterpreterState *interp,
     // Check that all objects don't have alive bit set
     gc_visit_heaps(interp, &validate_alive_bits, &state->base);
 #endif
-    struct gc_mark_args mark_args = { 0 };
+    gc_mark_args_t mark_args = { 0 };
 
 #define MARK_ENQUEUE(op) \
     if (op != NULL ) { \
@@ -1102,6 +1176,10 @@ gc_mark_alive_from_roots(PyInterpreterState *interp,
         return -1;
     }
 
+    assert(mark_args.spans.size == 0);
+    if (mark_args.spans.stack != NULL) {
+        PyMem_Free(mark_args.spans.stack);
+    }
     assert(mark_args.stack.head == NULL);
 
     return 0;
