// enable the "mark alive" pass of GC
#define GC_ENABLE_MARK_ALIVE 1

+// if true, enable the use of "prefetch" CPU instructions
+#define GC_ENABLE_PREFETCH_INSTRUCTIONS 1
+
// include additional roots in "mark alive" pass
#define GC_MARK_ALIVE_EXTRA_ROOTS 1

@@ -464,29 +467,75 @@ gc_maybe_untrack(PyObject *op)
}

#ifdef GC_ENABLE_MARK_ALIVE
+
+// prefetch buffer and stack //////////////////////////////////
+
+// The buffer is a circular FIFO queue of PyObject pointers. We take
+// care to not dereference these pointers until they are taken out of
+// the buffer. A prefetch CPU instruction is issued when a pointer is
+// put into the buffer. If all is working as expected, there will be
+// enough time between the enqueue and dequeue so that the needed memory
+// for the object, most importantly ob_gc_bits and ob_type words, will
+// already be in the CPU cache.
+#define BUFFER_SIZE 256
+#define BUFFER_HI 16
+#define BUFFER_LO 8
+
+#if !(defined(__GNUC__) || defined(__clang__))
+#undef GC_ENABLE_PREFETCH_INSTRUCTIONS
+#endif
+
+#ifdef GC_ENABLE_PREFETCH_INSTRUCTIONS
+#define prefetch(ptr) __builtin_prefetch(ptr, 1, 3)
+#else
+#define prefetch(ptr)
+#endif
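For context, `__builtin_prefetch(addr, rw, locality)` is the GCC/Clang builtin behind the `prefetch()` macro: `rw=1` hints that the cache line will be written and `locality=3` asks for it to be kept in as many cache levels as possible. Below is a minimal, self-contained sketch of the general prefetch-ahead idiom the buffer builds on; the `PREFETCH_DIST` constant and the function are illustrative assumptions, not code from this patch.

#include <stddef.h>

#define PREFETCH_DIST 8   /* assumed prefetch distance, illustrative only */

/* Sum an array of pointers, prefetching PREFETCH_DIST elements ahead so the
 * pointed-to memory is (hopefully) cached by the time it is dereferenced. */
static long
sum_with_prefetch(long **ptrs, size_t n)
{
    long total = 0;
    for (size_t i = 0; i < n; i++) {
#if defined(__GNUC__) || defined(__clang__)
        if (i + PREFETCH_DIST < n) {
            __builtin_prefetch(ptrs[i + PREFETCH_DIST], 0, 3);  /* read, keep hot */
        }
#endif
        total += *ptrs[i];
    }
    return total;
}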
+
+struct gc_mark_args {
+    Py_ssize_t enqueued;
+    Py_ssize_t dequeued;
+    _PyObjectStack stack;
+    PyObject *buffer[BUFFER_SIZE];
+};
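Because `enqueued` and `dequeued` only ever increase, the buffer's occupancy is simply their difference and the slot index is the counter taken modulo `BUFFER_SIZE`; with a power-of-two size the modulo compiles down to a mask. A small self-contained sketch of that arithmetic, using plain `int` payloads and illustrative names rather than the real structure:

#include <assert.h>
#include <stddef.h>

#define RING_SIZE 256   /* power of two, mirroring BUFFER_SIZE */

struct ring {
    size_t enqueued;    /* monotonically increasing counters ...      */
    size_t dequeued;    /* ... occupancy is enqueued - dequeued       */
    int slots[RING_SIZE];
};

static void
ring_push(struct ring *r, int v)
{
    assert(r->enqueued - r->dequeued < RING_SIZE);  /* not full */
    r->slots[r->enqueued % RING_SIZE] = v;          /* wraps via the mask */
    r->enqueued++;
}

static int
ring_pop(struct ring *r)
{
    assert(r->enqueued != r->dequeued);             /* not empty */
    int v = r->slots[r->dequeued % RING_SIZE];
    r->dequeued++;
    return v;
}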
+
+// Called when we run out of space in the buffer. The object will be added
+// to gc_mark_args.stack instead.
static int
-mark_alive_stack_push(PyObject *op, _PyObjectStack *stack)
+gc_mark_stack_push(_PyObjectStack *ms, PyObject *op)
{
-    if (op == NULL) {
-        return 0;
+    if (_PyObjectStack_Push(ms, op) < 0) {
+        return -1;
    }
-    if (!_PyObject_GC_IS_TRACKED(op)) {
+    return 0;
+}
+
+// Called when there is space in the buffer for the object. Add it to the end
+// of the buffer and issue the prefetch instruction.
+static inline void
+gc_mark_buffer_push(PyObject *op, struct gc_mark_args *args)
+{
+#if Py_DEBUG
+    Py_ssize_t buf_used = args->enqueued - args->dequeued;
+    assert(buf_used < BUFFER_SIZE);
+#endif
+    args->buffer[args->enqueued % BUFFER_SIZE] = op;
+    args->enqueued++;
+    prefetch(op);
+}
+
+// Called when we find an object that needs to be marked alive (either from a
+// root or from calling tp_traverse).
+static int
+gc_mark_enqueue(PyObject *op, struct gc_mark_args *args)
+{
+    assert(op != NULL);
+    if (args->enqueued - args->dequeued < BUFFER_SIZE) {
+        gc_mark_buffer_push(op, args);
        return 0;
    }
-    if (gc_is_alive(op)) {
-        return 0;  // already visited this object
-    }
-    if (gc_maybe_untrack(op)) {
-        return 0;  // was untracked, don't visit it
-    }
-
-    // Need to call tp_traverse on this object. Add to stack and mark it
-    // alive so we don't traverse it a second time.
-    gc_set_alive(op);
-    if (_PyObjectStack_Push(stack, op) < 0) {
-        return -1;
+    else {
+        return gc_mark_stack_push(&args->stack, op);
    }
-    return 0;
}

static bool
@@ -503,36 +552,68 @@ gc_clear_alive_bits(const mi_heap_t *heap, const mi_heap_area_t *area,
    return true;
}

+static int
+gc_mark_traverse_list(PyObject *self, void *args)
+{
+    PyListObject *list = (PyListObject *)self;
+    if (list->ob_item == NULL) {
+        return 0;
+    }
+    for (Py_ssize_t i = 0; i < Py_SIZE(list); i++) {
+        if (gc_mark_enqueue(list->ob_item[i], args) < 0) {
+            return -1;
+        }
+    }
+    return 0;
+}
+
+static int
+gc_mark_traverse_tuple(PyObject *self, void *args)
+{
+    _PyTuple_MaybeUntrack(self);
+    if (!gc_has_bit(self, _PyGC_BITS_TRACKED)) {
+        return 0;
+    }
+    PyTupleObject *tuple = _PyTuple_CAST(self);
+    for (Py_ssize_t i = Py_SIZE(tuple); --i >= 0; ) {
+        PyObject *item = tuple->ob_item[i];
+        if (item == NULL) {
+            continue;
+        }
+        if (gc_mark_enqueue(tuple->ob_item[i], args) < 0) {
+            return -1;
+        }
+    }
+    return 0;
+}
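These two helpers are fast paths selected by comparing `tp_traverse` function pointers; every other type goes through the generic `visitproc` protocol, in which `tp_traverse` calls a caller-supplied callback once per directly referenced object (this is how `gc_mark_enqueue` is used further down). A small sketch of that protocol with an illustrative callback that merely counts direct references:

#include <Python.h>

/* Illustrative visitproc callback: counts the objects a container refers to. */
static int
count_visit(PyObject *referent, void *arg)
{
    (void)referent;
    (*(Py_ssize_t *)arg)++;
    return 0;               /* a non-zero return would abort the traversal */
}

/* Count the direct references of a GC-supporting object via tp_traverse. */
static Py_ssize_t
count_references(PyObject *op)
{
    Py_ssize_t n = 0;
    traverseproc traverse = Py_TYPE(op)->tp_traverse;
    if (traverse != NULL) {
        traverse(op, count_visit, &n);
    }
    return n;
}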
+
static void
gc_abort_mark_alive(PyInterpreterState *interp,
                    struct collection_state *state,
-                    _PyObjectStack *stack)
+                    struct gc_mark_args *args)
{
    // We failed to allocate memory for "stack" while doing the "mark
    // alive" phase. In that case, free the object stack and make sure
    // that no objects have the alive bit set.
-    _PyObjectStack_Clear(stack);
+    _PyObjectStack_Clear(&args->stack);
    gc_visit_heaps(interp, &gc_clear_alive_bits, &state->base);
}

#ifdef GC_MARK_ALIVE_STACKS
static int
-gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref)
+gc_visit_stackref_mark_alive(struct gc_mark_args *args, _PyStackRef stackref)
{
-    // Note: we MUST check that it is deferred before checking the rest.
-    // Otherwise we might read into invalid memory due to non-deferred references
-    // being dead already.
-    if (PyStackRef_IsDeferred(stackref) && !PyStackRef_IsNull(stackref)) {
+    if (!PyStackRef_IsNull(stackref)) {
        PyObject *op = PyStackRef_AsPyObjectBorrow(stackref);
-        if (mark_alive_stack_push(op, stack) < 0) {
+        if (gc_mark_enqueue(op, args) < 0) {
            return -1;
        }
    }
    return 0;
}

static int
-gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *stack)
+gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, struct gc_mark_args *args)
{
    _Py_FOR_EACH_TSTATE_BEGIN(interp, p) {
        for (_PyInterpreterFrame *f = p->current_frame; f != NULL; f = f->previous) {
@@ -542,12 +623,12 @@ gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *st
            }

            PyCodeObject *co = (PyCodeObject *)executable;
-            int max_stack = co->co_nlocalsplus + co->co_stacksize;
-            if (gc_visit_stackref_mark_alive(stack, f->f_executable) < 0) {
+            int max_stack = co->co_nlocals;
+            if (gc_visit_stackref_mark_alive(args, f->f_executable) < 0) {
                return -1;
            }
            for (int i = 0; i < max_stack; i++) {
-                if (gc_visit_stackref_mark_alive(stack, f->localsplus[i]) < 0) {
+                if (gc_visit_stackref_mark_alive(args, f->localsplus[i]) < 0) {
                    return -1;
                }
            }
@@ -880,22 +961,73 @@ static int
move_legacy_finalizer_reachable(struct collection_state *state);

#ifdef GC_ENABLE_MARK_ALIVE
-static int
-propagate_alive_bits(_PyObjectStack *stack)
+
+static void
+gc_mark_buffer_prime(struct gc_mark_args *args)
{
    for (;;) {
-        PyObject *op = _PyObjectStack_Pop(stack);
+        Py_ssize_t buf_used = args->enqueued - args->dequeued;
+        if (buf_used >= BUFFER_HI) {
+            // When priming, don't fill the buffer since that would
+            // likely cause the stack to be used shortly after when it
+            // fills. We want to use the buffer as much as possible and
+            // so we only fill to BUFFER_HI, not BUFFER_SIZE.
+            return;
+        }
+        PyObject *op = _PyObjectStack_Pop(&args->stack);
        if (op == NULL) {
            break;
        }
-        assert(_PyObject_GC_IS_TRACKED(op));
-        assert(gc_is_alive(op));
+        gc_mark_buffer_push(op, args);
+    }
+}
+
+static int
+gc_propagate_alive(struct gc_mark_args *args)
+{
+    for (;;) {
+        Py_ssize_t buf_used = args->enqueued - args->dequeued;
+        if (buf_used <= BUFFER_LO) {
+            // The mark buffer is getting empty. If it's too empty
+            // then there will not be enough delay between issuing
+            // the prefetch vs when the object is actually accessed.
+            // Prime the buffer with object pointers from the stack,
+            // if there are any available.
+            gc_mark_buffer_prime(args);
+            if (args->enqueued == args->dequeued) {
+                return 0;  // stack and buffer are both empty
+            }
+        }
+        PyObject *op = args->buffer[args->dequeued % BUFFER_SIZE];
+        args->dequeued++;
+
+        if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) {
+            continue;
+        }
+
+        if (gc_is_alive(op)) {
+            continue;  // already visited this object
+        }
+
+        // Need to call tp_traverse on this object. Mark it alive so we
+        // don't traverse it a second time.
+        gc_set_alive(op);
+
        traverseproc traverse = Py_TYPE(op)->tp_traverse;
-        if (traverse(op, (visitproc)&mark_alive_stack_push, stack) < 0) {
+        if (traverse == PyList_Type.tp_traverse) {
+            if (gc_mark_traverse_list(op, args) < 0) {
+                return -1;
+            }
+        }
+        else if (traverse == PyTuple_Type.tp_traverse) {
+            if (gc_mark_traverse_tuple(op, args) < 0) {
+                return -1;
+            }
+        }
+        else if (traverse(op, (visitproc)&gc_mark_enqueue, args) < 0) {
            return -1;
        }
    }
-    return 0;
}
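`BUFFER_LO` and `BUFFER_HI` act as a simple hysteresis: the drain loop consumes from the buffer until occupancy falls to the low watermark, then `gc_mark_buffer_prime` refills it from the overflow stack up to the high watermark, so prefetched pointers have some time in flight before they are dereferenced. A stripped-down, standalone sketch of that control flow, with integers standing in for object pointers and an array standing in for `_PyObjectStack`; all names here are illustrative:

#include <stddef.h>

#define RING_SIZE 256   /* mirrors BUFFER_SIZE */
#define HI 16           /* mirrors BUFFER_HI   */
#define LO 8            /* mirrors BUFFER_LO   */

struct work {
    size_t enqueued, dequeued;   /* occupancy == enqueued - dequeued */
    int ring[RING_SIZE];
    int overflow[1024];          /* stands in for the overflow stack */
    size_t overflow_len;
};

/* Refill the ring from the overflow list, but only up to HI entries, so
 * later discoveries still land in the ring rather than on the overflow. */
static void
prime(struct work *w)
{
    while (w->enqueued - w->dequeued < HI && w->overflow_len > 0) {
        w->ring[w->enqueued % RING_SIZE] = w->overflow[--w->overflow_len];
        w->enqueued++;
    }
}

/* Drain everything; whenever occupancy drops to LO or below, refill. */
static long
drain(struct work *w)
{
    long total = 0;
    for (;;) {
        if (w->enqueued - w->dequeued <= LO) {
            prime(w);
            if (w->enqueued == w->dequeued) {
                return total;    /* ring and overflow both empty */
            }
        }
        int item = w->ring[w->dequeued % RING_SIZE];
        w->dequeued++;
        total += item;           /* the real code marks and traverses here */
    }
}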

// Using tp_traverse, mark everything reachable from known root objects
@@ -915,48 +1047,52 @@ propagate_alive_bits(_PyObjectStack *stack)
//
// Returns -1 on failure (out of memory).
static int
-mark_alive_from_roots(PyInterpreterState *interp,
-                      struct collection_state *state)
+gc_mark_alive_from_roots(PyInterpreterState *interp,
+                         struct collection_state *state)
{
#ifdef GC_DEBUG
    // Check that all objects don't have alive bit set
    gc_visit_heaps(interp, &validate_alive_bits, &state->base);
#endif
-    _PyObjectStack stack = { NULL };
-
-#define STACK_PUSH(op) \
-    if (mark_alive_stack_push(op, &stack) < 0) { \
-        gc_abort_mark_alive(interp, state, &stack); \
-        return -1; \
+    struct gc_mark_args mark_args = { 0 };
+
+#define MARK_ENQUEUE(op) \
+    if (op != NULL) { \
+        if (gc_mark_enqueue(op, &mark_args) < 0) { \
+            gc_abort_mark_alive(interp, state, &mark_args); \
+            return -1; \
+        } \
    }
-    STACK_PUSH(interp->sysdict);
+    MARK_ENQUEUE(interp->sysdict);
#ifdef GC_MARK_ALIVE_EXTRA_ROOTS
-    STACK_PUSH(interp->builtins);
-    STACK_PUSH(interp->dict);
+    MARK_ENQUEUE(interp->builtins);
+    MARK_ENQUEUE(interp->dict);
    struct types_state *types = &interp->types;
    for (int i = 0; i < _Py_MAX_MANAGED_STATIC_BUILTIN_TYPES; i++) {
-        STACK_PUSH(types->builtins.initialized[i].tp_dict);
-        STACK_PUSH(types->builtins.initialized[i].tp_subclasses);
+        MARK_ENQUEUE(types->builtins.initialized[i].tp_dict);
+        MARK_ENQUEUE(types->builtins.initialized[i].tp_subclasses);
    }
    for (int i = 0; i < _Py_MAX_MANAGED_STATIC_EXT_TYPES; i++) {
-        STACK_PUSH(types->for_extensions.initialized[i].tp_dict);
-        STACK_PUSH(types->for_extensions.initialized[i].tp_subclasses);
+        MARK_ENQUEUE(types->for_extensions.initialized[i].tp_dict);
+        MARK_ENQUEUE(types->for_extensions.initialized[i].tp_subclasses);
    }
#endif
#ifdef GC_MARK_ALIVE_STACKS
-    if (gc_visit_thread_stacks_mark_alive(interp, &stack) < 0) {
-        gc_abort_mark_alive(interp, state, &stack);
+    if (gc_visit_thread_stacks_mark_alive(interp, &mark_args) < 0) {
+        gc_abort_mark_alive(interp, state, &mark_args);
        return -1;
    }
#endif
-#undef STACK_PUSH
+#undef MARK_ENQUEUE

    // Use tp_traverse to find everything reachable from roots.
-    if (propagate_alive_bits(&stack) < 0) {
-        gc_abort_mark_alive(interp, state, &stack);
+    if (gc_propagate_alive(&mark_args) < 0) {
+        gc_abort_mark_alive(interp, state, &mark_args);
        return -1;
    }

+    assert(mark_args.stack.head == NULL);
+
    return 0;
}
#endif // GC_ENABLE_MARK_ALIVE
@@ -1531,7 +1667,7 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state,
    if (!state->gcstate->freeze_active) {
        // Mark objects reachable from known roots as "alive". These will
        // be ignored for rest of the GC pass.
-        int err = mark_alive_from_roots(interp, state);
+        int err = gc_mark_alive_from_roots(interp, state);
        if (err < 0) {
            _PyEval_StartTheWorld(interp);
            PyErr_NoMemory();