Skip to content

Commit 15ada78

Browse files
committed
Improve "prefetch" instruction macros.
1 parent e12dd2e commit 15ada78

File tree

1 file changed

+42
-11
lines changed

1 file changed

+42
-11
lines changed

Python/gc_free_threading.c

Lines changed: 42 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -489,25 +489,56 @@ gc_maybe_untrack(PyObject *op)
489489
#define BUFFER_HI 16
490490
#define BUFFER_LO 8
491491

492+
// Prefetch instructions will fetch the line of data from memory that
493+
// contains the byte specified with the source operand to a location in
494+
// the cache hierarchy specified by a locality hint. The instruction
495+
// is only a hint and the CPU is free to ignore it. Instructions and
496+
// behaviour are CPU specific but the definitions of locality hints
497+
// below are mostly consistent.
498+
//
499+
// * T0 (temporal data) prefetch data into all levels of the cache hierarchy.
500+
//
501+
// * T1 (temporal data with respect to first level cache) prefetch data into
502+
// level 2 cache and higher.
503+
//
504+
// * T2 (temporal data with respect to second level cache) prefetch data into
505+
// level 3 cache and higher, or an implementation-specific choice.
506+
//
507+
// * NTA (non-temporal data with respect to all cache levels) prefetch data into
508+
// a non-temporal cache structure and into a location close to the processor,
509+
// minimizing cache pollution.
510+
492511
#if defined(__GNUC__) || defined(__clang__)
493-
#define PREFETCH_L1(ptr) __builtin_prefetch(ptr, 0, 3)
494-
#define PREFETCH_L2(ptr) __builtin_prefetch(ptr, 0, 2)
512+
#define PREFETCH_T0(ptr) __builtin_prefetch(ptr, 0, 3)
513+
#define PREFETCH_T1(ptr) __builtin_prefetch(ptr, 0, 2)
514+
#define PREFETCH_T2(ptr) __builtin_prefetch(ptr, 0, 1)
515+
#define PREFETCH_NTA(ptr) __builtin_prefetch(ptr, 0, 0)
495516
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) && !defined(_M_ARM64EC)
496517
#include <mmintrin.h>
497-
#define PREFETCH_L1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
498-
#define PREFETCH_L2(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
499-
#elif defined(__aarch64__)
500-
#define PREFETCH_L1(ptr) do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0)
501-
#define PREFETCH_L2(ptr) do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0)
518+
#define PREFETCH_T0(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
519+
#define PREFETCH_T1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
520+
#define PREFETCH_T2(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T2)
521+
#define PREFETCH_NTA(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_NTA)
522+
#elif defined (__aarch64__)
523+
#define PREFETCH_T0(ptr) \
524+
do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0)
525+
#define PREFETCH_T1(ptr) \
526+
do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0)
527+
#define PREFETCH_T2(ptr) \
528+
do { __asm__ __volatile__("prfm pldl3keep, %0" ::"Q"(*(ptr))); } while (0)
529+
#define PREFETCH_NTA(ptr) \
530+
do { __asm__ __volatile__("prfm pldl1strm, %0" ::"Q"(*(ptr))); } while (0)
502531
#else
503-
#define PREFETCH_L1(ptr) do { (void)(ptr); } while (0) /* disabled */
504-
#define PREFETCH_L2(ptr) do { (void)(ptr); } while (0) /* disabled */
532+
#define PREFETCH_T0(ptr) do { (void)(ptr); } while (0) /* disabled */
533+
#define PREFETCH_T1(ptr) do { (void)(ptr); } while (0) /* disabled */
534+
#define PREFETCH_T2(ptr) do { (void)(ptr); } while (0) /* disabled */
535+
#define PREFETCH_NTA(ptr) do { (void)(ptr); } while (0) /* disabled */
505536
#endif
506537

507538
#ifdef GC_ENABLE_PREFETCH_INSTRUCTIONS
508-
#define prefetch(ptr) PREFETCH_L1(ptr)
539+
#define prefetch(ptr) PREFETCH_T1(ptr)
509540
#else
510-
#define prefetch(ptr)
541+
#define prefetch(ptr)
511542
#endif
512543

513544
// a contiguous sequence of PyObject pointers, can contain NULLs

0 commit comments

Comments
 (0)