From 2901caf1942d1898a93e9946a78813cf9544af2e Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Fri, 19 Apr 2024 13:12:34 -0700 Subject: [PATCH 01/18] Replace stencils with dedicated writer functions --- Python/jit.c | 493 +++++++++++++++++++++---------------------- Tools/jit/_writer.py | 160 +++++++++----- 2 files changed, 347 insertions(+), 306 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index df14e48c564447..3e5b3ae79d3a69 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -16,8 +16,6 @@ #include "pycore_sliceobject.h" #include "pycore_jit.h" -#include "jit_stencils.h" - // Memory management stuff: //////////////////////////////////////////////////// #ifndef MS_WINDOWS @@ -146,256 +144,257 @@ set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start, #define IS_AARCH64_LDR_OR_STR(I) (((I) & 0x3B000000) == 0x39000000) #define IS_AARCH64_MOV(I) (((I) & 0x9F800000) == 0x92800000) -// Fill all of stencil's holes in the memory pointed to by base, using the -// values in patches. -static void -patch(unsigned char *base, const Stencil *stencil, uintptr_t patches[]) +// LLD is a great reference for performing relocations... just keep in +// mind that Tools/jit/build.py does filtering and preprocessing for us! +// Here's a good place to start for each platform: +// - aarch64-apple-darwin: +// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64.cpp +// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.cpp +// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.h +// - aarch64-pc-windows-msvc: +// - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp +// - aarch64-unknown-linux-gnu: +// - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/AArch64.cpp +// - i686-pc-windows-msvc: +// - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp +// - x86_64-apple-darwin: +// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/X86_64.cpp +// - x86_64-pc-windows-msvc: +// - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp +// - x86_64-unknown-linux-gnu: +// - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/X86_64.cpp + +// 32-bit absolute address. +static inline void +patch_32(unsigned char *location, uintptr_t value) { - for (size_t i = 0; i < stencil->holes_size; i++) { - const Hole *hole = &stencil->holes[i]; - unsigned char *location = base + hole->offset; - uint64_t value = patches[hole->value] + (uintptr_t)hole->symbol + hole->addend; - uint8_t *loc8 = (uint8_t *)location; - uint32_t *loc32 = (uint32_t *)location; - uint64_t *loc64 = (uint64_t *)location; - // LLD is a great reference for performing relocations... just keep in - // mind that Tools/jit/build.py does filtering and preprocessing for us! 
- // Here's a good place to start for each platform: - // - aarch64-apple-darwin: - // - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64.cpp - // - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.cpp - // - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.h - // - aarch64-pc-windows-msvc: - // - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp - // - aarch64-unknown-linux-gnu: - // - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/AArch64.cpp - // - i686-pc-windows-msvc: - // - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp - // - x86_64-apple-darwin: - // - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/X86_64.cpp - // - x86_64-pc-windows-msvc: - // - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp - // - x86_64-unknown-linux-gnu: - // - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/X86_64.cpp - switch (hole->kind) { - case HoleKind_IMAGE_REL_I386_DIR32: - // 32-bit absolute address. - // Check that we're not out of range of 32 unsigned bits: - assert(value < (1ULL << 32)); - *loc32 = (uint32_t)value; - continue; - case HoleKind_ARM64_RELOC_UNSIGNED: - case HoleKind_R_AARCH64_ABS64: - case HoleKind_X86_64_RELOC_UNSIGNED: - case HoleKind_R_X86_64_64: - // 64-bit absolute address. - *loc64 = value; - continue; - case HoleKind_IMAGE_REL_AMD64_REL32: - case HoleKind_IMAGE_REL_I386_REL32: - case HoleKind_R_X86_64_GOTPCRELX: - case HoleKind_R_X86_64_REX_GOTPCRELX: - case HoleKind_X86_64_RELOC_GOT: - case HoleKind_X86_64_RELOC_GOT_LOAD: { - // 32-bit relative address. - // Try to relax the GOT load into an immediate value: - uint64_t relaxed = *(uint64_t *)(value + 4) - 4; - if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) && - (int64_t)relaxed - (int64_t)location + 1 < (1LL << 31)) - { - if (loc8[-2] == 0x8B) { - // mov reg, dword ptr [rip + AAA] -> lea reg, [rip + XXX] - loc8[-2] = 0x8D; - value = relaxed; - } - else if (loc8[-2] == 0xFF && loc8[-1] == 0x15) { - // call qword ptr [rip + AAA] -> nop; call XXX - loc8[-2] = 0x90; - loc8[-1] = 0xE8; - value = relaxed; - } - else if (loc8[-2] == 0xFF && loc8[-1] == 0x25) { - // jmp qword ptr [rip + AAA] -> nop; jmp XXX - loc8[-2] = 0x90; - loc8[-1] = 0xE9; - value = relaxed; - } - } - } - // Fall through... - case HoleKind_R_X86_64_GOTPCREL: - case HoleKind_R_X86_64_PC32: - case HoleKind_X86_64_RELOC_SIGNED: - case HoleKind_X86_64_RELOC_BRANCH: - // 32-bit relative address. - value -= (uintptr_t)location; - // Check that we're not out of range of 32 signed bits: - assert((int64_t)value >= -(1LL << 31)); - assert((int64_t)value < (1LL << 31)); - *loc32 = (uint32_t)value; - continue; - case HoleKind_ARM64_RELOC_BRANCH26: - case HoleKind_IMAGE_REL_ARM64_BRANCH26: - case HoleKind_R_AARCH64_CALL26: - case HoleKind_R_AARCH64_JUMP26: - // 28-bit relative branch. - assert(IS_AARCH64_BRANCH(*loc32)); - value -= (uintptr_t)location; - // Check that we're not out of range of 28 signed bits: - assert((int64_t)value >= -(1 << 27)); - assert((int64_t)value < (1 << 27)); - // Since instructions are 4-byte aligned, only use 26 bits: - assert(get_bits(value, 0, 2) == 0); - set_bits(loc32, 0, value, 2, 26); - continue; - case HoleKind_R_AARCH64_MOVW_UABS_G0_NC: - // 16-bit low part of an absolute address. 
- assert(IS_AARCH64_MOV(*loc32)); - // Check the implicit shift (this is "part 0 of 3"): - assert(get_bits(*loc32, 21, 2) == 0); - set_bits(loc32, 5, value, 0, 16); - continue; - case HoleKind_R_AARCH64_MOVW_UABS_G1_NC: - // 16-bit middle-low part of an absolute address. - assert(IS_AARCH64_MOV(*loc32)); - // Check the implicit shift (this is "part 1 of 3"): - assert(get_bits(*loc32, 21, 2) == 1); - set_bits(loc32, 5, value, 16, 16); - continue; - case HoleKind_R_AARCH64_MOVW_UABS_G2_NC: - // 16-bit middle-high part of an absolute address. - assert(IS_AARCH64_MOV(*loc32)); - // Check the implicit shift (this is "part 2 of 3"): - assert(get_bits(*loc32, 21, 2) == 2); - set_bits(loc32, 5, value, 32, 16); - continue; - case HoleKind_R_AARCH64_MOVW_UABS_G3: - // 16-bit high part of an absolute address. - assert(IS_AARCH64_MOV(*loc32)); - // Check the implicit shift (this is "part 3 of 3"): - assert(get_bits(*loc32, 21, 2) == 3); - set_bits(loc32, 5, value, 48, 16); - continue; - case HoleKind_ARM64_RELOC_GOT_LOAD_PAGE21: - case HoleKind_IMAGE_REL_ARM64_PAGEBASE_REL21: - case HoleKind_R_AARCH64_ADR_GOT_PAGE: - case HoleKind_R_AARCH64_ADR_PREL_PG_HI21: - // 21-bit count of pages between this page and an absolute address's - // page... I know, I know, it's weird. Pairs nicely with - // ARM64_RELOC_GOT_LOAD_PAGEOFF12 (below). - assert(IS_AARCH64_ADRP(*loc32)); - // Try to relax the pair of GOT loads into an immediate value: - const Hole *next_hole = &stencil->holes[i + 1]; - if (i + 1 < stencil->holes_size && - (next_hole->kind == HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12 || - next_hole->kind == HoleKind_IMAGE_REL_ARM64_PAGEOFFSET_12L || - next_hole->kind == HoleKind_R_AARCH64_LD64_GOT_LO12_NC) && - next_hole->offset == hole->offset + 4 && - next_hole->symbol == hole->symbol && - next_hole->addend == hole->addend && - next_hole->value == hole->value) - { - unsigned char reg = get_bits(loc32[0], 0, 5); - assert(IS_AARCH64_LDR_OR_STR(loc32[1])); - // There should be only one register involved: - assert(reg == get_bits(loc32[1], 0, 5)); // ldr's output register. - assert(reg == get_bits(loc32[1], 5, 5)); // ldr's input register. - uint64_t relaxed = *(uint64_t *)value; - if (relaxed < (1UL << 16)) { - // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop - loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg; - loc32[1] = 0xD503201F; - i++; - continue; - } - if (relaxed < (1ULL << 32)) { - // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY - loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg; - loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | reg; - i++; - continue; - } - relaxed = value - (uintptr_t)location; - if ((relaxed & 0x3) == 0 && - (int64_t)relaxed >= -(1L << 19) && - (int64_t)relaxed < (1L << 19)) - { - // adrp reg, AAA; ldr reg, [reg + BBB] -> ldr reg, XXX; nop - loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | reg; - loc32[1] = 0xD503201F; - i++; - continue; - } - } - // Fall through... 
- case HoleKind_ARM64_RELOC_PAGE21: - // Number of pages between this page and the value's page: - value = (value >> 12) - ((uintptr_t)location >> 12); - // Check that we're not out of range of 21 signed bits: - assert((int64_t)value >= -(1 << 20)); - assert((int64_t)value < (1 << 20)); - // value[0:2] goes in loc[29:31]: - set_bits(loc32, 29, value, 0, 2); - // value[2:21] goes in loc[5:26]: - set_bits(loc32, 5, value, 2, 19); - continue; - case HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12: - case HoleKind_ARM64_RELOC_PAGEOFF12: - case HoleKind_IMAGE_REL_ARM64_PAGEOFFSET_12A: - case HoleKind_IMAGE_REL_ARM64_PAGEOFFSET_12L: - case HoleKind_R_AARCH64_ADD_ABS_LO12_NC: - case HoleKind_R_AARCH64_LD64_GOT_LO12_NC: - // 12-bit low part of an absolute address. Pairs nicely with - // ARM64_RELOC_GOT_LOAD_PAGE21 (above). - assert(IS_AARCH64_LDR_OR_STR(*loc32) || IS_AARCH64_ADD_OR_SUB(*loc32)); - // There might be an implicit shift encoded in the instruction: - uint8_t shift = 0; - if (IS_AARCH64_LDR_OR_STR(*loc32)) { - shift = (uint8_t)get_bits(*loc32, 30, 2); - // If both of these are set, the shift is supposed to be 4. - // That's pretty weird, and it's never actually been observed... - assert(get_bits(*loc32, 23, 1) == 0 || get_bits(*loc32, 26, 1) == 0); - } - value = get_bits(value, 0, 12); - assert(get_bits(value, 0, shift) == 0); - set_bits(loc32, 10, value, shift, 12); - continue; - } - Py_UNREACHABLE(); + uint32_t *loc32 = (uint32_t *)location; + // Check that we're not out of range of 32 unsigned bits: + assert(value < (1ULL << 32)); + *loc32 = (uint32_t)value; +} + +// 32-bit relative address. +static inline void +patch_32r(unsigned char *location, uintptr_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + value -= (uintptr_t)location; + // Check that we're not out of range of 32 signed bits: + assert((int64_t)value >= -(1LL << 31)); + assert((int64_t)value < (1LL << 31)); + *loc32 = (uint32_t)value; +} + +// 64-bit absolute address. +static inline void +patch_64(unsigned char *location, uintptr_t value) +{ + uint64_t *loc64 = (uint64_t *)location; + *loc64 = value; +} + +// 12-bit low part of an absolute address. Pairs nicely with patch_aarch64_21 +// (below). +static inline void +patch_aarch64_12(unsigned char *location, uintptr_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + assert(IS_AARCH64_LDR_OR_STR(*loc32) || IS_AARCH64_ADD_OR_SUB(*loc32)); + // There might be an implicit shift encoded in the instruction: + uint8_t shift = 0; + if (IS_AARCH64_LDR_OR_STR(*loc32)) { + shift = (uint8_t)get_bits(*loc32, 30, 2); + // If both of these are set, the shift is supposed to be 4. + // That's pretty weird, and it's never actually been observed... + assert(get_bits(*loc32, 23, 1) == 0 || get_bits(*loc32, 26, 1) == 0); } + value = get_bits(value, 0, 12); + assert(get_bits(value, 0, shift) == 0); + set_bits(loc32, 10, value, shift, 12); } -static void -copy_and_patch(unsigned char *base, const Stencil *stencil, uintptr_t patches[]) +// 16-bit low part of an absolute address. +static inline void +patch_aarch64_16a(unsigned char *location, uintptr_t value) { - memcpy(base, stencil->body, stencil->body_size); - patch(base, stencil, patches); + uint32_t *loc32 = (uint32_t *)location; + assert(IS_AARCH64_MOV(*loc32)); + // Check the implicit shift (this is "part 0 of 3"): + assert(get_bits(*loc32, 21, 2) == 0); + set_bits(loc32, 5, value, 0, 16); } -static void -emit(const StencilGroup *group, uintptr_t patches[]) +// 16-bit middle-low part of an absolute address. 
+static inline void +patch_aarch64_16b(unsigned char *location, uintptr_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + assert(IS_AARCH64_MOV(*loc32)); + // Check the implicit shift (this is "part 1 of 3"): + assert(get_bits(*loc32, 21, 2) == 1); + set_bits(loc32, 5, value, 16, 16); +} + +// 16-bit middle-high part of an absolute address. +static inline void +patch_aarch64_16c(unsigned char *location, uintptr_t value) { - copy_and_patch((unsigned char *)patches[HoleValue_DATA], &group->data, patches); - copy_and_patch((unsigned char *)patches[HoleValue_CODE], &group->code, patches); + uint32_t *loc32 = (uint32_t *)location; + assert(IS_AARCH64_MOV(*loc32)); + // Check the implicit shift (this is "part 2 of 3"): + assert(get_bits(*loc32, 21, 2) == 2); + set_bits(loc32, 5, value, 32, 16); } +// 16-bit high part of an absolute address. +static inline void +patch_aarch64_16d(unsigned char *location, uintptr_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + assert(IS_AARCH64_MOV(*loc32)); + // Check the implicit shift (this is "part 3 of 3"): + assert(get_bits(*loc32, 21, 2) == 3); + set_bits(loc32, 5, value, 48, 16); +} + +// 21-bit count of pages between this page and an absolute address's page... I +// know, I know, it's weird. Pairs nicely with patch_aarch64_12 (above). +static inline void +patch_aarch64_21(unsigned char *location, uintptr_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + value = (value >> 12) - ((uintptr_t)location >> 12); + // Check that we're not out of range of 21 signed bits: + assert((int64_t)value >= -(1 << 20)); + assert((int64_t)value < (1 << 20)); + // value[0:2] goes in loc[29:31]: + set_bits(loc32, 29, value, 0, 2); + // value[2:21] goes in loc[5:26]: + set_bits(loc32, 5, value, 2, 19); +} + +static inline void +patch_aarch64_21x(unsigned char *location, uintptr_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + assert(IS_AARCH64_ADRP(*loc32)); + // // Try to relax the pair of GOT loads into an immediate value: + // const Hole *next_hole = &stencil->holes[i + 1]; + // if (i + 1 < stencil->holes_size && + // (next_hole->kind == HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12 || + // next_hole->kind == HoleKind_IMAGE_REL_ARM64_PAGEOFFSET_12L || + // next_hole->kind == HoleKind_R_AARCH64_LD64_GOT_LO12_NC) && + // next_hole->offset == hole->offset + 4 && + // next_hole->symbol == hole->symbol && + // next_hole->addend == hole->addend && + // next_hole->value == hole->value) + // { + // unsigned char reg = get_bits(loc32[0], 0, 5); + // assert(IS_AARCH64_LDR_OR_STR(loc32[1])); + // // There should be only one register involved: + // assert(reg == get_bits(loc32[1], 0, 5)); // ldr's output register. + // assert(reg == get_bits(loc32[1], 5, 5)); // ldr's input register. 
+ // uint64_t relaxed = *(uint64_t *)value; + // if (relaxed < (1UL << 16)) { + // // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop + // loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg; + // loc32[1] = 0xD503201F; + // i++; + // continue; + // } + // if (relaxed < (1ULL << 32)) { + // // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY + // loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg; + // loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | reg; + // i++; + // continue; + // } + // relaxed = value - (uintptr_t)location; + // if ((relaxed & 0x3) == 0 && + // (int64_t)relaxed >= -(1L << 19) && + // (int64_t)relaxed < (1L << 19)) + // { + // // adrp reg, AAA; ldr reg, [reg + BBB] -> ldr reg, XXX; nop + // loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | reg; + // loc32[1] = 0xD503201F; + // i++; + // continue; + // } + // } + patch_aarch64_21(location, value); +} + +// 28-bit relative branch. +static inline void +patch_aarch64_26(unsigned char *location, uintptr_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + assert(IS_AARCH64_BRANCH(*loc32)); + value -= (uintptr_t)location; + // Check that we're not out of range of 28 signed bits: + assert((int64_t)value >= -(1 << 27)); + assert((int64_t)value < (1 << 27)); + // Since instructions are 4-byte aligned, only use 26 bits: + assert(get_bits(value, 0, 2) == 0); + set_bits(loc32, 0, value, 2, 26); +} + +// 32-bit relative address. +static inline void +patch_x86_64_32x(unsigned char *location, uintptr_t value) +{ + uint8_t *loc8 = (uint8_t *)location; + uint32_t *loc32 = (uint32_t *)location; + // Try to relax the GOT load into an immediate value: + uint64_t relaxed = *(uint64_t *)(value + 4) - 4; + if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) && + (int64_t)relaxed - (int64_t)location + 1 < (1LL << 31)) + { + if (loc8[-2] == 0x8B) { + // mov reg, dword ptr [rip + AAA] -> lea reg, [rip + XXX] + loc8[-2] = 0x8D; + value = relaxed; + } + else if (loc8[-2] == 0xFF && loc8[-1] == 0x15) { + // call qword ptr [rip + AAA] -> nop; call XXX + loc8[-2] = 0x90; + loc8[-1] = 0xE8; + value = relaxed; + } + else if (loc8[-2] == 0xFF && loc8[-1] == 0x25) { + // jmp qword ptr [rip + AAA] -> nop; jmp XXX + loc8[-2] = 0x90; + loc8[-1] = 0xE9; + value = relaxed; + } + } + // XXX: Dup of patch_R_X86_64_GOTPCREL: + value -= (uintptr_t)location; + // Check that we're not out of range of 32 signed bits: + assert((int64_t)value >= -(1LL << 31)); + assert((int64_t)value < (1LL << 31)); + *loc32 = (uint32_t)value; +} + +#include "jit_stencils.h" + // Compiles executor in-place. Don't forget to call _PyJIT_Free later! 
int _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size_t length) { // Loop once to find the total compiled size: size_t instruction_starts[UOP_MAX_TRACE_LENGTH]; - size_t code_size = trampoline.code.body_size; - size_t data_size = trampoline.data.body_size; + size_t code_size = emitted_trampoline_code; + size_t data_size = emitted_trampoline_data; for (size_t i = 0; i < length; i++) { _PyUOpInstruction *instruction = (_PyUOpInstruction *)&trace[i]; - const StencilGroup *group = &stencil_groups[instruction->opcode]; instruction_starts[i] = code_size; - code_size += group->code.body_size; - data_size += group->data.body_size; + code_size += emitted[instruction->opcode][0]; + data_size += emitted[instruction->opcode][1]; } - code_size += stencil_groups[_FATAL_ERROR].code.body_size; - data_size += stencil_groups[_FATAL_ERROR].data.body_size; + code_size += emitted[_FATAL_ERROR][0]; + data_size += emitted[_FATAL_ERROR][1]; // Round up to the nearest page: size_t page_size = get_page_size(); assert((page_size & (page_size - 1)) == 0); @@ -414,26 +413,24 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size // (which may be different for efficiency reasons). On platforms where // we don't change calling conventions, the trampoline is empty and // nothing is emitted here: - const StencilGroup *group = &trampoline; // Think of patches as a dictionary mapping HoleValue to uintptr_t: uintptr_t patches[] = GET_PATCHES(); patches[HoleValue_CODE] = (uintptr_t)code; - patches[HoleValue_CONTINUE] = (uintptr_t)code + group->code.body_size; + patches[HoleValue_CONTINUE] = (uintptr_t)code + emitted_trampoline_code; patches[HoleValue_DATA] = (uintptr_t)data; patches[HoleValue_EXECUTOR] = (uintptr_t)executor; - patches[HoleValue_TOP] = (uintptr_t)memory + trampoline.code.body_size; + patches[HoleValue_TOP] = (uintptr_t)memory + emitted_trampoline_code; patches[HoleValue_ZERO] = 0; - emit(group, patches); - code += group->code.body_size; - data += group->data.body_size; + emit_trampoline(patches); + code += emitted_trampoline_code; + data += emitted_trampoline_data; } assert(trace[0].opcode == _START_EXECUTOR || trace[0].opcode == _COLD_EXIT); for (size_t i = 0; i < length; i++) { _PyUOpInstruction *instruction = (_PyUOpInstruction *)&trace[i]; - const StencilGroup *group = &stencil_groups[instruction->opcode]; uintptr_t patches[] = GET_PATCHES(); patches[HoleValue_CODE] = (uintptr_t)code; - patches[HoleValue_CONTINUE] = (uintptr_t)code + group->code.body_size; + patches[HoleValue_CONTINUE] = (uintptr_t)code + emitted[instruction->opcode][0]; patches[HoleValue_DATA] = (uintptr_t)data; patches[HoleValue_EXECUTOR] = (uintptr_t)executor; patches[HoleValue_OPARG] = instruction->oparg; @@ -468,13 +465,13 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size } patches[HoleValue_TOP] = (uintptr_t)memory + instruction_starts[1]; patches[HoleValue_ZERO] = 0; - emit(group, patches); - code += group->code.body_size; - data += group->data.body_size; + // XXX: Args: code, data, executor, instruction, instruction_starts (with memory added) + emitters[instruction->opcode](patches); + code += emitted[instruction->opcode][0]; + data += emitted[instruction->opcode][1]; } { // Protect against accidental buffer overrun into data: - const StencilGroup *group = &stencil_groups[_FATAL_ERROR]; uintptr_t patches[] = GET_PATCHES(); patches[HoleValue_CODE] = (uintptr_t)code; patches[HoleValue_CONTINUE] = (uintptr_t)code; @@ -482,9 +479,9 @@ 
_PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size patches[HoleValue_EXECUTOR] = (uintptr_t)executor; patches[HoleValue_TOP] = (uintptr_t)code; patches[HoleValue_ZERO] = 0; - emit(group, patches); - code += group->code.body_size; - data += group->data.body_size; + emitters[_FATAL_ERROR](patches); + code += emitted[_FATAL_ERROR][0]; + data += emitted[_FATAL_ERROR][1]; } assert(code == memory + code_size); assert(data == memory + code_size + data_size); @@ -493,7 +490,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size return -1; } executor->jit_code = memory; - executor->jit_side_entry = memory + trampoline.code.body_size; + executor->jit_side_entry = memory + emitted_trampoline_code; executor->jit_size = total_size; return 0; } diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index ccd67850c37787..4d73f027294570 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -5,61 +5,110 @@ import _schema import _stencils +_PATCH_REMAP = { + # aarch64-apple-darwin: + "ARM64_RELOC_GOT_LOAD_PAGE21": "aarch64_21x", + "ARM64_RELOC_GOT_LOAD_PAGEOFF12": "aarch64_12", + "ARM64_RELOC_PAGE21": "aarch64_21", + "ARM64_RELOC_PAGEOFF12": "aarch64_12", + "ARM64_RELOC_UNSIGNED": "64", + # x86_64-pc-windows-msvc: + "IMAGE_REL_AMD64_REL32": "x86_64_32x", + # aarch64-pc-windows-msvc: + "IMAGE_REL_ARM64_BRANCH26": "aarch64_26", + "IMAGE_REL_ARM64_PAGEBASE_REL21": "aarch64_21x", + "IMAGE_REL_ARM64_PAGEOFFSET_12A": "aarch64_12", + "IMAGE_REL_ARM64_PAGEOFFSET_12L": "aarch64_12", + # i686-pc-windows-msvc: + "IMAGE_REL_I386_DIR32": "32", + "IMAGE_REL_I386_REL32": "x86_64_32x", # XXX + # aarch64-unknown-linux-gnu: + "R_AARCH64_ABS64": "64", + "R_AARCH64_ADR_GOT_PAGE": "aarch64_21x", + "R_AARCH64_CALL26": "aarch64_26", + "R_AARCH64_JUMP26": "aarch64_26", + "R_AARCH64_LD64_GOT_LO12_NC": "aarch64_12", + "R_AARCH64_MOVW_UABS_G0_NC": "aarch64_16a", + "R_AARCH64_MOVW_UABS_G1_NC": "aarch64_16b", + "R_AARCH64_MOVW_UABS_G2_NC": "aarch64_16c", + "R_AARCH64_MOVW_UABS_G3": "aarch64_16d", + # x86_64-unknown-linux-gnu: + "R_X86_64_64": "64", + "R_X86_64_GOTPCREL": "32r", + "R_X86_64_GOTPCRELX": "x86_64_32x", + "R_X86_64_PC32": "32r", + "R_X86_64_REX_GOTPCRELX": "x86_64_32x", + # x86_64-apple-darwin: + "X86_64_RELOC_BRANCH": "32r", + "X86_64_RELOC_GOT": "x86_64_32x", + "X86_64_RELOC_GOT_LOAD": "x86_64_32x", + "X86_64_RELOC_SIGNED": "32r", + "X86_64_RELOC_UNSIGNED": "64", +} def _dump_header() -> typing.Iterator[str]: - yield "typedef enum {" - for kind in typing.get_args(_schema.HoleKind): - yield f" HoleKind_{kind}," - yield "} HoleKind;" - yield "" + yield "typedef void (*emitter)(uintptr_t patches[]);" + # yield "typedef enum {" + # for kind in typing.get_args(_schema.HoleKind): + # yield f" HoleKind_{kind}," + # yield "} HoleKind;" + # yield "" yield "typedef enum {" for value in _stencils.HoleValue: yield f" HoleValue_{value.name}," yield "} HoleValue;" yield "" - yield "typedef struct {" - yield " const size_t offset;" - yield " const HoleKind kind;" - yield " const HoleValue value;" - yield " const void *symbol;" - yield " const uint64_t addend;" - yield "} Hole;" - yield "" - yield "typedef struct {" - yield " const size_t body_size;" - yield " const unsigned char * const body;" - yield " const size_t holes_size;" - yield " const Hole * const holes;" - yield "} Stencil;" - yield "" - yield "typedef struct {" - yield " const Stencil code;" - yield " const Stencil data;" - yield "} StencilGroup;" - yield "" + # yield "typedef struct {" + # yield " 
const size_t offset;" + # yield " const HoleKind kind;" + # yield " const HoleValue value;" + # yield " const void *symbol;" + # yield " const uint64_t addend;" + # yield "} Hole;" + # yield "" + # yield "typedef struct {" + # yield " const size_t body_size;" + # yield " const unsigned char * const body;" + # yield " const size_t holes_size;" + # yield " const Hole * const holes;" + # yield "} Stencil;" + # yield "" + # yield "typedef struct {" + # yield " const Stencil code;" + # yield " const Stencil data;" + # yield "} StencilGroup;" + # yield "" -def _dump_footer(opnames: typing.Iterable[str]) -> typing.Iterator[str]: - yield "#define INIT_STENCIL(STENCIL) { \\" - yield " .body_size = Py_ARRAY_LENGTH(STENCIL##_body) - 1, \\" - yield " .body = STENCIL##_body, \\" - yield " .holes_size = Py_ARRAY_LENGTH(STENCIL##_holes) - 1, \\" - yield " .holes = STENCIL##_holes, \\" - yield "}" - yield "" - yield "#define INIT_STENCIL_GROUP(OP) { \\" - yield " .code = INIT_STENCIL(OP##_code), \\" - yield " .data = INIT_STENCIL(OP##_data), \\" - yield "}" +def _dump_footer(groups: dict[str, _stencils.StencilGroup]) -> typing.Iterator[str]: + # yield "#define INIT_STENCIL(STENCIL) { \\" + # yield " .body_size = Py_ARRAY_LENGTH(STENCIL##_body) - 1, \\" + # yield " .body = STENCIL##_body, \\" + # yield " .holes_size = Py_ARRAY_LENGTH(STENCIL##_holes) - 1, \\" + # yield " .holes = STENCIL##_holes, \\" + # yield "}" + # yield "" + # yield "#define INIT_STENCIL_GROUP(OP) { \\" + # yield " .code = INIT_STENCIL(OP##_code), \\" + # yield " .data = INIT_STENCIL(OP##_data), \\" + # yield "}" + # yield "" + yield "static const emitter emitters[MAX_UOP_ID + 1] = {" + for opname in sorted(groups): + if opname == "trampoline": + continue + yield f" [{opname}] = emit_{opname}," + yield "};" yield "" - yield "static const StencilGroup stencil_groups[512] = {" - for opname in opnames: + yield "static const size_t emitted[MAX_UOP_ID + 1][2] = {" + for opname, group in sorted(groups.items()): if opname == "trampoline": continue - yield f" [{opname}] = INIT_STENCIL_GROUP({opname})," + yield f" [{opname}] = {{{len(group.code.body)}, {len(group.data.body)}}}," yield "};" yield "" - yield "static const StencilGroup trampoline = INIT_STENCIL_GROUP(trampoline);" + yield f"static const size_t emitted_trampoline_code = {len(groups['trampoline'].code.body)};" + yield f"static const size_t emitted_trampoline_data = {len(groups['trampoline'].data.body)};" yield "" yield "#define GET_PATCHES() { \\" for value in _stencils.HoleValue: @@ -68,27 +117,22 @@ def _dump_footer(opnames: typing.Iterable[str]) -> typing.Iterator[str]: def _dump_stencil(opname: str, group: _stencils.StencilGroup) -> typing.Iterator[str]: - yield f"// {opname}" - for part, stencil in [("code", group.code), ("data", group.data)]: + yield f"void emit_{opname}(uintptr_t patches[]) {{" + yield f" unsigned char *location;" + for part, stencil in [("data", group.data), ("code", group.code)]: for line in stencil.disassembly: - yield f"// {line}" + yield f" // {line}" if stencil.body: - size = len(stencil.body) + 1 - yield f"static const unsigned char {opname}_{part}_body[{size}] = {{" + yield f" const unsigned char {part}[{len(stencil.body)}] = {{" for i in range(0, len(stencil.body), 8): row = " ".join(f"{byte:#04x}," for byte in stencil.body[i : i + 8]) - yield f" {row}" - yield "};" - else: - yield f"static const unsigned char {opname}_{part}_body[1];" - if stencil.holes: - size = len(stencil.holes) + 1 - yield f"static const Hole {opname}_{part}_holes[{size}] = {{" 
- for hole in stencil.holes: - yield f" {hole.as_c()}," - yield "};" - else: - yield f"static const Hole {opname}_{part}_holes[1];" + yield f" {row}" + yield " };" + yield f" location = (unsigned char *)patches[HoleValue_{part.upper()}];" + yield f" memcpy(location, {part}, sizeof({part}));" + for hole in stencil.holes: + yield f" patch_{_PATCH_REMAP[hole.kind]}(location + {hole.offset}, patches[HoleValue_{hole.value.name}]{f' + (uintptr_t)&{hole.symbol}' if hole.symbol else ''}{f' + {_stencils._signed(hole.addend):#x}' if _stencils._signed(hole.addend) else ''});" + yield "}" yield "" From f30fa640dc56183f7dfaaf8235b5cb2c52c4fd63 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Fri, 19 Apr 2024 14:33:51 -0700 Subject: [PATCH 02/18] Generate patching logic --- Python/jit.c | 84 +++++---------------- Tools/jit/_stencils.py | 11 --- Tools/jit/_writer.py | 168 +++++++++++++++++++---------------------- 3 files changed, 97 insertions(+), 166 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index 3e5b3ae79d3a69..f4b75796c2df4f 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -381,15 +381,13 @@ patch_x86_64_32x(unsigned char *location, uintptr_t value) // Compiles executor in-place. Don't forget to call _PyJIT_Free later! int -_PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size_t length) +_PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], size_t length) { // Loop once to find the total compiled size: - size_t instruction_starts[UOP_MAX_TRACE_LENGTH]; size_t code_size = emitted_trampoline_code; size_t data_size = emitted_trampoline_data; for (size_t i = 0; i < length; i++) { - _PyUOpInstruction *instruction = (_PyUOpInstruction *)&trace[i]; - instruction_starts[i] = code_size; + const _PyUOpInstruction *instruction = &trace[i]; code_size += emitted[instruction->opcode][0]; data_size += emitted[instruction->opcode][1]; } @@ -404,6 +402,15 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size if (memory == NULL) { return -1; } + // Loop again to find the offsets of each instruction: + size_t offset = emitted_trampoline_code; + uintptr_t instruction_starts[UOP_MAX_TRACE_LENGTH]; + for (size_t i = 0; i < length; i++) { + const _PyUOpInstruction *instruction = &trace[i]; + instruction_starts[i] = (uintptr_t)memory + offset; + offset += emitted[instruction->opcode][0]; + } + assert(offset + emitted[_FATAL_ERROR][0] == code_size); // Loop again to emit the code: unsigned char *code = memory; unsigned char *data = memory + code_size; @@ -413,76 +420,21 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size // (which may be different for efficiency reasons). 
On platforms where // we don't change calling conventions, the trampoline is empty and // nothing is emitted here: - // Think of patches as a dictionary mapping HoleValue to uintptr_t: - uintptr_t patches[] = GET_PATCHES(); - patches[HoleValue_CODE] = (uintptr_t)code; - patches[HoleValue_CONTINUE] = (uintptr_t)code + emitted_trampoline_code; - patches[HoleValue_DATA] = (uintptr_t)data; - patches[HoleValue_EXECUTOR] = (uintptr_t)executor; - patches[HoleValue_TOP] = (uintptr_t)memory + emitted_trampoline_code; - patches[HoleValue_ZERO] = 0; - emit_trampoline(patches); + emit_trampoline(code, data, executor, NULL, instruction_starts); code += emitted_trampoline_code; data += emitted_trampoline_data; } assert(trace[0].opcode == _START_EXECUTOR || trace[0].opcode == _COLD_EXIT); for (size_t i = 0; i < length; i++) { - _PyUOpInstruction *instruction = (_PyUOpInstruction *)&trace[i]; - uintptr_t patches[] = GET_PATCHES(); - patches[HoleValue_CODE] = (uintptr_t)code; - patches[HoleValue_CONTINUE] = (uintptr_t)code + emitted[instruction->opcode][0]; - patches[HoleValue_DATA] = (uintptr_t)data; - patches[HoleValue_EXECUTOR] = (uintptr_t)executor; - patches[HoleValue_OPARG] = instruction->oparg; - #if SIZEOF_VOID_P == 8 - patches[HoleValue_OPERAND] = instruction->operand; - #else - assert(SIZEOF_VOID_P == 4); - patches[HoleValue_OPERAND_HI] = instruction->operand >> 32; - patches[HoleValue_OPERAND_LO] = instruction->operand & UINT32_MAX; - #endif - switch (instruction->format) { - case UOP_FORMAT_TARGET: - patches[HoleValue_TARGET] = instruction->target; - break; - case UOP_FORMAT_EXIT: - assert(instruction->exit_index < executor->exit_count); - patches[HoleValue_EXIT_INDEX] = instruction->exit_index; - if (instruction->error_target < length) { - patches[HoleValue_ERROR_TARGET] = (uintptr_t)memory + instruction_starts[instruction->error_target]; - } - break; - case UOP_FORMAT_JUMP: - assert(instruction->jump_target < length); - patches[HoleValue_JUMP_TARGET] = (uintptr_t)memory + instruction_starts[instruction->jump_target]; - if (instruction->error_target < length) { - patches[HoleValue_ERROR_TARGET] = (uintptr_t)memory + instruction_starts[instruction->error_target]; - } - break; - default: - assert(0); - Py_FatalError("Illegal instruction format"); - } - patches[HoleValue_TOP] = (uintptr_t)memory + instruction_starts[1]; - patches[HoleValue_ZERO] = 0; - // XXX: Args: code, data, executor, instruction, instruction_starts (with memory added) - emitters[instruction->opcode](patches); + const _PyUOpInstruction *instruction = &trace[i]; + emitters[instruction->opcode](code, data, executor, instruction, instruction_starts); code += emitted[instruction->opcode][0]; data += emitted[instruction->opcode][1]; } - { - // Protect against accidental buffer overrun into data: - uintptr_t patches[] = GET_PATCHES(); - patches[HoleValue_CODE] = (uintptr_t)code; - patches[HoleValue_CONTINUE] = (uintptr_t)code; - patches[HoleValue_DATA] = (uintptr_t)data; - patches[HoleValue_EXECUTOR] = (uintptr_t)executor; - patches[HoleValue_TOP] = (uintptr_t)code; - patches[HoleValue_ZERO] = 0; - emitters[_FATAL_ERROR](patches); - code += emitted[_FATAL_ERROR][0]; - data += emitted[_FATAL_ERROR][1]; - } + // Protect against accidental buffer overrun into data: + emitters[_FATAL_ERROR](code, data, executor, NULL, instruction_starts); + code += emitted[_FATAL_ERROR][0]; + data += emitted[_FATAL_ERROR][1]; assert(code == memory + code_size); assert(data == memory + code_size + data_size); if (mark_executable(memory, total_size)) { 
diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index 9feceb45388d05..8f97f58975ba16 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -66,17 +66,6 @@ class Hole: # Convenience method: replace = dataclasses.replace - def as_c(self) -> str: - """Dump this hole as an initialization of a C Hole struct.""" - parts = [ - f"{self.offset:#x}", - f"HoleKind_{self.kind}", - f"HoleValue_{self.value.name}", - f"&{self.symbol}" if self.symbol else "NULL", - f"{_signed(self.addend):#x}", - ] - return f"{{{', '.join(parts)}}}" - @dataclasses.dataclass class Stencil: diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index 4d73f027294570..3facf0827361d7 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -5,94 +5,85 @@ import _schema import _stencils -_PATCH_REMAP = { +_PATCH_FUNCS = { # aarch64-apple-darwin: - "ARM64_RELOC_GOT_LOAD_PAGE21": "aarch64_21x", - "ARM64_RELOC_GOT_LOAD_PAGEOFF12": "aarch64_12", - "ARM64_RELOC_PAGE21": "aarch64_21", - "ARM64_RELOC_PAGEOFF12": "aarch64_12", - "ARM64_RELOC_UNSIGNED": "64", + "ARM64_RELOC_GOT_LOAD_PAGE21": "patch_aarch64_21x", + "ARM64_RELOC_GOT_LOAD_PAGEOFF12": "patch_aarch64_12", + "ARM64_RELOC_PAGE21": "patch_aarch64_21", + "ARM64_RELOC_PAGEOFF12": "patch_aarch64_12", + "ARM64_RELOC_UNSIGNED": "patch_64", # x86_64-pc-windows-msvc: - "IMAGE_REL_AMD64_REL32": "x86_64_32x", + "IMAGE_REL_AMD64_REL32": "patch_x86_64_32x", # aarch64-pc-windows-msvc: - "IMAGE_REL_ARM64_BRANCH26": "aarch64_26", - "IMAGE_REL_ARM64_PAGEBASE_REL21": "aarch64_21x", - "IMAGE_REL_ARM64_PAGEOFFSET_12A": "aarch64_12", - "IMAGE_REL_ARM64_PAGEOFFSET_12L": "aarch64_12", + "IMAGE_REL_ARM64_BRANCH26": "patch_aarch64_26", + "IMAGE_REL_ARM64_PAGEBASE_REL21": "patch_aarch64_21x", + "IMAGE_REL_ARM64_PAGEOFFSET_12A": "patch_aarch64_12", + "IMAGE_REL_ARM64_PAGEOFFSET_12L": "patch_aarch64_12", # i686-pc-windows-msvc: - "IMAGE_REL_I386_DIR32": "32", - "IMAGE_REL_I386_REL32": "x86_64_32x", # XXX + "IMAGE_REL_I386_DIR32": "patch_32", + "IMAGE_REL_I386_REL32": "patch_x86_64_32x", # XXX # aarch64-unknown-linux-gnu: - "R_AARCH64_ABS64": "64", - "R_AARCH64_ADR_GOT_PAGE": "aarch64_21x", - "R_AARCH64_CALL26": "aarch64_26", - "R_AARCH64_JUMP26": "aarch64_26", - "R_AARCH64_LD64_GOT_LO12_NC": "aarch64_12", - "R_AARCH64_MOVW_UABS_G0_NC": "aarch64_16a", - "R_AARCH64_MOVW_UABS_G1_NC": "aarch64_16b", - "R_AARCH64_MOVW_UABS_G2_NC": "aarch64_16c", - "R_AARCH64_MOVW_UABS_G3": "aarch64_16d", + "R_AARCH64_ABS64": "patch_64", + "R_AARCH64_ADR_GOT_PAGE": "patch_aarch64_21x", + "R_AARCH64_CALL26": "patch_aarch64_26", + "R_AARCH64_JUMP26": "patch_aarch64_26", + "R_AARCH64_LD64_GOT_LO12_NC": "patch_aarch64_12", + "R_AARCH64_MOVW_UABS_G0_NC": "patch_aarch64_16a", + "R_AARCH64_MOVW_UABS_G1_NC": "patch_aarch64_16b", + "R_AARCH64_MOVW_UABS_G2_NC": "patch_aarch64_16c", + "R_AARCH64_MOVW_UABS_G3": "patch_aarch64_16d", # x86_64-unknown-linux-gnu: - "R_X86_64_64": "64", - "R_X86_64_GOTPCREL": "32r", - "R_X86_64_GOTPCRELX": "x86_64_32x", - "R_X86_64_PC32": "32r", - "R_X86_64_REX_GOTPCRELX": "x86_64_32x", + "R_X86_64_64": "patch_64", + "R_X86_64_GOTPCREL": "patch_32r", + "R_X86_64_GOTPCRELX": "patch_x86_64_32x", + "R_X86_64_PC32": "patch_32r", + "R_X86_64_REX_GOTPCRELX": "patch_x86_64_32x", # x86_64-apple-darwin: - "X86_64_RELOC_BRANCH": "32r", - "X86_64_RELOC_GOT": "x86_64_32x", - "X86_64_RELOC_GOT_LOAD": "x86_64_32x", - "X86_64_RELOC_SIGNED": "32r", - "X86_64_RELOC_UNSIGNED": "64", + "X86_64_RELOC_BRANCH": "patch_32r", + "X86_64_RELOC_GOT": "patch_x86_64_32x", + "X86_64_RELOC_GOT_LOAD": 
"patch_x86_64_32x", + "X86_64_RELOC_SIGNED": "patch_32r", + "X86_64_RELOC_UNSIGNED": "patch_64", } -def _dump_header() -> typing.Iterator[str]: - yield "typedef void (*emitter)(uintptr_t patches[]);" - # yield "typedef enum {" - # for kind in typing.get_args(_schema.HoleKind): - # yield f" HoleKind_{kind}," - # yield "} HoleKind;" - # yield "" - yield "typedef enum {" - for value in _stencils.HoleValue: - yield f" HoleValue_{value.name}," - yield "} HoleValue;" - yield "" - # yield "typedef struct {" - # yield " const size_t offset;" - # yield " const HoleKind kind;" - # yield " const HoleValue value;" - # yield " const void *symbol;" - # yield " const uint64_t addend;" - # yield "} Hole;" - # yield "" - # yield "typedef struct {" - # yield " const size_t body_size;" - # yield " const unsigned char * const body;" - # yield " const size_t holes_size;" - # yield " const Hole * const holes;" - # yield "} Stencil;" - # yield "" - # yield "typedef struct {" - # yield " const Stencil code;" - # yield " const Stencil data;" - # yield "} StencilGroup;" - # yield "" +_HOLE_EXPRS = { + _stencils.HoleValue.CODE: "(uintptr_t)code", + _stencils.HoleValue.CONTINUE: "(uintptr_t)code + sizeof(code_body)", + _stencils.HoleValue.DATA: "(uintptr_t)data", + _stencils.HoleValue.EXECUTOR: "(uintptr_t)executor", + # _stencils.HoleValue.GOT: "", + _stencils.HoleValue.OPARG: "instruction->oparg", + _stencils.HoleValue.OPERAND: "instruction->operand", + _stencils.HoleValue.OPERAND_HI: "(instruction->operand >> 32)", + _stencils.HoleValue.OPERAND_LO: "(instruction->operand & UINT32_MAX)", + _stencils.HoleValue.TARGET: "instruction->target", + _stencils.HoleValue.JUMP_TARGET: "instruction_starts[instruction->jump_target]", + _stencils.HoleValue.ERROR_TARGET: "instruction_starts[instruction->error_target]", + _stencils.HoleValue.EXIT_INDEX: "instruction->exit_index", + _stencils.HoleValue.TOP: "instruction_starts[1]", + _stencils.HoleValue.ZERO: "", +} + +def _hole_to_patch(where: str, hole: _stencils.Hole) -> str: + func = _PATCH_FUNCS[hole.kind] + location = f"{where} + {hole.offset:#x}" + value = _HOLE_EXPRS[hole.value] + if hole.symbol: + if value: + value += " + " + value += f"(uintptr_t)&{hole.symbol}" + if _stencils._signed(hole.addend): + if value: + value += " + " + value += f"{_stencils._signed(hole.addend):#x}" + return f"{func}({location}, {value});" def _dump_footer(groups: dict[str, _stencils.StencilGroup]) -> typing.Iterator[str]: - # yield "#define INIT_STENCIL(STENCIL) { \\" - # yield " .body_size = Py_ARRAY_LENGTH(STENCIL##_body) - 1, \\" - # yield " .body = STENCIL##_body, \\" - # yield " .holes_size = Py_ARRAY_LENGTH(STENCIL##_holes) - 1, \\" - # yield " .holes = STENCIL##_holes, \\" - # yield "}" - # yield "" - # yield "#define INIT_STENCIL_GROUP(OP) { \\" - # yield " .code = INIT_STENCIL(OP##_code), \\" - # yield " .data = INIT_STENCIL(OP##_data), \\" - # yield "}" - # yield "" + yield "typedef void (*emitter)(unsigned char *code, unsigned char *data," + yield " _PyExecutorObject *executor, const _PyUOpInstruction *instruction," + yield " uintptr_t instruction_starts[]);" + yield "" yield "static const emitter emitters[MAX_UOP_ID + 1] = {" for opname in sorted(groups): if opname == "trampoline": @@ -110,35 +101,34 @@ def _dump_footer(groups: dict[str, _stencils.StencilGroup]) -> typing.Iterator[s yield f"static const size_t emitted_trampoline_code = {len(groups['trampoline'].code.body)};" yield f"static const size_t emitted_trampoline_data = {len(groups['trampoline'].data.body)};" yield "" - 
yield "#define GET_PATCHES() { \\" - for value in _stencils.HoleValue: - yield f" [HoleValue_{value.name}] = (uintptr_t)0xBADBADBADBADBADB, \\" - yield "}" def _dump_stencil(opname: str, group: _stencils.StencilGroup) -> typing.Iterator[str]: - yield f"void emit_{opname}(uintptr_t patches[]) {{" - yield f" unsigned char *location;" - for part, stencil in [("data", group.data), ("code", group.code)]: + yield "void" + yield f"emit_{opname}(unsigned char *code, unsigned char *data," + yield f" {' ' * len(opname)} _PyExecutorObject *executor, const _PyUOpInstruction *instruction," + yield f" {' ' * len(opname)} uintptr_t instruction_starts[])" + yield "{" + for part, stencil in [("code", group.code), ("data", group.data)]: for line in stencil.disassembly: yield f" // {line}" if stencil.body: - yield f" const unsigned char {part}[{len(stencil.body)}] = {{" + yield f" const unsigned char {part}_body[{len(stencil.body)}] = {{" for i in range(0, len(stencil.body), 8): row = " ".join(f"{byte:#04x}," for byte in stencil.body[i : i + 8]) yield f" {row}" yield " };" - yield f" location = (unsigned char *)patches[HoleValue_{part.upper()}];" - yield f" memcpy(location, {part}, sizeof({part}));" + for part, stencil in [("data", group.data), ("code", group.code)]: + if stencil.body: + yield f" memcpy({part}, {part}_body, sizeof({part}_body));" for hole in stencil.holes: - yield f" patch_{_PATCH_REMAP[hole.kind]}(location + {hole.offset}, patches[HoleValue_{hole.value.name}]{f' + (uintptr_t)&{hole.symbol}' if hole.symbol else ''}{f' + {_stencils._signed(hole.addend):#x}' if _stencils._signed(hole.addend) else ''});" + yield f" {_hole_to_patch(part, hole)}" yield "}" yield "" def dump(groups: dict[str, _stencils.StencilGroup]) -> typing.Iterator[str]: """Yield a JIT compiler line-by-line as a C header file.""" - yield from _dump_header() - for opname, group in groups.items(): + for opname, group in sorted(groups.items()): yield from _dump_stencil(opname, group) yield from _dump_footer(groups) From 23e211c1b5010dcd53c0ceaf0650f05e7c442f91 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Fri, 19 Apr 2024 15:49:05 -0700 Subject: [PATCH 03/18] Cleanup --- Python/jit.c | 31 +++++++++++-------------------- Tools/jit/_writer.py | 26 +++++++++++++------------- 2 files changed, 24 insertions(+), 33 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index f4b75796c2df4f..4fcbb0cc46e614 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -194,7 +194,7 @@ patch_64(unsigned char *location, uintptr_t value) *loc64 = value; } -// 12-bit low part of an absolute address. Pairs nicely with patch_aarch64_21 +// 12-bit low part of an absolute address. Pairs nicely with patch_aarch64_21r // (below). static inline void patch_aarch64_12(unsigned char *location, uintptr_t value) @@ -261,7 +261,7 @@ patch_aarch64_16d(unsigned char *location, uintptr_t value) // 21-bit count of pages between this page and an absolute address's page... I // know, I know, it's weird. Pairs nicely with patch_aarch64_12 (above). 
static inline void -patch_aarch64_21(unsigned char *location, uintptr_t value) +patch_aarch64_21r(unsigned char *location, uintptr_t value) { uint32_t *loc32 = (uint32_t *)location; value = (value >> 12) - ((uintptr_t)location >> 12); @@ -275,7 +275,7 @@ patch_aarch64_21(unsigned char *location, uintptr_t value) } static inline void -patch_aarch64_21x(unsigned char *location, uintptr_t value) +patch_aarch64_21rx(unsigned char *location, uintptr_t value) { uint32_t *loc32 = (uint32_t *)location; assert(IS_AARCH64_ADRP(*loc32)); @@ -322,12 +322,12 @@ patch_aarch64_21x(unsigned char *location, uintptr_t value) // continue; // } // } - patch_aarch64_21(location, value); + patch_aarch64_21r(location, value); } // 28-bit relative branch. static inline void -patch_aarch64_26(unsigned char *location, uintptr_t value) +patch_aarch64_26r(unsigned char *location, uintptr_t value) { uint32_t *loc32 = (uint32_t *)location; assert(IS_AARCH64_BRANCH(*loc32)); @@ -342,10 +342,9 @@ patch_aarch64_26(unsigned char *location, uintptr_t value) // 32-bit relative address. static inline void -patch_x86_64_32x(unsigned char *location, uintptr_t value) +patch_x86_64_32rx(unsigned char *location, uintptr_t value) { uint8_t *loc8 = (uint8_t *)location; - uint32_t *loc32 = (uint32_t *)location; // Try to relax the GOT load into an immediate value: uint64_t relaxed = *(uint64_t *)(value + 4) - 4; if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) && @@ -369,12 +368,7 @@ patch_x86_64_32x(unsigned char *location, uintptr_t value) value = relaxed; } } - // XXX: Dup of patch_R_X86_64_GOTPCREL: - value -= (uintptr_t)location; - // Check that we're not out of range of 32 signed bits: - assert((int64_t)value >= -(1LL << 31)); - assert((int64_t)value < (1LL << 31)); - *loc32 = (uint32_t)value; + patch_32r(location, value); } #include "jit_stencils.h" @@ -384,10 +378,12 @@ int _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], size_t length) { // Loop once to find the total compiled size: + uintptr_t instruction_starts[UOP_MAX_TRACE_LENGTH]; size_t code_size = emitted_trampoline_code; size_t data_size = emitted_trampoline_data; for (size_t i = 0; i < length; i++) { const _PyUOpInstruction *instruction = &trace[i]; + instruction_starts[i] = code_size; code_size += emitted[instruction->opcode][0]; data_size += emitted[instruction->opcode][1]; } @@ -402,15 +398,10 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz if (memory == NULL) { return -1; } - // Loop again to find the offsets of each instruction: - size_t offset = emitted_trampoline_code; - uintptr_t instruction_starts[UOP_MAX_TRACE_LENGTH]; + // Update the offsets of each instruction: for (size_t i = 0; i < length; i++) { - const _PyUOpInstruction *instruction = &trace[i]; - instruction_starts[i] = (uintptr_t)memory + offset; - offset += emitted[instruction->opcode][0]; + instruction_starts[i] += (uintptr_t)memory; } - assert(offset + emitted[_FATAL_ERROR][0] == code_size); // Loop again to emit the code: unsigned char *code = memory; unsigned char *data = memory + code_size; diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index 3facf0827361d7..31e2c390e05d85 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -7,26 +7,26 @@ _PATCH_FUNCS = { # aarch64-apple-darwin: - "ARM64_RELOC_GOT_LOAD_PAGE21": "patch_aarch64_21x", + "ARM64_RELOC_GOT_LOAD_PAGE21": "patch_aarch64_21rx", "ARM64_RELOC_GOT_LOAD_PAGEOFF12": "patch_aarch64_12", - "ARM64_RELOC_PAGE21": "patch_aarch64_21", + 
"ARM64_RELOC_PAGE21": "patch_aarch64_21r", "ARM64_RELOC_PAGEOFF12": "patch_aarch64_12", "ARM64_RELOC_UNSIGNED": "patch_64", # x86_64-pc-windows-msvc: - "IMAGE_REL_AMD64_REL32": "patch_x86_64_32x", + "IMAGE_REL_AMD64_REL32": "patch_x86_64_32rx", # aarch64-pc-windows-msvc: - "IMAGE_REL_ARM64_BRANCH26": "patch_aarch64_26", - "IMAGE_REL_ARM64_PAGEBASE_REL21": "patch_aarch64_21x", + "IMAGE_REL_ARM64_BRANCH26": "patch_aarch64_26r", + "IMAGE_REL_ARM64_PAGEBASE_REL21": "patch_aarch64_21rx", "IMAGE_REL_ARM64_PAGEOFFSET_12A": "patch_aarch64_12", "IMAGE_REL_ARM64_PAGEOFFSET_12L": "patch_aarch64_12", # i686-pc-windows-msvc: "IMAGE_REL_I386_DIR32": "patch_32", - "IMAGE_REL_I386_REL32": "patch_x86_64_32x", # XXX + "IMAGE_REL_I386_REL32": "patch_x86_64_32rx", # XXX # aarch64-unknown-linux-gnu: "R_AARCH64_ABS64": "patch_64", - "R_AARCH64_ADR_GOT_PAGE": "patch_aarch64_21x", - "R_AARCH64_CALL26": "patch_aarch64_26", - "R_AARCH64_JUMP26": "patch_aarch64_26", + "R_AARCH64_ADR_GOT_PAGE": "patch_aarch64_21rx", + "R_AARCH64_CALL26": "patch_aarch64_26r", + "R_AARCH64_JUMP26": "patch_aarch64_26r", "R_AARCH64_LD64_GOT_LO12_NC": "patch_aarch64_12", "R_AARCH64_MOVW_UABS_G0_NC": "patch_aarch64_16a", "R_AARCH64_MOVW_UABS_G1_NC": "patch_aarch64_16b", @@ -35,13 +35,13 @@ # x86_64-unknown-linux-gnu: "R_X86_64_64": "patch_64", "R_X86_64_GOTPCREL": "patch_32r", - "R_X86_64_GOTPCRELX": "patch_x86_64_32x", + "R_X86_64_GOTPCRELX": "patch_x86_64_32rx", "R_X86_64_PC32": "patch_32r", - "R_X86_64_REX_GOTPCRELX": "patch_x86_64_32x", + "R_X86_64_REX_GOTPCRELX": "patch_x86_64_32rx", # x86_64-apple-darwin: "X86_64_RELOC_BRANCH": "patch_32r", - "X86_64_RELOC_GOT": "patch_x86_64_32x", - "X86_64_RELOC_GOT_LOAD": "patch_x86_64_32x", + "X86_64_RELOC_GOT": "patch_x86_64_32rx", + "X86_64_RELOC_GOT_LOAD": "patch_x86_64_32rx", "X86_64_RELOC_SIGNED": "patch_32r", "X86_64_RELOC_UNSIGNED": "patch_64", } From 431fbed4c641b507ce47dd52f13312498cdb9f4d Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Fri, 19 Apr 2024 16:01:37 -0700 Subject: [PATCH 04/18] uint64_t -> uintptr_t --- Python/jit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index 4fcbb0cc46e614..18d67a1f271bcb 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -116,7 +116,7 @@ mark_executable(unsigned char *memory, size_t size) // value[value_start : value_start + len] static uint32_t -get_bits(uint64_t value, uint8_t value_start, uint8_t width) +get_bits(uintptr_t value, uint8_t value_start, uint8_t width) { assert(width <= 32); return (value >> value_start) & ((1ULL << width) - 1); @@ -124,7 +124,7 @@ get_bits(uint64_t value, uint8_t value_start, uint8_t width) // *loc[loc_start : loc_start + width] = value[value_start : value_start + width] static void -set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start, +set_bits(uint32_t *loc, uint8_t loc_start, uintptr_t value, uint8_t value_start, uint8_t width) { assert(loc_start + width <= 32); @@ -295,7 +295,7 @@ patch_aarch64_21rx(unsigned char *location, uintptr_t value) // // There should be only one register involved: // assert(reg == get_bits(loc32[1], 0, 5)); // ldr's output register. // assert(reg == get_bits(loc32[1], 5, 5)); // ldr's input register. 
- // uint64_t relaxed = *(uint64_t *)value; + // uintptr_t relaxed = *(uintptr_t *)value; // if (relaxed < (1UL << 16)) { // // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop // loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg; @@ -346,7 +346,7 @@ patch_x86_64_32rx(unsigned char *location, uintptr_t value) { uint8_t *loc8 = (uint8_t *)location; // Try to relax the GOT load into an immediate value: - uint64_t relaxed = *(uint64_t *)(value + 4) - 4; + uintptr_t relaxed = *(uintptr_t *)(value + 4) - 4; if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) && (int64_t)relaxed - (int64_t)location + 1 < (1LL << 31)) { From 3e6b25c3da42c8a1edfd9f49669827953daff52b Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Sat, 20 Apr 2024 16:16:56 -0700 Subject: [PATCH 05/18] uintptr_t -> uint64_t --- Python/jit.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index 18d67a1f271bcb..26f9cb11b10e22 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -116,7 +116,7 @@ mark_executable(unsigned char *memory, size_t size) // value[value_start : value_start + len] static uint32_t -get_bits(uintptr_t value, uint8_t value_start, uint8_t width) +get_bits(uint64_t value, uint8_t value_start, uint8_t width) { assert(width <= 32); return (value >> value_start) & ((1ULL << width) - 1); @@ -124,7 +124,7 @@ get_bits(uintptr_t value, uint8_t value_start, uint8_t width) // *loc[loc_start : loc_start + width] = value[value_start : value_start + width] static void -set_bits(uint32_t *loc, uint8_t loc_start, uintptr_t value, uint8_t value_start, +set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start, uint8_t width) { assert(loc_start + width <= 32); @@ -166,7 +166,7 @@ set_bits(uint32_t *loc, uint8_t loc_start, uintptr_t value, uint8_t value_start, // 32-bit absolute address. static inline void -patch_32(unsigned char *location, uintptr_t value) +patch_32(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; // Check that we're not out of range of 32 unsigned bits: @@ -176,7 +176,7 @@ patch_32(unsigned char *location, uintptr_t value) // 32-bit relative address. static inline void -patch_32r(unsigned char *location, uintptr_t value) +patch_32r(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; value -= (uintptr_t)location; @@ -188,7 +188,7 @@ patch_32r(unsigned char *location, uintptr_t value) // 64-bit absolute address. static inline void -patch_64(unsigned char *location, uintptr_t value) +patch_64(unsigned char *location, uint64_t value) { uint64_t *loc64 = (uint64_t *)location; *loc64 = value; @@ -197,7 +197,7 @@ patch_64(unsigned char *location, uintptr_t value) // 12-bit low part of an absolute address. Pairs nicely with patch_aarch64_21r // (below). static inline void -patch_aarch64_12(unsigned char *location, uintptr_t value) +patch_aarch64_12(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; assert(IS_AARCH64_LDR_OR_STR(*loc32) || IS_AARCH64_ADD_OR_SUB(*loc32)); @@ -216,7 +216,7 @@ patch_aarch64_12(unsigned char *location, uintptr_t value) // 16-bit low part of an absolute address. 
static inline void -patch_aarch64_16a(unsigned char *location, uintptr_t value) +patch_aarch64_16a(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; assert(IS_AARCH64_MOV(*loc32)); @@ -227,7 +227,7 @@ patch_aarch64_16a(unsigned char *location, uintptr_t value) // 16-bit middle-low part of an absolute address. static inline void -patch_aarch64_16b(unsigned char *location, uintptr_t value) +patch_aarch64_16b(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; assert(IS_AARCH64_MOV(*loc32)); @@ -238,7 +238,7 @@ patch_aarch64_16b(unsigned char *location, uintptr_t value) // 16-bit middle-high part of an absolute address. static inline void -patch_aarch64_16c(unsigned char *location, uintptr_t value) +patch_aarch64_16c(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; assert(IS_AARCH64_MOV(*loc32)); @@ -249,7 +249,7 @@ patch_aarch64_16c(unsigned char *location, uintptr_t value) // 16-bit high part of an absolute address. static inline void -patch_aarch64_16d(unsigned char *location, uintptr_t value) +patch_aarch64_16d(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; assert(IS_AARCH64_MOV(*loc32)); @@ -261,7 +261,7 @@ patch_aarch64_16d(unsigned char *location, uintptr_t value) // 21-bit count of pages between this page and an absolute address's page... I // know, I know, it's weird. Pairs nicely with patch_aarch64_12 (above). static inline void -patch_aarch64_21r(unsigned char *location, uintptr_t value) +patch_aarch64_21r(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; value = (value >> 12) - ((uintptr_t)location >> 12); @@ -275,7 +275,7 @@ patch_aarch64_21r(unsigned char *location, uintptr_t value) } static inline void -patch_aarch64_21rx(unsigned char *location, uintptr_t value) +patch_aarch64_21rx(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; assert(IS_AARCH64_ADRP(*loc32)); @@ -295,7 +295,7 @@ patch_aarch64_21rx(unsigned char *location, uintptr_t value) // // There should be only one register involved: // assert(reg == get_bits(loc32[1], 0, 5)); // ldr's output register. // assert(reg == get_bits(loc32[1], 5, 5)); // ldr's input register. - // uintptr_t relaxed = *(uintptr_t *)value; + // uint64_t relaxed = *(uint64_t *)value; // if (relaxed < (1UL << 16)) { // // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop // loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg; @@ -327,7 +327,7 @@ patch_aarch64_21rx(unsigned char *location, uintptr_t value) // 28-bit relative branch. static inline void -patch_aarch64_26r(unsigned char *location, uintptr_t value) +patch_aarch64_26r(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; assert(IS_AARCH64_BRANCH(*loc32)); @@ -342,11 +342,11 @@ patch_aarch64_26r(unsigned char *location, uintptr_t value) // 32-bit relative address. 
static inline void -patch_x86_64_32rx(unsigned char *location, uintptr_t value) +patch_x86_64_32rx(unsigned char *location, uint64_t value) { uint8_t *loc8 = (uint8_t *)location; // Try to relax the GOT load into an immediate value: - uintptr_t relaxed = *(uintptr_t *)(value + 4) - 4; + uint64_t relaxed = *(uint64_t *)(value + 4) - 4; if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) && (int64_t)relaxed - (int64_t)location + 1 < (1LL << 31)) { From 82030c854c7b6223706576c2981385edb68fb489 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Sat, 20 Apr 2024 16:40:55 -0700 Subject: [PATCH 06/18] Linting --- Tools/jit/_stencils.py | 74 ++++++++++++++++++++++++++++++++++++++++ Tools/jit/_writer.py | 76 +----------------------------------------- 2 files changed, 75 insertions(+), 75 deletions(-) diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index 8f97f58975ba16..4127a4a1acd7b0 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -47,6 +47,65 @@ class HoleValue(enum.Enum): ZERO = enum.auto() +_PATCH_FUNCS = { + # aarch64-apple-darwin: + "ARM64_RELOC_GOT_LOAD_PAGE21": "patch_aarch64_21rx", # XXX + "ARM64_RELOC_GOT_LOAD_PAGEOFF12": "patch_aarch64_12", + "ARM64_RELOC_PAGE21": "patch_aarch64_21r", + "ARM64_RELOC_PAGEOFF12": "patch_aarch64_12", + "ARM64_RELOC_UNSIGNED": "patch_64", + # x86_64-pc-windows-msvc: + "IMAGE_REL_AMD64_REL32": "patch_x86_64_32rx", + # aarch64-pc-windows-msvc: + "IMAGE_REL_ARM64_BRANCH26": "patch_aarch64_26r", + "IMAGE_REL_ARM64_PAGEBASE_REL21": "patch_aarch64_21rx", # XXX + "IMAGE_REL_ARM64_PAGEOFFSET_12A": "patch_aarch64_12", + "IMAGE_REL_ARM64_PAGEOFFSET_12L": "patch_aarch64_12", + # i686-pc-windows-msvc: + "IMAGE_REL_I386_DIR32": "patch_32", + "IMAGE_REL_I386_REL32": "patch_x86_64_32rx", # XXX + # aarch64-unknown-linux-gnu: + "R_AARCH64_ABS64": "patch_64", + "R_AARCH64_ADR_GOT_PAGE": "patch_aarch64_21rx", # XXX + "R_AARCH64_CALL26": "patch_aarch64_26r", + "R_AARCH64_JUMP26": "patch_aarch64_26r", + "R_AARCH64_LD64_GOT_LO12_NC": "patch_aarch64_12", + "R_AARCH64_MOVW_UABS_G0_NC": "patch_aarch64_16a", + "R_AARCH64_MOVW_UABS_G1_NC": "patch_aarch64_16b", + "R_AARCH64_MOVW_UABS_G2_NC": "patch_aarch64_16c", + "R_AARCH64_MOVW_UABS_G3": "patch_aarch64_16d", + # x86_64-unknown-linux-gnu: + "R_X86_64_64": "patch_64", + "R_X86_64_GOTPCREL": "patch_32r", + "R_X86_64_GOTPCRELX": "patch_x86_64_32rx", + "R_X86_64_PC32": "patch_32r", + "R_X86_64_REX_GOTPCRELX": "patch_x86_64_32rx", + # x86_64-apple-darwin: + "X86_64_RELOC_BRANCH": "patch_32r", + "X86_64_RELOC_GOT": "patch_x86_64_32rx", + "X86_64_RELOC_GOT_LOAD": "patch_x86_64_32rx", + "X86_64_RELOC_SIGNED": "patch_32r", + "X86_64_RELOC_UNSIGNED": "patch_64", +} +_HOLE_EXPRS = { + HoleValue.CODE: "(uintptr_t)code", + HoleValue.CONTINUE: "(uintptr_t)code + sizeof(code_body)", + HoleValue.DATA: "(uintptr_t)data", + HoleValue.EXECUTOR: "(uintptr_t)executor", + # HoleValue.GOT: "", + HoleValue.OPARG: "instruction->oparg", + HoleValue.OPERAND: "instruction->operand", + HoleValue.OPERAND_HI: "(instruction->operand >> 32)", + HoleValue.OPERAND_LO: "(instruction->operand & UINT32_MAX)", + HoleValue.TARGET: "instruction->target", + HoleValue.JUMP_TARGET: "instruction_starts[instruction->jump_target]", + HoleValue.ERROR_TARGET: "instruction_starts[instruction->error_target]", + HoleValue.EXIT_INDEX: "instruction->exit_index", + HoleValue.TOP: "instruction_starts[1]", + HoleValue.ZERO: "", +} + + @dataclasses.dataclass class Hole: """ @@ -66,6 +125,21 @@ class Hole: # Convenience method: replace = 
dataclasses.replace + def as_c(self, where: str) -> str: + """Dump this hole as a call to a patch_* function.""" + func = _PATCH_FUNCS[self.kind] + location = f"{where} + {self.offset:#x}" + value = _HOLE_EXPRS[self.value] + if self.symbol: + if value: + value += " + " + value += f"(uintptr_t)&{self.symbol}" + if _signed(self.addend): + if value: + value += " + " + value += f"{_signed(self.addend):#x}" + return f"{func}({location}, {value});" + @dataclasses.dataclass class Stencil: diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index 31e2c390e05d85..a6ffc7cd108bd9 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -2,82 +2,8 @@ import typing -import _schema import _stencils -_PATCH_FUNCS = { - # aarch64-apple-darwin: - "ARM64_RELOC_GOT_LOAD_PAGE21": "patch_aarch64_21rx", - "ARM64_RELOC_GOT_LOAD_PAGEOFF12": "patch_aarch64_12", - "ARM64_RELOC_PAGE21": "patch_aarch64_21r", - "ARM64_RELOC_PAGEOFF12": "patch_aarch64_12", - "ARM64_RELOC_UNSIGNED": "patch_64", - # x86_64-pc-windows-msvc: - "IMAGE_REL_AMD64_REL32": "patch_x86_64_32rx", - # aarch64-pc-windows-msvc: - "IMAGE_REL_ARM64_BRANCH26": "patch_aarch64_26r", - "IMAGE_REL_ARM64_PAGEBASE_REL21": "patch_aarch64_21rx", - "IMAGE_REL_ARM64_PAGEOFFSET_12A": "patch_aarch64_12", - "IMAGE_REL_ARM64_PAGEOFFSET_12L": "patch_aarch64_12", - # i686-pc-windows-msvc: - "IMAGE_REL_I386_DIR32": "patch_32", - "IMAGE_REL_I386_REL32": "patch_x86_64_32rx", # XXX - # aarch64-unknown-linux-gnu: - "R_AARCH64_ABS64": "patch_64", - "R_AARCH64_ADR_GOT_PAGE": "patch_aarch64_21rx", - "R_AARCH64_CALL26": "patch_aarch64_26r", - "R_AARCH64_JUMP26": "patch_aarch64_26r", - "R_AARCH64_LD64_GOT_LO12_NC": "patch_aarch64_12", - "R_AARCH64_MOVW_UABS_G0_NC": "patch_aarch64_16a", - "R_AARCH64_MOVW_UABS_G1_NC": "patch_aarch64_16b", - "R_AARCH64_MOVW_UABS_G2_NC": "patch_aarch64_16c", - "R_AARCH64_MOVW_UABS_G3": "patch_aarch64_16d", - # x86_64-unknown-linux-gnu: - "R_X86_64_64": "patch_64", - "R_X86_64_GOTPCREL": "patch_32r", - "R_X86_64_GOTPCRELX": "patch_x86_64_32rx", - "R_X86_64_PC32": "patch_32r", - "R_X86_64_REX_GOTPCRELX": "patch_x86_64_32rx", - # x86_64-apple-darwin: - "X86_64_RELOC_BRANCH": "patch_32r", - "X86_64_RELOC_GOT": "patch_x86_64_32rx", - "X86_64_RELOC_GOT_LOAD": "patch_x86_64_32rx", - "X86_64_RELOC_SIGNED": "patch_32r", - "X86_64_RELOC_UNSIGNED": "patch_64", -} - -_HOLE_EXPRS = { - _stencils.HoleValue.CODE: "(uintptr_t)code", - _stencils.HoleValue.CONTINUE: "(uintptr_t)code + sizeof(code_body)", - _stencils.HoleValue.DATA: "(uintptr_t)data", - _stencils.HoleValue.EXECUTOR: "(uintptr_t)executor", - # _stencils.HoleValue.GOT: "", - _stencils.HoleValue.OPARG: "instruction->oparg", - _stencils.HoleValue.OPERAND: "instruction->operand", - _stencils.HoleValue.OPERAND_HI: "(instruction->operand >> 32)", - _stencils.HoleValue.OPERAND_LO: "(instruction->operand & UINT32_MAX)", - _stencils.HoleValue.TARGET: "instruction->target", - _stencils.HoleValue.JUMP_TARGET: "instruction_starts[instruction->jump_target]", - _stencils.HoleValue.ERROR_TARGET: "instruction_starts[instruction->error_target]", - _stencils.HoleValue.EXIT_INDEX: "instruction->exit_index", - _stencils.HoleValue.TOP: "instruction_starts[1]", - _stencils.HoleValue.ZERO: "", -} - -def _hole_to_patch(where: str, hole: _stencils.Hole) -> str: - func = _PATCH_FUNCS[hole.kind] - location = f"{where} + {hole.offset:#x}" - value = _HOLE_EXPRS[hole.value] - if hole.symbol: - if value: - value += " + " - value += f"(uintptr_t)&{hole.symbol}" - if _stencils._signed(hole.addend): - if value: - value += " 
+ " - value += f"{_stencils._signed(hole.addend):#x}" - return f"{func}({location}, {value});" - def _dump_footer(groups: dict[str, _stencils.StencilGroup]) -> typing.Iterator[str]: yield "typedef void (*emitter)(unsigned char *code, unsigned char *data," @@ -122,7 +48,7 @@ def _dump_stencil(opname: str, group: _stencils.StencilGroup) -> typing.Iterator if stencil.body: yield f" memcpy({part}, {part}_body, sizeof({part}_body));" for hole in stencil.holes: - yield f" {_hole_to_patch(part, hole)}" + yield f" {hole.as_c(part)}" yield "}" yield "" From 236af82a52b2d1168419a54a2e9a9590f628dfed Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Sat, 27 Apr 2024 18:18:41 -0700 Subject: [PATCH 07/18] Restore AArch64 pair folding --- Python/jit.c | 83 +++++++++++++++++++----------------------- Tools/jit/_stencils.py | 21 ++++++++++- Tools/jit/_writer.py | 11 +++++- 3 files changed, 67 insertions(+), 48 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index 26f9cb11b10e22..e30220c9f0f340 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -277,51 +277,6 @@ patch_aarch64_21r(unsigned char *location, uint64_t value) static inline void patch_aarch64_21rx(unsigned char *location, uint64_t value) { - uint32_t *loc32 = (uint32_t *)location; - assert(IS_AARCH64_ADRP(*loc32)); - // // Try to relax the pair of GOT loads into an immediate value: - // const Hole *next_hole = &stencil->holes[i + 1]; - // if (i + 1 < stencil->holes_size && - // (next_hole->kind == HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12 || - // next_hole->kind == HoleKind_IMAGE_REL_ARM64_PAGEOFFSET_12L || - // next_hole->kind == HoleKind_R_AARCH64_LD64_GOT_LO12_NC) && - // next_hole->offset == hole->offset + 4 && - // next_hole->symbol == hole->symbol && - // next_hole->addend == hole->addend && - // next_hole->value == hole->value) - // { - // unsigned char reg = get_bits(loc32[0], 0, 5); - // assert(IS_AARCH64_LDR_OR_STR(loc32[1])); - // // There should be only one register involved: - // assert(reg == get_bits(loc32[1], 0, 5)); // ldr's output register. - // assert(reg == get_bits(loc32[1], 5, 5)); // ldr's input register. - // uint64_t relaxed = *(uint64_t *)value; - // if (relaxed < (1UL << 16)) { - // // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop - // loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg; - // loc32[1] = 0xD503201F; - // i++; - // continue; - // } - // if (relaxed < (1ULL << 32)) { - // // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY - // loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg; - // loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | reg; - // i++; - // continue; - // } - // relaxed = value - (uintptr_t)location; - // if ((relaxed & 0x3) == 0 && - // (int64_t)relaxed >= -(1L << 19) && - // (int64_t)relaxed < (1L << 19)) - // { - // // adrp reg, AAA; ldr reg, [reg + BBB] -> ldr reg, XXX; nop - // loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | reg; - // loc32[1] = 0xD503201F; - // i++; - // continue; - // } - // } patch_aarch64_21r(location, value); } @@ -340,6 +295,44 @@ patch_aarch64_26r(unsigned char *location, uint64_t value) set_bits(loc32, 0, value, 2, 26); } +// A pair of patch_aarch64_21rx and patch_aarch64_12. 
+static inline void +patch_aarch64_33rx(unsigned char *location, uint64_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + assert(IS_AARCH64_ADRP(*loc32)); + // Try to relax the pair of GOT loads into an immediate value: + unsigned char reg = get_bits(loc32[0], 0, 5); + assert(IS_AARCH64_LDR_OR_STR(loc32[1])); + // There should be only one register involved: + assert(reg == get_bits(loc32[1], 0, 5)); // ldr's output register. + assert(reg == get_bits(loc32[1], 5, 5)); // ldr's input register. + uint64_t relaxed = *(uint64_t *)value; + if (relaxed < (1UL << 16)) { + // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop + loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg; + loc32[1] = 0xD503201F; + return; + } + if (relaxed < (1ULL << 32)) { + // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY + loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg; + loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | reg; + return; + } + relaxed = value - (uintptr_t)location; + if ((relaxed & 0x3) == 0 && + (int64_t)relaxed >= -(1L << 19) && + (int64_t)relaxed < (1L << 19)) + { + // adrp reg, AAA; ldr reg, [reg + BBB] -> ldr reg, XXX; nop + loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | reg; + loc32[1] = 0xD503201F; + return; + } + patch_aarch64_21r(location, value); +} + // 32-bit relative address. static inline void patch_x86_64_32rx(unsigned char *location, uint64_t value) diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index 4127a4a1acd7b0..d454113cebedc9 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -3,6 +3,7 @@ import dataclasses import enum import sys +import typing import _schema @@ -122,12 +123,28 @@ class Hole: symbol: str | None # ...plus this addend: addend: int + func: str = dataclasses.field(init=False) # Convenience method: replace = dataclasses.replace + def __post_init__(self) -> None: + self.func = _PATCH_FUNCS[self.kind] + + def fold(self, other: typing.Self) -> typing.Self | None: + if ( + self.offset + 4 == other.offset + and self.value == other.value + and self.symbol == other.symbol + and self.addend == other.addend + and self.func == "patch_aarch64_21rx" + and other.func == "patch_aarch64_12" + ): + folded = self.replace() + folded.func = "patch_aarch64_33rx" + return folded + def as_c(self, where: str) -> str: """Dump this hole as a call to a patch_* function.""" - func = _PATCH_FUNCS[self.kind] location = f"{where} + {self.offset:#x}" value = _HOLE_EXPRS[self.value] if self.symbol: @@ -138,7 +155,7 @@ def as_c(self, where: str) -> str: if value: value += " + " value += f"{_signed(self.addend):#x}" - return f"{func}({location}, {value});" + return f"{self.func}({location}, {value});" @dataclasses.dataclass diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index a6ffc7cd108bd9..189c81ebb58142 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -1,5 +1,6 @@ """Utilities for writing StencilGroups out to a C header file.""" +import itertools import typing import _stencils @@ -47,7 +48,15 @@ def _dump_stencil(opname: str, group: _stencils.StencilGroup) -> typing.Iterator for part, stencil in [("data", group.data), ("code", group.code)]: if stencil.body: yield f" memcpy({part}, {part}_body, sizeof({part}_body));" - for hole in stencil.holes: + skip = False + stencil.holes.sort(key=lambda hole: hole.offset) + for hole, pair in itertools.zip_longest(stencil.holes, stencil.holes[1:]): + if skip: + skip = False + continue + if pair and (folded := hole.fold(pair)): + 
skip = True + hole = folded yield f" {hole.as_c(part)}" yield "}" yield "" From 3b7e693e1a4a737dfb554bd40bbaa23dc10d2cb6 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Mon, 29 Apr 2024 08:55:45 -0700 Subject: [PATCH 08/18] Cleanup --- Tools/jit/_stencils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index d454113cebedc9..42a368780fda78 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -50,7 +50,7 @@ class HoleValue(enum.Enum): _PATCH_FUNCS = { # aarch64-apple-darwin: - "ARM64_RELOC_GOT_LOAD_PAGE21": "patch_aarch64_21rx", # XXX + "ARM64_RELOC_GOT_LOAD_PAGE21": "patch_aarch64_21rx", "ARM64_RELOC_GOT_LOAD_PAGEOFF12": "patch_aarch64_12", "ARM64_RELOC_PAGE21": "patch_aarch64_21r", "ARM64_RELOC_PAGEOFF12": "patch_aarch64_12", @@ -59,15 +59,15 @@ class HoleValue(enum.Enum): "IMAGE_REL_AMD64_REL32": "patch_x86_64_32rx", # aarch64-pc-windows-msvc: "IMAGE_REL_ARM64_BRANCH26": "patch_aarch64_26r", - "IMAGE_REL_ARM64_PAGEBASE_REL21": "patch_aarch64_21rx", # XXX + "IMAGE_REL_ARM64_PAGEBASE_REL21": "patch_aarch64_21rx", "IMAGE_REL_ARM64_PAGEOFFSET_12A": "patch_aarch64_12", "IMAGE_REL_ARM64_PAGEOFFSET_12L": "patch_aarch64_12", # i686-pc-windows-msvc: "IMAGE_REL_I386_DIR32": "patch_32", - "IMAGE_REL_I386_REL32": "patch_x86_64_32rx", # XXX + "IMAGE_REL_I386_REL32": "patch_x86_64_32rx", # aarch64-unknown-linux-gnu: "R_AARCH64_ABS64": "patch_64", - "R_AARCH64_ADR_GOT_PAGE": "patch_aarch64_21rx", # XXX + "R_AARCH64_ADR_GOT_PAGE": "patch_aarch64_21rx", "R_AARCH64_CALL26": "patch_aarch64_26r", "R_AARCH64_JUMP26": "patch_aarch64_26r", "R_AARCH64_LD64_GOT_LO12_NC": "patch_aarch64_12", @@ -131,6 +131,7 @@ def __post_init__(self) -> None: self.func = _PATCH_FUNCS[self.kind] def fold(self, other: typing.Self) -> typing.Self | None: + """Combine two holes into a single hole, if possible.""" if ( self.offset + 4 == other.offset and self.value == other.value @@ -142,6 +143,7 @@ def fold(self, other: typing.Self) -> typing.Self | None: folded = self.replace() folded.func = "patch_aarch64_33rx" return folded + return None def as_c(self, where: str) -> str: """Dump this hole as a call to a patch_* function.""" From fbb97fca7f0de5851f339937b39f31e7736b611d Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Mon, 29 Apr 2024 16:55:18 -0700 Subject: [PATCH 09/18] Add missing relocations --- Python/jit.c | 3 ++- Tools/jit/_stencils.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Python/jit.c b/Python/jit.c index e30220c9f0f340..bf29e4707a6c96 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -330,7 +330,8 @@ patch_aarch64_33rx(unsigned char *location, uint64_t value) loc32[1] = 0xD503201F; return; } - patch_aarch64_21r(location, value); + patch_aarch64_21rx(location, value); + patch_aarch64_12(location + 4, value); } // 32-bit relative address. 
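The relaxation that patch_aarch64_33rx performs is easiest to see with a concrete value. Below is a minimal, self-contained sketch (not CPython code: the adrp/ldr words and the pretend GOT entry are invented stand-ins, and only the movz/nop rewrite and the get_bits helper are taken from the hunks above) of the case where the loaded value fits in 16 bits:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    // Copy of the get_bits helper from Python/jit.c (minus its assert):
    static uint32_t
    get_bits(uint64_t value, uint8_t value_start, uint8_t width)
    {
        return (value >> value_start) & ((1ULL << width) - 1);
    }

    int
    main(void)
    {
        // A stand-in adrp/ldr pair loading x1 through the GOT (placeholder
        // encodings, chosen so that the destination register is x1):
        uint32_t pair[2] = {0x90000001, 0xF9400021};
        // Pretend the GOT entry holds a value that fits in 16 bits:
        uint64_t relaxed = 0x1234;
        unsigned char reg = get_bits(pair[0], 0, 5);
        assert(relaxed < (1UL << 16));
        // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop
        pair[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg;
        pair[1] = 0xD503201F;
        printf("%08x %08x\n", pair[0], pair[1]);  // d2824681 d503201f
        return 0;
    }

The 32-bit (movz + movk) and PC-relative (ldr literal) cases in patch_aarch64_33rx follow the same pattern, just with different replacement encodings.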
diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index 42a368780fda78..6810f961ab4992 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -50,6 +50,7 @@ class HoleValue(enum.Enum): _PATCH_FUNCS = { # aarch64-apple-darwin: + "ARM64_RELOC_BRANCH26": "patch_aarch64_26r", "ARM64_RELOC_GOT_LOAD_PAGE21": "patch_aarch64_21rx", "ARM64_RELOC_GOT_LOAD_PAGEOFF12": "patch_aarch64_12", "ARM64_RELOC_PAGE21": "patch_aarch64_21r", @@ -67,7 +68,9 @@ class HoleValue(enum.Enum): "IMAGE_REL_I386_REL32": "patch_x86_64_32rx", # aarch64-unknown-linux-gnu: "R_AARCH64_ABS64": "patch_64", + "R_AARCH64_ADD_ABS_LO12_NC": "patch_aarch64_12", "R_AARCH64_ADR_GOT_PAGE": "patch_aarch64_21rx", + "R_AARCH64_ADR_PREL_PG_HI21": "patch_aarch64_21rx", "R_AARCH64_CALL26": "patch_aarch64_26r", "R_AARCH64_JUMP26": "patch_aarch64_26r", "R_AARCH64_LD64_GOT_LO12_NC": "patch_aarch64_12", From c40bb341d647f9011acdfd24dff523f3167f35a2 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Mon, 29 Apr 2024 17:49:55 -0700 Subject: [PATCH 10/18] Fix AArch64 folds --- Python/jit.c | 10 ++++++++-- Tools/jit/_stencils.py | 10 +++++----- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index bf29e4707a6c96..9da90a885f3f08 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -214,6 +214,12 @@ patch_aarch64_12(unsigned char *location, uint64_t value) set_bits(loc32, 10, value, shift, 12); } +static inline void +patch_aarch64_12x(unsigned char *location, uint64_t value) +{ + patch_aarch64_12(location, value); +} + // 16-bit low part of an absolute address. static inline void patch_aarch64_16a(unsigned char *location, uint64_t value) @@ -295,7 +301,7 @@ patch_aarch64_26r(unsigned char *location, uint64_t value) set_bits(loc32, 0, value, 2, 26); } -// A pair of patch_aarch64_21rx and patch_aarch64_12. +// A pair of patch_aarch64_21rx and patch_aarch64_12x. static inline void patch_aarch64_33rx(unsigned char *location, uint64_t value) { @@ -331,7 +337,7 @@ patch_aarch64_33rx(unsigned char *location, uint64_t value) return; } patch_aarch64_21rx(location, value); - patch_aarch64_12(location + 4, value); + patch_aarch64_12x(location + 4, value); } // 32-bit relative address. 
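The x86-64 analogue of this relaxation, patch_x86_64_32rx, rewrites opcode bytes in place rather than whole instruction words. A minimal, self-contained sketch of its mov-to-lea case (the bytes below are invented stand-ins, and the follow-up rewrite of the 32-bit displacement is omitted):

    #include <stdio.h>

    int
    main(void)
    {
        // "mov rax, qword ptr [rip + 0]" followed by "ret" (stand-in bytes):
        unsigned char code[8] = {0x48, 0x8B, 0x05, 0x00, 0x00, 0x00, 0x00, 0xC3};
        // As in patch_x86_64_32rx, loc8 points at the 32-bit displacement:
        unsigned char *loc8 = code + 3;
        // If the relaxed target is within 32 signed bits of the instruction,
        // the indirect load can become a RIP-relative lea:
        if (loc8[-2] == 0x8B) {
            loc8[-2] = 0x8D;  // mov reg, [rip + AAA] -> lea reg, [rip + XXX]
        }
        for (int i = 0; i < 8; i++) {
            printf("%02x ", code[i]);
        }
        printf("\n");  // 48 8d 05 00 00 00 00 c3
        return 0;
    }

The call and jmp cases work the same way, swapping the two-byte indirect forms (0xFF 0x15 and 0xFF 0x25) for a nop followed by a direct call or jmp.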
diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index 6810f961ab4992..b04cf4a6a475a0 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -52,7 +52,7 @@ class HoleValue(enum.Enum): # aarch64-apple-darwin: "ARM64_RELOC_BRANCH26": "patch_aarch64_26r", "ARM64_RELOC_GOT_LOAD_PAGE21": "patch_aarch64_21rx", - "ARM64_RELOC_GOT_LOAD_PAGEOFF12": "patch_aarch64_12", + "ARM64_RELOC_GOT_LOAD_PAGEOFF12": "patch_aarch64_12x", "ARM64_RELOC_PAGE21": "patch_aarch64_21r", "ARM64_RELOC_PAGEOFF12": "patch_aarch64_12", "ARM64_RELOC_UNSIGNED": "patch_64", @@ -62,7 +62,7 @@ class HoleValue(enum.Enum): "IMAGE_REL_ARM64_BRANCH26": "patch_aarch64_26r", "IMAGE_REL_ARM64_PAGEBASE_REL21": "patch_aarch64_21rx", "IMAGE_REL_ARM64_PAGEOFFSET_12A": "patch_aarch64_12", - "IMAGE_REL_ARM64_PAGEOFFSET_12L": "patch_aarch64_12", + "IMAGE_REL_ARM64_PAGEOFFSET_12L": "patch_aarch64_12x", # i686-pc-windows-msvc: "IMAGE_REL_I386_DIR32": "patch_32", "IMAGE_REL_I386_REL32": "patch_x86_64_32rx", @@ -70,10 +70,10 @@ class HoleValue(enum.Enum): "R_AARCH64_ABS64": "patch_64", "R_AARCH64_ADD_ABS_LO12_NC": "patch_aarch64_12", "R_AARCH64_ADR_GOT_PAGE": "patch_aarch64_21rx", - "R_AARCH64_ADR_PREL_PG_HI21": "patch_aarch64_21rx", + "R_AARCH64_ADR_PREL_PG_HI21": "patch_aarch64_21r", "R_AARCH64_CALL26": "patch_aarch64_26r", "R_AARCH64_JUMP26": "patch_aarch64_26r", - "R_AARCH64_LD64_GOT_LO12_NC": "patch_aarch64_12", + "R_AARCH64_LD64_GOT_LO12_NC": "patch_aarch64_12x", "R_AARCH64_MOVW_UABS_G0_NC": "patch_aarch64_16a", "R_AARCH64_MOVW_UABS_G1_NC": "patch_aarch64_16b", "R_AARCH64_MOVW_UABS_G2_NC": "patch_aarch64_16c", @@ -141,7 +141,7 @@ def fold(self, other: typing.Self) -> typing.Self | None: and self.symbol == other.symbol and self.addend == other.addend and self.func == "patch_aarch64_21rx" - and other.func == "patch_aarch64_12" + and other.func == "patch_aarch64_12x" ): folded = self.replace() folded.func = "patch_aarch64_33rx" From bd570b51896beca998d69197cf9e401907c71a53 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Wed, 1 May 2024 13:10:55 -0700 Subject: [PATCH 11/18] Dedent --- Python/jit.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index 9da90a885f3f08..7237ae5232f886 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -402,19 +402,17 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz for (size_t i = 0; i < length; i++) { instruction_starts[i] += (uintptr_t)memory; } - // Loop again to emit the code: unsigned char *code = memory; unsigned char *data = memory + code_size; - { - // Compile the trampoline, which handles converting between the native - // calling convention and the calling convention used by jitted code - // (which may be different for efficiency reasons). On platforms where - // we don't change calling conventions, the trampoline is empty and - // nothing is emitted here: - emit_trampoline(code, data, executor, NULL, instruction_starts); - code += emitted_trampoline_code; - data += emitted_trampoline_data; - } + // Compile the trampoline, which handles converting between the native + // calling convention and the calling convention used by jitted code + // (which may be different for efficiency reasons). 
On platforms where + // we don't change calling conventions, the trampoline is empty and + // nothing is emitted here: + emit_trampoline(code, data, executor, NULL, instruction_starts); + code += emitted_trampoline_code; + data += emitted_trampoline_data; + // Loop again to emit the code: assert(trace[0].opcode == _START_EXECUTOR || trace[0].opcode == _COLD_EXIT); for (size_t i = 0; i < length; i++) { const _PyUOpInstruction *instruction = &trace[i]; From b2fd9d2b9d6595815e3b25a721cdbebbe3184ae6 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Wed, 1 May 2024 16:31:12 -0700 Subject: [PATCH 12/18] Use a single array of structs --- Python/jit.c | 43 ++++++++++++++++++++++++++----------------- Tools/jit/_writer.py | 34 ++++++++++++++++------------------ 2 files changed, 42 insertions(+), 35 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index 7237ae5232f886..c3c221e164ae1b 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -377,18 +377,24 @@ patch_x86_64_32rx(unsigned char *location, uint64_t value) int _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], size_t length) { + const StencilGroup *group; // Loop once to find the total compiled size: uintptr_t instruction_starts[UOP_MAX_TRACE_LENGTH]; - size_t code_size = emitted_trampoline_code; - size_t data_size = emitted_trampoline_data; + size_t code_size = 0; + size_t data_size = 0; + group = &trampoline; + code_size += group->code_size; + data_size += group->data_size; for (size_t i = 0; i < length; i++) { const _PyUOpInstruction *instruction = &trace[i]; + group = &stencil_groups[instruction->opcode]; instruction_starts[i] = code_size; - code_size += emitted[instruction->opcode][0]; - data_size += emitted[instruction->opcode][1]; + code_size += group->code_size; + data_size += group->data_size; } - code_size += emitted[_FATAL_ERROR][0]; - data_size += emitted[_FATAL_ERROR][1]; + group = &stencil_groups[_FATAL_ERROR]; + code_size += group->code_size; + data_size += group->data_size; // Round up to the nearest page: size_t page_size = get_page_size(); assert((page_size & (page_size - 1)) == 0); @@ -402,6 +408,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz for (size_t i = 0; i < length; i++) { instruction_starts[i] += (uintptr_t)memory; } + // Loop again to emit the code: unsigned char *code = memory; unsigned char *data = memory + code_size; // Compile the trampoline, which handles converting between the native @@ -409,21 +416,23 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz // (which may be different for efficiency reasons). 
On platforms where // we don't change calling conventions, the trampoline is empty and // nothing is emitted here: - emit_trampoline(code, data, executor, NULL, instruction_starts); - code += emitted_trampoline_code; - data += emitted_trampoline_data; - // Loop again to emit the code: + group = &trampoline; + group->emit(code, data, executor, NULL, instruction_starts); + code += group->code_size; + data += group->data_size; assert(trace[0].opcode == _START_EXECUTOR || trace[0].opcode == _COLD_EXIT); for (size_t i = 0; i < length; i++) { const _PyUOpInstruction *instruction = &trace[i]; - emitters[instruction->opcode](code, data, executor, instruction, instruction_starts); - code += emitted[instruction->opcode][0]; - data += emitted[instruction->opcode][1]; + group = &stencil_groups[instruction->opcode]; + group->emit(code, data, executor, instruction, instruction_starts); + code += group->code_size; + data += group->data_size; } // Protect against accidental buffer overrun into data: - emitters[_FATAL_ERROR](code, data, executor, NULL, instruction_starts); - code += emitted[_FATAL_ERROR][0]; - data += emitted[_FATAL_ERROR][1]; + group = &stencil_groups[_FATAL_ERROR]; + group->emit(code, data, executor, NULL, instruction_starts); + code += group->code_size; + data += group->data_size; assert(code == memory + code_size); assert(data == memory + code_size + data_size); if (mark_executable(memory, total_size)) { @@ -431,7 +440,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz return -1; } executor->jit_code = memory; - executor->jit_side_entry = memory + emitted_trampoline_code; + executor->jit_side_entry = memory + trampoline.code_size; executor->jit_size = total_size; return 0; } diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index 189c81ebb58142..be628b63569e2e 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -5,36 +5,34 @@ import _stencils +def _initialize_stencil_group(opname: str, group: _stencils.StencilGroup) -> str: + return f"{{emit_{opname}, {len(group.code.body)}, {len(group.data.body)}}}" def _dump_footer(groups: dict[str, _stencils.StencilGroup]) -> typing.Iterator[str]: - yield "typedef void (*emitter)(unsigned char *code, unsigned char *data," - yield " _PyExecutorObject *executor, const _PyUOpInstruction *instruction," - yield " uintptr_t instruction_starts[]);" + yield "typedef struct {" + yield " void (*emit)(" + yield " unsigned char *code, unsigned char *data, _PyExecutorObject *executor," + yield " const _PyUOpInstruction *instruction, uintptr_t instruction_starts[]);" + yield " size_t code_size;" + yield " size_t data_size;" + yield "} StencilGroup;" yield "" - yield "static const emitter emitters[MAX_UOP_ID + 1] = {" - for opname in sorted(groups): - if opname == "trampoline": - continue - yield f" [{opname}] = emit_{opname}," - yield "};" + initializer = _initialize_stencil_group('trampoline', groups['trampoline']) + yield f"static const StencilGroup trampoline = {initializer};" yield "" - yield "static const size_t emitted[MAX_UOP_ID + 1][2] = {" + yield "static const StencilGroup stencil_groups[MAX_UOP_ID + 1] = {" for opname, group in sorted(groups.items()): if opname == "trampoline": continue - yield f" [{opname}] = {{{len(group.code.body)}, {len(group.data.body)}}}," + yield f" [{opname}] = {_initialize_stencil_group(opname, group)}," yield "};" - yield "" - yield f"static const size_t emitted_trampoline_code = {len(groups['trampoline'].code.body)};" - yield f"static const size_t emitted_trampoline_data 
= {len(groups['trampoline'].data.body)};" - yield "" def _dump_stencil(opname: str, group: _stencils.StencilGroup) -> typing.Iterator[str]: yield "void" - yield f"emit_{opname}(unsigned char *code, unsigned char *data," - yield f" {' ' * len(opname)} _PyExecutorObject *executor, const _PyUOpInstruction *instruction," - yield f" {' ' * len(opname)} uintptr_t instruction_starts[])" + yield f"emit_{opname}(" + yield " unsigned char *code, unsigned char *data, _PyExecutorObject *executor," + yield " const _PyUOpInstruction *instruction,uintptr_t instruction_starts[])" yield "{" for part, stencil in [("code", group.code), ("data", group.data)]: for line in stencil.disassembly: From 7aa12a2c3e3d3e11cad07f59cecb35b9e45e63c2 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Wed, 1 May 2024 16:48:00 -0700 Subject: [PATCH 13/18] Move C initializer formation to StencilGroup --- Tools/jit/_stencils.py | 4 ++++ Tools/jit/_writer.py | 7 ++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index b04cf4a6a475a0..f0344d79203cb2 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -350,6 +350,10 @@ def _emit_global_offset_table(self) -> None: ) self.data.body.extend([0] * 8) + def as_c(self, opname: str) -> str: + """Dump this hole as a StencilGroup initializer.""" + return f"{{emit_{opname}, {len(self.code.body)}, {len(self.data.body)}}}" + def symbol_to_value(symbol: str) -> tuple[HoleValue, str | None]: """ diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index be628b63569e2e..1384d3b5b66c35 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -5,8 +5,6 @@ import _stencils -def _initialize_stencil_group(opname: str, group: _stencils.StencilGroup) -> str: - return f"{{emit_{opname}, {len(group.code.body)}, {len(group.data.body)}}}" def _dump_footer(groups: dict[str, _stencils.StencilGroup]) -> typing.Iterator[str]: yield "typedef struct {" @@ -17,14 +15,13 @@ def _dump_footer(groups: dict[str, _stencils.StencilGroup]) -> typing.Iterator[s yield " size_t data_size;" yield "} StencilGroup;" yield "" - initializer = _initialize_stencil_group('trampoline', groups['trampoline']) - yield f"static const StencilGroup trampoline = {initializer};" + yield f"static const StencilGroup trampoline = {groups['trampoline'].as_c('trampoline')};" yield "" yield "static const StencilGroup stencil_groups[MAX_UOP_ID + 1] = {" for opname, group in sorted(groups.items()): if opname == "trampoline": continue - yield f" [{opname}] = {_initialize_stencil_group(opname, group)}," + yield f" [{opname}] = {group.as_c(opname)}," yield "};" From 9ec64ac8f7db4d554a6fac97965e1e8dcc2341f2 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Wed, 1 May 2024 16:52:57 -0700 Subject: [PATCH 14/18] Add comment on why data is first --- Tools/jit/_writer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index 1384d3b5b66c35..ed261860c45708 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -40,6 +40,7 @@ def _dump_stencil(opname: str, group: _stencils.StencilGroup) -> typing.Iterator row = " ".join(f"{byte:#04x}," for byte in stencil.body[i : i + 8]) yield f" {row}" yield " };" + # Data is written first (so relaxations in the code work properly): for part, stencil in [("data", group.data), ("code", group.code)]: if stencil.body: yield f" memcpy({part}, {part}_body, sizeof({part}_body));" From eb0826f3d2cba5c785b07fbe0bc827984da4edc6 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Wed, 1 
May 2024 17:24:27 -0700 Subject: [PATCH 15/18] Silence warnings --- Python/jit.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index c3c221e164ae1b..8c648ec4166e0c 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -165,7 +165,7 @@ set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start, // - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/X86_64.cpp // 32-bit absolute address. -static inline void +void patch_32(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; @@ -175,7 +175,7 @@ patch_32(unsigned char *location, uint64_t value) } // 32-bit relative address. -static inline void +void patch_32r(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; @@ -187,7 +187,7 @@ patch_32r(unsigned char *location, uint64_t value) } // 64-bit absolute address. -static inline void +void patch_64(unsigned char *location, uint64_t value) { uint64_t *loc64 = (uint64_t *)location; @@ -196,7 +196,7 @@ patch_64(unsigned char *location, uint64_t value) // 12-bit low part of an absolute address. Pairs nicely with patch_aarch64_21r // (below). -static inline void +void patch_aarch64_12(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; @@ -214,14 +214,14 @@ patch_aarch64_12(unsigned char *location, uint64_t value) set_bits(loc32, 10, value, shift, 12); } -static inline void +void patch_aarch64_12x(unsigned char *location, uint64_t value) { patch_aarch64_12(location, value); } // 16-bit low part of an absolute address. -static inline void +void patch_aarch64_16a(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; @@ -232,7 +232,7 @@ patch_aarch64_16a(unsigned char *location, uint64_t value) } // 16-bit middle-low part of an absolute address. -static inline void +void patch_aarch64_16b(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; @@ -243,7 +243,7 @@ patch_aarch64_16b(unsigned char *location, uint64_t value) } // 16-bit middle-high part of an absolute address. -static inline void +void patch_aarch64_16c(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; @@ -254,7 +254,7 @@ patch_aarch64_16c(unsigned char *location, uint64_t value) } // 16-bit high part of an absolute address. -static inline void +void patch_aarch64_16d(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; @@ -266,7 +266,7 @@ patch_aarch64_16d(unsigned char *location, uint64_t value) // 21-bit count of pages between this page and an absolute address's page... I // know, I know, it's weird. Pairs nicely with patch_aarch64_12 (above). -static inline void +void patch_aarch64_21r(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; @@ -280,14 +280,14 @@ patch_aarch64_21r(unsigned char *location, uint64_t value) set_bits(loc32, 5, value, 2, 19); } -static inline void +void patch_aarch64_21rx(unsigned char *location, uint64_t value) { patch_aarch64_21r(location, value); } // 28-bit relative branch. -static inline void +void patch_aarch64_26r(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; @@ -302,7 +302,7 @@ patch_aarch64_26r(unsigned char *location, uint64_t value) } // A pair of patch_aarch64_21rx and patch_aarch64_12x. 
-static inline void +void patch_aarch64_33rx(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; @@ -341,7 +341,7 @@ patch_aarch64_33rx(unsigned char *location, uint64_t value) } // 32-bit relative address. -static inline void +void patch_x86_64_32rx(unsigned char *location, uint64_t value) { uint8_t *loc8 = (uint8_t *)location; From a04d7f84bedd6cd4ddf46469247138e0c93d6bc0 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Thu, 2 May 2024 08:42:25 -0700 Subject: [PATCH 16/18] Add missing space --- Tools/jit/_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index ed261860c45708..9d11094f85c7ff 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -29,7 +29,7 @@ def _dump_stencil(opname: str, group: _stencils.StencilGroup) -> typing.Iterator yield "void" yield f"emit_{opname}(" yield " unsigned char *code, unsigned char *data, _PyExecutorObject *executor," - yield " const _PyUOpInstruction *instruction,uintptr_t instruction_starts[])" + yield " const _PyUOpInstruction *instruction, uintptr_t instruction_starts[])" yield "{" for part, stencil in [("code", group.code), ("data", group.data)]: for line in stencil.disassembly: From b919fcca41af3362adbc940ec8a3c56ae977d7c3 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Fri, 3 May 2024 14:30:23 -0700 Subject: [PATCH 17/18] Clarify which patch functions are relaxing (and what that means) --- Python/jit.c | 22 ++++++++++++++++++++-- Tools/jit/_stencils.py | 6 ++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index 8c648ec4166e0c..7c316a410dda6a 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -164,6 +164,11 @@ set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start, // - x86_64-unknown-linux-gnu: // - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/X86_64.cpp +// Many of these patches are "relaxing", meaning that they can rewrite the +// code they're patching to be more efficient (like turning a 64-bit memory +// load into a 32-bit immediate load). These patches have an "x" in their name. +// Relative patches have an "r" in their name. + // 32-bit absolute address. void patch_32(unsigned char *location, uint64_t value) @@ -214,9 +219,15 @@ patch_aarch64_12(unsigned char *location, uint64_t value) set_bits(loc32, 10, value, shift, 12); } +// Relaxable 12-bit low part of an absolute address. Pairs nicely with +// patch_aarch64_21rx (below). void patch_aarch64_12x(unsigned char *location, uint64_t value) { + // This can *only* be relaxed if it occurs immediately before a matching + // patch_aarch64_21rx. If that happens, the JIT build step will replace both + // calls with a single call to patch_aarch64_33rx. Otherwise, we end up + // here, and the instruction is patched normally: patch_aarch64_12(location, value); } @@ -280,9 +291,15 @@ patch_aarch64_21r(unsigned char *location, uint64_t value) set_bits(loc32, 5, value, 2, 19); } +// Relaxable 21-bit count of pages between this page and an absolute address's +// page. Pairs nicely with patch_aarch64_12x (above). void patch_aarch64_21rx(unsigned char *location, uint64_t value) { + // This can *only* be relaxed if it occurs immediately before a matching + // patch_aarch64_12x. If that happens, the JIT build step will replace both + // calls with a single call to patch_aarch64_33rx. 
Otherwise, we end up + // here, and the instruction is patched normally: patch_aarch64_21r(location, value); } @@ -306,8 +323,8 @@ void patch_aarch64_33rx(unsigned char *location, uint64_t value) { uint32_t *loc32 = (uint32_t *)location; - assert(IS_AARCH64_ADRP(*loc32)); // Try to relax the pair of GOT loads into an immediate value: + assert(IS_AARCH64_ADRP(*loc32)); unsigned char reg = get_bits(loc32[0], 0, 5); assert(IS_AARCH64_LDR_OR_STR(loc32[1])); // There should be only one register involved: @@ -336,11 +353,12 @@ patch_aarch64_33rx(unsigned char *location, uint64_t value) loc32[1] = 0xD503201F; return; } + // Couldn't do it. Just patch the two instructions normally: patch_aarch64_21rx(location, value); patch_aarch64_12x(location + 4, value); } -// 32-bit relative address. +// Relaxable 32-bit relative address. void patch_x86_64_32rx(unsigned char *location, uint64_t value) { diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index f0344d79203cb2..fc48aceeff219f 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -48,6 +48,9 @@ class HoleValue(enum.Enum): ZERO = enum.auto() +# Map relocation types to our JIT's patch functions. "r" suffixes indicate that +# the patch function is relative. "x" suffixes indicate that they are "relaxing" +# (see comments in jit.c for more info): _PATCH_FUNCS = { # aarch64-apple-darwin: "ARM64_RELOC_BRANCH26": "patch_aarch64_26r", @@ -91,6 +94,7 @@ class HoleValue(enum.Enum): "X86_64_RELOC_SIGNED": "patch_32r", "X86_64_RELOC_UNSIGNED": "patch_64", } +# Translate HoleValues to C expressions: _HOLE_EXPRS = { HoleValue.CODE: "(uintptr_t)code", HoleValue.CONTINUE: "(uintptr_t)code + sizeof(code_body)", @@ -143,6 +147,8 @@ def fold(self, other: typing.Self) -> typing.Self | None: and self.func == "patch_aarch64_21rx" and other.func == "patch_aarch64_12x" ): + # These can *only* be properly relaxed when they appear together and + # patch the same value: folded = self.replace() folded.func = "patch_aarch64_33rx" return folded From 46adf09dcce8c464a43d44006d9e7bace30c17f8 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Fri, 3 May 2024 14:31:39 -0700 Subject: [PATCH 18/18] Exaplain why GOT is commented out --- Tools/jit/_stencils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index fc48aceeff219f..6e046df3026ae9 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -100,6 +100,7 @@ class HoleValue(enum.Enum): HoleValue.CONTINUE: "(uintptr_t)code + sizeof(code_body)", HoleValue.DATA: "(uintptr_t)data", HoleValue.EXECUTOR: "(uintptr_t)executor", + # These should all have been turned into DATA values by process_relocations: # HoleValue.GOT: "", HoleValue.OPARG: "instruction->oparg", HoleValue.OPERAND: "instruction->operand",
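Taken together, the series replaces the old data-driven stencils with generated emit_* functions plus a stencil_groups table of function pointers and sizes, all following the same copy-then-patch shape. A toy, self-contained version of that shape (every name, byte, and offset below is invented; this is not the generated jit_stencils.h):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    // Stand-in for the real patch_64 (which stores through a uint64_t *):
    static void
    patch_64(unsigned char *location, uint64_t value)
    {
        memcpy(location, &value, sizeof(value));
    }

    // Simplified StencilGroup: the real one also tracks data_size, and its
    // emit function takes the executor, instruction, and instruction_starts:
    typedef struct {
        void (*emit)(unsigned char *code, uint64_t operand);
        size_t code_size;
    } ToyStencilGroup;

    static void
    emit_example(unsigned char *code, uint64_t operand)
    {
        static const unsigned char code_body[] = {
            0x48, 0xB8, 0, 0, 0, 0, 0, 0, 0, 0,  // movabs rax, <imm64>
            0xC3,                                // ret
        };
        memcpy(code, code_body, sizeof(code_body));
        patch_64(code + 0x2, operand);  // fill the 64-bit "hole"
    }

    static const ToyStencilGroup example = {emit_example, 11};

    int
    main(void)
    {
        unsigned char buffer[32];
        example.emit(buffer, 0x1122334455667788);
        for (size_t i = 0; i < example.code_size; i++) {
            printf("%02x ", buffer[i]);
        }
        printf("\n");
        return 0;
    }

_PyJIT_Compile sums code_size and data_size over the trace, allocates one buffer, and calls each group's emit in turn, which is roughly the loop that the "Use a single array of structs" commit switches over to stencil_groups[].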