diff --git a/Include/internal/pycore_function.h b/Include/internal/pycore_function.h index e844d323ec7927..3f3da8a44b77e4 100644 --- a/Include/internal/pycore_function.h +++ b/Include/internal/pycore_function.h @@ -16,13 +16,22 @@ extern PyObject* _PyFunction_Vectorcall( #define FUNC_MAX_WATCHERS 8 +#define FUNC_VERSION_CACHE_SIZE (1<<12) /* Must be a power of 2 */ struct _py_func_state { uint32_t next_version; + // Borrowed references to function objects whose + // func_version % FUNC_VERSION_CACHE_SIZE + // once was equal to the index in the table. + // They are cleared when the function is deallocated. + PyFunctionObject *func_version_cache[FUNC_VERSION_CACHE_SIZE]; }; extern PyFunctionObject* _PyFunction_FromConstructor(PyFrameConstructor *constr); extern uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func); +extern void _PyFunction_SetVersion(PyFunctionObject *func, uint32_t version); +PyFunctionObject *_PyFunction_LookupByVersion(uint32_t version); + extern PyObject *_Py_set_function_type_params( PyThreadState* unused, PyObject *func, PyObject *type_params); diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index 9f4437c09e92cb..7944a67e8d5467 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -52,9 +52,14 @@ #define _ITER_CHECK_RANGE 328 #define _IS_ITER_EXHAUSTED_RANGE 329 #define _ITER_NEXT_RANGE 330 -#define _POP_JUMP_IF_FALSE 331 -#define _POP_JUMP_IF_TRUE 332 -#define JUMP_TO_TOP 333 +#define _CHECK_PEP_523 331 +#define _CHECK_FUNCTION_EXACT_ARGS 332 +#define _CHECK_STACK_SPACE 333 +#define _INIT_CALL_PY_EXACT_ARGS 334 +#define _PUSH_FRAME 335 +#define _POP_JUMP_IF_FALSE 336 +#define _POP_JUMP_IF_TRUE 337 +#define JUMP_TO_TOP 338 #ifndef NEED_OPCODE_METADATA extern int _PyOpcode_num_popped(int opcode, int oparg, bool jump); @@ -951,6 +956,7 @@ _PyOpcode_num_pushed(int opcode, int oparg, bool jump) { enum InstructionFormat { 
INSTR_FMT_IB, INSTR_FMT_IBC, + INSTR_FMT_IBC0, INSTR_FMT_IBC00, INSTR_FMT_IBC000, INSTR_FMT_IBC00000000, @@ -995,6 +1001,7 @@ struct opcode_macro_expansion { #define OPARG_CACHE_4 4 #define OPARG_TOP 5 #define OPARG_BOTTOM 6 +#define OPARG_SAVE_IP 7 #define OPCODE_METADATA_FMT(OP) (_PyOpcode_opcode_metadata[(OP)].instr_format) #define SAME_OPCODE_METADATA(OP1, OP2) \ @@ -1230,6 +1237,7 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[OPCODE_METADATA_SIZE] = { }; const struct opcode_macro_expansion _PyOpcode_macro_expansion[OPCODE_MACRO_EXPANSION_SIZE] = { [NOP] = { .nuops = 1, .uops = { { NOP, 0, 0 } } }, + [RESUME] = { .nuops = 1, .uops = { { RESUME, 0, 0 } } }, [LOAD_FAST_CHECK] = { .nuops = 1, .uops = { { LOAD_FAST_CHECK, 0, 0 } } }, [LOAD_FAST] = { .nuops = 1, .uops = { { LOAD_FAST, 0, 0 } } }, [LOAD_FAST_AND_CLEAR] = { .nuops = 1, .uops = { { LOAD_FAST_AND_CLEAR, 0, 0 } } }, @@ -1336,6 +1344,7 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[OPCODE_MACRO_EXPAN [GET_YIELD_FROM_ITER] = { .nuops = 1, .uops = { { GET_YIELD_FROM_ITER, 0, 0 } } }, [WITH_EXCEPT_START] = { .nuops = 1, .uops = { { WITH_EXCEPT_START, 0, 0 } } }, [PUSH_EXC_INFO] = { .nuops = 1, .uops = { { PUSH_EXC_INFO, 0, 0 } } }, + [CALL_PY_EXACT_ARGS] = { .nuops = 6, .uops = { { _CHECK_PEP_523, 0, 0 }, { _CHECK_FUNCTION_EXACT_ARGS, 2, 1 }, { _CHECK_STACK_SPACE, 0, 0 }, { _INIT_CALL_PY_EXACT_ARGS, 0, 0 }, { SAVE_IP, 7, 3 }, { _PUSH_FRAME, 0, 0 } } }, [CALL_NO_KW_TYPE_1] = { .nuops = 1, .uops = { { CALL_NO_KW_TYPE_1, 0, 0 } } }, [CALL_NO_KW_STR_1] = { .nuops = 1, .uops = { { CALL_NO_KW_STR_1, 0, 0 } } }, [CALL_NO_KW_TUPLE_1] = { .nuops = 1, .uops = { { CALL_NO_KW_TUPLE_1, 0, 0 } } }, @@ -1389,6 +1398,11 @@ const char * const _PyOpcode_uop_name[OPCODE_UOP_NAME_SIZE] = { [_ITER_CHECK_RANGE] = "_ITER_CHECK_RANGE", [_IS_ITER_EXHAUSTED_RANGE] = "_IS_ITER_EXHAUSTED_RANGE", [_ITER_NEXT_RANGE] = "_ITER_NEXT_RANGE", + [_CHECK_PEP_523] = "_CHECK_PEP_523", + [_CHECK_FUNCTION_EXACT_ARGS] = 
"_CHECK_FUNCTION_EXACT_ARGS", + [_CHECK_STACK_SPACE] = "_CHECK_STACK_SPACE", + [_INIT_CALL_PY_EXACT_ARGS] = "_INIT_CALL_PY_EXACT_ARGS", + [_PUSH_FRAME] = "_PUSH_FRAME", [_POP_JUMP_IF_FALSE] = "_POP_JUMP_IF_FALSE", [_POP_JUMP_IF_TRUE] = "_POP_JUMP_IF_TRUE", [JUMP_TO_TOP] = "JUMP_TO_TOP", diff --git a/Include/internal/pycore_uops.h b/Include/internal/pycore_uops.h index 57a5970353b360..254eeca2361bea 100644 --- a/Include/internal/pycore_uops.h +++ b/Include/internal/pycore_uops.h @@ -8,7 +8,7 @@ extern "C" { # error "this header requires Py_BUILD_CORE define" #endif -#define _Py_UOP_MAX_TRACE_LENGTH 32 +#define _Py_UOP_MAX_TRACE_LENGTH 64 typedef struct { uint32_t opcode; diff --git a/Lib/test/test_capi/test_misc.py b/Lib/test/test_capi/test_misc.py index 001d37de8e0eb3..ea4c4ffc3a8f11 100644 --- a/Lib/test/test_capi/test_misc.py +++ b/Lib/test/test_capi/test_misc.py @@ -2618,6 +2618,24 @@ def testfunc(it): with self.assertRaises(StopIteration): next(it) + def test_call_py_exact_args(self): + def testfunc(n): + def dummy(x): + return x+1 + for i in range(n): + dummy(i) + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + testfunc(10) + + ex = get_first_executor(testfunc) + self.assertIsNotNone(ex) + uops = {opname for opname, _, _ in ex} + self.assertIn("_PUSH_FRAME", uops) + self.assertIn("_BINARY_OP_ADD_INT", uops) + + if __name__ == "__main__": unittest.main() diff --git a/Objects/funcobject.c b/Objects/funcobject.c index 8c0bface3ac710..33191d23f18230 100644 --- a/Objects/funcobject.c +++ b/Objects/funcobject.c @@ -223,7 +223,73 @@ PyFunction_NewWithQualName(PyObject *code, PyObject *globals, PyObject *qualname return NULL; } -uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func) +/* +Function versions +----------------- + +Function versions are used to detect when a function object has been +updated, invalidating inline cache data used by the `CALL` bytecode +(notably `CALL_PY_EXACT_ARGS` and a few other `CALL` 
specializations). + +They are also used by the Tier 2 superblock creation code to find +the function being called (and from there the code object). + +How does a function's `func_version` field get initialized? + +- `PyFunction_New` and friends initialize it to 0. +- The `MAKE_FUNCTION` instruction sets it from the code's `co_version`. +- It is reset to 0 when various attributes like `__code__` are set. +- A new version is allocated by `_PyFunction_GetVersionForCurrentState` + when the specializer needs a version and the version is 0. + +The latter allocates versions using a counter in the interpreter state; +when the counter wraps around to 0, no more versions are allocated. +There is one other special case: functions with a non-standard +`vectorcall` field are not given a version. + +When the function version is 0, the `CALL` bytecode is not specialized. + +Code object versions +-------------------- + +So where do code objects get their `co_version`? There is a single +static global counter, `_Py_next_func_version`. This is initialized in +the generated (!) file `Python/deepfreeze/deepfreeze.c`, to 1 plus the +number of deep-frozen function objects in that file. +(In `_bootstrap_python.c` and `freeze_module.c` it is initialized to 1.) + +Code objects get a new `co_version` allocated from this counter upon +creation. Since code objects are nominally immutable, `co_version` can +not be invalidated. The only way it can be 0 is when 2**32 or more +code objects have been created during the process's lifetime. +(The counter isn't reset by `fork()`, extending the lifetime.) 
+*/ + +void +_PyFunction_SetVersion(PyFunctionObject *func, uint32_t version) +{ + func->func_version = version; + if (version != 0) { + PyInterpreterState *interp = _PyInterpreterState_GET(); + interp->func_state.func_version_cache[ + version % FUNC_VERSION_CACHE_SIZE] = func; + } +} + +PyFunctionObject * +_PyFunction_LookupByVersion(uint32_t version) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + PyFunctionObject *func = interp->func_state.func_version_cache[ + version % FUNC_VERSION_CACHE_SIZE]; + if (func != NULL && func->func_version == version) { + return (PyFunctionObject *)Py_NewRef(func); + } + return NULL; +} + +uint32_t +_PyFunction_GetVersionForCurrentState(PyFunctionObject *func) { if (func->func_version != 0) { return func->func_version; @@ -236,7 +302,7 @@ uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func) return 0; } uint32_t v = interp->func_state.next_version++; - func->func_version = v; + _PyFunction_SetVersion(func, v); return v; } @@ -851,6 +917,15 @@ func_dealloc(PyFunctionObject *op) if (op->func_weakreflist != NULL) { PyObject_ClearWeakRefs((PyObject *) op); } + if (op->func_version != 0) { + PyInterpreterState *interp = _PyInterpreterState_GET(); + PyFunctionObject **slot = + interp->func_state.func_version_cache + + (op->func_version % FUNC_VERSION_CACHE_SIZE); + if (*slot == op) { + *slot = NULL; + } + } (void)func_clear(op); // These aren't cleared by func_clear(). 
Py_DECREF(op->func_code); diff --git a/Python/bytecodes.c b/Python/bytecodes.c index b2281abc6663da..536ee09af9798a 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -135,6 +135,7 @@ dummy_func( } inst(RESUME, (--)) { + #if TIER_ONE assert(tstate->cframe == &cframe); assert(frame == cframe.current_frame); /* Possibly combine this with eval breaker */ @@ -143,7 +144,9 @@ dummy_func( ERROR_IF(err, error); next_instr--; } - else if (oparg < 2) { + else + #endif + if (oparg < 2) { CHECK_EVAL_BREAKER(); } } @@ -957,13 +960,13 @@ dummy_func( { PyGenObject *gen = (PyGenObject *)receiver; _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; STACK_SHRINK(1); _PyFrame_StackPush(gen_frame, v); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; tstate->exc_info = &gen->gi_exc_state; SKIP_OVER(INLINE_CACHE_ENTRIES_SEND); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } if (Py_IsNone(v) && PyIter_Check(receiver)) { @@ -996,13 +999,13 @@ dummy_func( DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, SEND); STAT_INC(SEND, hit); _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; STACK_SHRINK(1); _PyFrame_StackPush(gen_frame, v); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; tstate->exc_info = &gen->gi_exc_state; SKIP_OVER(INLINE_CACHE_ENTRIES_SEND); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } @@ -2586,7 +2589,6 @@ dummy_func( DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, FOR_ITER); STAT_INC(FOR_ITER, hit); _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; _PyFrame_StackPush(gen_frame, Py_None); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; @@ -2594,6 +2596,7 @@ dummy_func( SKIP_OVER(INLINE_CACHE_ENTRIES_FOR_ITER); assert(next_instr[oparg].op.code == 
END_FOR || next_instr[oparg].op.code == INSTRUMENTED_END_FOR); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } @@ -2944,32 +2947,70 @@ dummy_func( GO_TO_INSTRUCTION(CALL_PY_EXACT_ARGS); } - inst(CALL_PY_EXACT_ARGS, (unused/1, func_version/2, callable, self_or_null, args[oparg] -- unused)) { - ASSERT_KWNAMES_IS_NULL(); + op(_CHECK_PEP_523, (--)) { DEOPT_IF(tstate->interp->eval_frame, CALL); - int argcount = oparg; - if (self_or_null != NULL) { - args--; - argcount++; - } + } + + op(_CHECK_FUNCTION_EXACT_ARGS, (func_version/2, callable, self_or_null, unused[oparg] -- callable, self_or_null, unused[oparg])) { + ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(!PyFunction_Check(callable), CALL); PyFunctionObject *func = (PyFunctionObject *)callable; DEOPT_IF(func->func_version != func_version, CALL); PyCodeObject *code = (PyCodeObject *)func->func_code; - DEOPT_IF(code->co_argcount != argcount, CALL); + DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL); + } + + op(_CHECK_STACK_SPACE, (callable, unused, unused[oparg] -- callable, unused, unused[oparg])) { + PyFunctionObject *func = (PyFunctionObject *)callable; + PyCodeObject *code = (PyCodeObject *)func->func_code; DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); + } + + op(_INIT_CALL_PY_EXACT_ARGS, (callable, self_or_null, args[oparg] -- new_frame: _PyInterpreterFrame*)) { + int argcount = oparg; + if (self_or_null != NULL) { + args--; + argcount++; + } STAT_INC(CALL, hit); - _PyInterpreterFrame *new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); + PyFunctionObject *func = (PyFunctionObject *)callable; + new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); for (int i = 0; i < argcount; i++) { new_frame->localsplus[i] = args[i]; } - // Manipulate stack directly since we leave using DISPATCH_INLINED(). 
- STACK_SHRINK(oparg + 2); - SKIP_OVER(INLINE_CACHE_ENTRIES_CALL); + } + + // The 'unused' output effect represents the return value + // (which will be pushed when the frame returns). + // It is needed so CALL_PY_EXACT_ARGS matches its family. + op(_PUSH_FRAME, (new_frame: _PyInterpreterFrame* -- unused)) { + // Write it out explicitly because it's subtly different. + // Eventually this should be the only occurrence of this code. frame->return_offset = 0; - DISPATCH_INLINED(new_frame); + assert(tstate->interp->eval_frame == NULL); + SAVE_FRAME_STATE(); // Signals to the code generator + new_frame->previous = frame; + CALL_STAT_INC(inlined_py_calls); + #if TIER_ONE + frame = cframe.current_frame = new_frame; + goto start_frame; + #endif + #if TIER_TWO + frame = tstate->cframe->current_frame = new_frame; + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif } + macro(CALL_PY_EXACT_ARGS) = + unused/1 + // Skip over the counter + _CHECK_PEP_523 + + _CHECK_FUNCTION_EXACT_ARGS + + _CHECK_STACK_SPACE + + _INIT_CALL_PY_EXACT_ARGS + + SAVE_IP + // Tier 2 only; special-cased oparg + _PUSH_FRAME; + inst(CALL_PY_WITH_DEFAULTS, (unused/1, func_version/2, callable, self_or_null, args[oparg] -- unused)) { ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(tstate->interp->eval_frame, CALL); @@ -3504,7 +3545,8 @@ dummy_func( goto error; } - func_obj->func_version = ((PyCodeObject *)codeobj)->co_version; + _PyFunction_SetVersion( + func_obj, ((PyCodeObject *)codeobj)->co_version); func = (PyObject *)func_obj; } diff --git a/Python/ceval.c b/Python/ceval.c index b966399a342d08..2370636d765d9c 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -770,6 +770,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int #endif { +#define TIER_ONE 1 #include "generated_cases.c.h" /* INSTRUMENTED_LINE has to be here, rather than in bytecodes.c, diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h 
index 8dc8b754485856..77f760f0bb5995 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -103,11 +103,16 @@ DISPATCH_GOTO(); \ } +#define SAVE_FRAME_STATE() \ + do { \ + frame->prev_instr = next_instr - 1; \ + _PyFrame_SetStackPointer(frame, stack_pointer); \ + } while (0) + #define DISPATCH_INLINED(NEW_FRAME) \ do { \ assert(tstate->interp->eval_frame == NULL); \ - _PyFrame_SetStackPointer(frame, stack_pointer); \ - frame->prev_instr = next_instr - 1; \ + SAVE_FRAME_STATE(); \ (NEW_FRAME)->previous = frame; \ frame = cframe.current_frame = (NEW_FRAME); \ CALL_STAT_INC(inlined_py_calls); \ diff --git a/Python/executor.c b/Python/executor.c index 4a18618c0c6c0c..c18ba6d4ade3e5 100644 --- a/Python/executor.c +++ b/Python/executor.c @@ -30,6 +30,14 @@ #undef ENABLE_SPECIALIZATION #define ENABLE_SPECIALIZATION 0 +#undef SAVE_FRAME_STATE +#define SAVE_FRAME_STATE() \ + do { \ + /* Assume preceding SAVE_IP has set frame->prev_instr */ \ + frame->prev_instr--; \ + _PyFrame_SetStackPointer(frame, stack_pointer); \ + } while (0) + _PyInterpreterFrame * _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject **stack_pointer) @@ -81,6 +89,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject OBJECT_STAT_INC(optimization_uops_executed); switch (opcode) { +#define TIER_TWO 2 #include "executor_cases.c.h" default: diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index d6d541a3b61ab4..ba93ef0fac3753 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -7,6 +7,24 @@ break; } + case RESUME: { + #if TIER_ONE + assert(tstate->cframe == &cframe); + assert(frame == cframe.current_frame); + /* Possibly combine this with eval breaker */ + if (_PyFrame_GetCode(frame)->_co_instrumentation_version != tstate->interp->monitoring_version) { + int err = _Py_Instrument(_PyFrame_GetCode(frame), tstate->interp); + if (err) goto error; + next_instr--; + } + else + #endif + if (oparg < 2) { + 
CHECK_EVAL_BREAKER(); + } + break; + } + case LOAD_FAST_CHECK: { PyObject *value; value = GETLOCAL(oparg); @@ -103,7 +121,6 @@ } case TO_BOOL: { - static_assert(INLINE_CACHE_ENTRIES_TO_BOOL == 3, "incorrect cache size"); PyObject *value; PyObject *res; value = stack_pointer[-1]; @@ -363,7 +380,6 @@ } case BINARY_SUBSCR: { - static_assert(INLINE_CACHE_ENTRIES_BINARY_SUBSCR == 1, "incorrect cache size"); PyObject *sub; PyObject *container; PyObject *res; @@ -557,7 +573,6 @@ } case STORE_SUBSCR: { - static_assert(INLINE_CACHE_ENTRIES_STORE_SUBSCR == 1, "incorrect cache size"); PyObject *sub; PyObject *container; PyObject *v; @@ -862,7 +877,6 @@ } case UNPACK_SEQUENCE: { - static_assert(INLINE_CACHE_ENTRIES_UNPACK_SEQUENCE == 1, "incorrect cache size"); PyObject *seq; seq = stack_pointer[-1]; #if ENABLE_SPECIALIZATION @@ -950,7 +964,6 @@ } case STORE_ATTR: { - static_assert(INLINE_CACHE_ENTRIES_STORE_ATTR == 4, "incorrect cache size"); PyObject *owner; PyObject *v; owner = stack_pointer[-1]; @@ -1061,7 +1074,6 @@ } case LOAD_GLOBAL: { - static_assert(INLINE_CACHE_ENTRIES_LOAD_GLOBAL == 4, "incorrect cache size"); PyObject *res; PyObject *null = NULL; #if ENABLE_SPECIALIZATION @@ -1554,7 +1566,6 @@ } case LOAD_ATTR: { - static_assert(INLINE_CACHE_ENTRIES_LOAD_ATTR == 9, "incorrect cache size"); PyObject *owner; PyObject *attr; PyObject *self_or_null = NULL; @@ -1648,7 +1659,6 @@ } case COMPARE_OP: { - static_assert(INLINE_CACHE_ENTRIES_COMPARE_OP == 1, "incorrect cache size"); PyObject *right; PyObject *left; PyObject *res; @@ -2153,6 +2163,83 @@ break; } + case _CHECK_PEP_523: { + DEOPT_IF(tstate->interp->eval_frame, CALL); + break; + } + + case _CHECK_FUNCTION_EXACT_ARGS: { + PyObject *self_or_null; + PyObject *callable; + self_or_null = stack_pointer[-1 - oparg]; + callable = stack_pointer[-2 - oparg]; + uint32_t func_version = (uint32_t)operand; + ASSERT_KWNAMES_IS_NULL(); + DEOPT_IF(!PyFunction_Check(callable), CALL); + PyFunctionObject *func = (PyFunctionObject 
*)callable; + DEOPT_IF(func->func_version != func_version, CALL); + PyCodeObject *code = (PyCodeObject *)func->func_code; + DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL); + break; + } + + case _CHECK_STACK_SPACE: { + PyObject *callable; + callable = stack_pointer[-2 - oparg]; + PyFunctionObject *func = (PyFunctionObject *)callable; + PyCodeObject *code = (PyCodeObject *)func->func_code; + DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); + break; + } + + case _INIT_CALL_PY_EXACT_ARGS: { + PyObject **args; + PyObject *self_or_null; + PyObject *callable; + _PyInterpreterFrame *new_frame; + args = stack_pointer - oparg; + self_or_null = stack_pointer[-1 - oparg]; + callable = stack_pointer[-2 - oparg]; + int argcount = oparg; + if (self_or_null != NULL) { + args--; + argcount++; + } + STAT_INC(CALL, hit); + PyFunctionObject *func = (PyFunctionObject *)callable; + new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); + for (int i = 0; i < argcount; i++) { + new_frame->localsplus[i] = args[i]; + } + STACK_SHRINK(oparg); + STACK_SHRINK(1); + stack_pointer[-1] = (PyObject *)new_frame; + break; + } + + case _PUSH_FRAME: { + _PyInterpreterFrame *new_frame; + new_frame = (_PyInterpreterFrame *)stack_pointer[-1]; + STACK_SHRINK(1); + // Write it out explicitly because it's subtly different. + // Eventually this should be the only occurrence of this code. 
+ frame->return_offset = 0; + assert(tstate->interp->eval_frame == NULL); + SAVE_FRAME_STATE(); // Signals to the code generator + new_frame->previous = frame; + CALL_STAT_INC(inlined_py_calls); + #if TIER_ONE + frame = cframe.current_frame = new_frame; + goto start_frame; + #endif + #if TIER_TWO + frame = tstate->cframe->current_frame = new_frame; + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif + break; + } + case CALL_NO_KW_TYPE_1: { PyObject **args; PyObject *null; @@ -2536,7 +2623,8 @@ goto error; } - func_obj->func_version = ((PyCodeObject *)codeobj)->co_version; + _PyFunction_SetVersion( + func_obj, ((PyCodeObject *)codeobj)->co_version); func = (PyObject *)func_obj; stack_pointer[-1] = func; break; @@ -2654,7 +2742,6 @@ } case BINARY_OP: { - static_assert(INLINE_CACHE_ENTRIES_BINARY_OP == 1, "incorrect cache size"); PyObject *rhs; PyObject *lhs; PyObject *res; diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index cf20b869b8182f..77ea51388617e8 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -8,6 +8,7 @@ } TARGET(RESUME) { + #if TIER_ONE assert(tstate->cframe == &cframe); assert(frame == cframe.current_frame); /* Possibly combine this with eval breaker */ @@ -16,7 +17,9 @@ if (err) goto error; next_instr--; } - else if (oparg < 2) { + else + #endif + if (oparg < 2) { CHECK_EVAL_BREAKER(); } DISPATCH(); @@ -1191,13 +1194,13 @@ { PyGenObject *gen = (PyGenObject *)receiver; _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; STACK_SHRINK(1); _PyFrame_StackPush(gen_frame, v); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; tstate->exc_info = &gen->gi_exc_state; SKIP_OVER(INLINE_CACHE_ENTRIES_SEND); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } if (Py_IsNone(v) && PyIter_Check(receiver)) { @@ -1237,13 +1240,13 @@ 
DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, SEND); STAT_INC(SEND, hit); _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; STACK_SHRINK(1); _PyFrame_StackPush(gen_frame, v); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; tstate->exc_info = &gen->gi_exc_state; SKIP_OVER(INLINE_CACHE_ENTRIES_SEND); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } @@ -3341,7 +3344,6 @@ DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, FOR_ITER); STAT_INC(FOR_ITER, hit); _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; _PyFrame_StackPush(gen_frame, Py_None); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; @@ -3349,6 +3351,7 @@ SKIP_OVER(INLINE_CACHE_ENTRIES_FOR_ITER); assert(next_instr[oparg].op.code == END_FOR || next_instr[oparg].op.code == INSTRUMENTED_END_FOR); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); STACK_GROW(1); } @@ -3758,38 +3761,72 @@ TARGET(CALL_PY_EXACT_ARGS) { PREDICTED(CALL_PY_EXACT_ARGS); - PyObject **args; PyObject *self_or_null; PyObject *callable; - args = stack_pointer - oparg; + PyObject **args; + _PyInterpreterFrame *new_frame; + // _CHECK_PEP_523 + { + DEOPT_IF(tstate->interp->eval_frame, CALL); + } + // _CHECK_FUNCTION_EXACT_ARGS self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; - uint32_t func_version = read_u32(&next_instr[1].cache); - ASSERT_KWNAMES_IS_NULL(); - DEOPT_IF(tstate->interp->eval_frame, CALL); - int argcount = oparg; - if (self_or_null != NULL) { - args--; - argcount++; + { + uint32_t func_version = read_u32(&next_instr[1].cache); + ASSERT_KWNAMES_IS_NULL(); + DEOPT_IF(!PyFunction_Check(callable), CALL); + PyFunctionObject *func = (PyFunctionObject *)callable; + DEOPT_IF(func->func_version != func_version, CALL); + PyCodeObject *code = (PyCodeObject *)func->func_code; + 
DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL); + } + // _CHECK_STACK_SPACE + callable = stack_pointer[-2 - oparg]; + { + PyFunctionObject *func = (PyFunctionObject *)callable; + PyCodeObject *code = (PyCodeObject *)func->func_code; + DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); } - DEOPT_IF(!PyFunction_Check(callable), CALL); - PyFunctionObject *func = (PyFunctionObject *)callable; - DEOPT_IF(func->func_version != func_version, CALL); - PyCodeObject *code = (PyCodeObject *)func->func_code; - DEOPT_IF(code->co_argcount != argcount, CALL); - DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); - STAT_INC(CALL, hit); - _PyInterpreterFrame *new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); - for (int i = 0; i < argcount; i++) { - new_frame->localsplus[i] = args[i]; + // _INIT_CALL_PY_EXACT_ARGS + args = stack_pointer - oparg; + self_or_null = stack_pointer[-1 - oparg]; + callable = stack_pointer[-2 - oparg]; + { + int argcount = oparg; + if (self_or_null != NULL) { + args--; + argcount++; + } + STAT_INC(CALL, hit); + PyFunctionObject *func = (PyFunctionObject *)callable; + new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); + for (int i = 0; i < argcount; i++) { + new_frame->localsplus[i] = args[i]; + } } - // Manipulate stack directly since we leave using DISPATCH_INLINED(). - STACK_SHRINK(oparg + 2); - SKIP_OVER(INLINE_CACHE_ENTRIES_CALL); - frame->return_offset = 0; - DISPATCH_INLINED(new_frame); + // _PUSH_FRAME STACK_SHRINK(oparg); - STACK_SHRINK(1); + STACK_SHRINK(2); + next_instr += 3; + { + // Write it out explicitly because it's subtly different. + // Eventually this should be the only occurrence of this code. 
+ frame->return_offset = 0; + assert(tstate->interp->eval_frame == NULL); + SAVE_FRAME_STATE(); // Signals to the code generator + new_frame->previous = frame; + CALL_STAT_INC(inlined_py_calls); + #if TIER_ONE + frame = cframe.current_frame = new_frame; + goto start_frame; + #endif + #if TIER_TWO + frame = tstate->cframe->current_frame = new_frame; + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif + } } TARGET(CALL_PY_WITH_DEFAULTS) { @@ -4527,7 +4564,8 @@ goto error; } - func_obj->func_version = ((PyCodeObject *)codeobj)->co_version; + _PyFunction_SetVersion( + func_obj, ((PyCodeObject *)codeobj)->co_version); func = (PyObject *)func_obj; stack_pointer[-1] = func; DISPATCH(); diff --git a/Python/optimizer.c b/Python/optimizer.c index 6c730aa14b9a47..25e375de84e2cf 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -444,6 +444,7 @@ translate_bytecode_to_trace( code->co_firstlineno, 2 * INSTR_IP(initial_instr, code)); +top: // Jump here after _PUSH_FRAME for (;;) { RESERVE_RAW(2, "epilogue"); // Always need space for SAVE_IP and EXIT_TRACE ADD_TO_TRACE(SAVE_IP, INSTR_IP(instr, code), 0); @@ -602,6 +603,10 @@ translate_bytecode_to_trace( case OPARG_BOTTOM: // Second half of super-instr oparg = orig_oparg & 0xF; break; + case OPARG_SAVE_IP: // op==SAVE_IP; oparg=next instr + oparg = INSTR_IP(instr + offset, code); + break; + default: fprintf(stderr, "opcode=%d, oparg=%d; nuops=%d, i=%d; size=%d, offset=%d\n", @@ -611,6 +616,39 @@ translate_bytecode_to_trace( Py_FatalError("garbled expansion"); } ADD_TO_TRACE(expansion->uops[i].uop, oparg, operand); + if (expansion->uops[i].uop == _PUSH_FRAME) { + assert(i + 1 == nuops); + int func_version_offset = + offsetof(_PyCallCache, func_version)/sizeof(_Py_CODEUNIT) + // Add one to account for the actual opcode/oparg pair: + + 1; + uint32_t func_version = read_u32(&instr[func_version_offset].cache); + PyFunctionObject *func = 
_PyFunction_LookupByVersion(func_version); + DPRINTF(3, "Function object: %p\n", func); + if (func != NULL) { + PyCodeObject *new_code = (PyCodeObject *)PyFunction_GET_CODE(func); + if (new_code == code) { + // Recursive call, bail (we could be here forever). + DPRINTF(2, "Bailing on recursive call to %s (%s:%d)\n", + PyUnicode_AsUTF8(new_code->co_qualname), + PyUnicode_AsUTF8(new_code->co_filename), + new_code->co_firstlineno); + ADD_TO_TRACE(SAVE_IP, 0, 0); + goto done; + } + code = new_code; + initial_instr = instr = _PyCode_CODE(code); + DPRINTF(2, + "Continuing in %s (%s:%d) at byte offset %d\n", + PyUnicode_AsUTF8(code->co_qualname), + PyUnicode_AsUTF8(code->co_filename), + code->co_firstlineno, + 2 * INSTR_IP(initial_instr, code)); + goto top; + } + ADD_TO_TRACE(SAVE_IP, 0, 0); + goto done; + } } break; } diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 0de3abf9407899..54889ea87fec91 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1191,6 +1191,7 @@ init_interp_main(PyThreadState *tstate) if (_Py_get_xoption(&config->xoptions, L"uops") != NULL) { enabled = 1; } + enabled = 1; // TEMPORARY: always enable if (enabled) { PyObject *opt = PyUnstable_Optimizer_NewUOpOptimizer(); if (opt == NULL) { diff --git a/Tools/cases_generator/flags.py b/Tools/cases_generator/flags.py index f7ebdeb0d65677..962f003b194dbd 100644 --- a/Tools/cases_generator/flags.py +++ b/Tools/cases_generator/flags.py @@ -92,7 +92,7 @@ def variable_used_unspecialized(node: parsing.Node, name: str) -> bool: if text == "#if": if ( i + 1 < len(node.tokens) - and node.tokens[i + 1].text == "ENABLE_SPECIALIZATION" + and node.tokens[i + 1].text in ("ENABLE_SPECIALIZATION", "TIER_ONE") ): skipping = True elif text in ("#else", "#endif"): diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index d35a16a80e8d00..6050fcf84479e9 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ 
-22,6 +22,7 @@ PseudoInstruction, StackEffect, OverriddenInstructionPlaceHolder, + TIER_ONE, TIER_TWO, ) import parsing @@ -53,6 +54,7 @@ "OPARG_CACHE_4": 4, "OPARG_TOP": 5, "OPARG_BOTTOM": 6, + "OPARG_SAVE_IP": 7, } INSTR_FMT_PREFIX = "INSTR_FMT_" @@ -344,7 +346,9 @@ def write_metadata(self, metadata_filename: str, pymetadata_filename: str) -> No if instr.kind == "inst" and instr.is_viable_uop(): # Construct a dummy Component -- input/output mappings are not used part = Component(instr, instr.active_caches) - self.write_macro_expansions(instr.name, [part]) + self.write_macro_expansions( + instr.name, [part], instr.cache_offset + ) elif instr.kind == "inst" and variable_used( instr.inst, "oparg1" ): @@ -354,7 +358,9 @@ def write_metadata(self, metadata_filename: str, pymetadata_filename: str) -> No self.write_super_expansions(instr.name) case parsing.Macro(): mac = self.macro_instrs[thing.name] - self.write_macro_expansions(mac.name, mac.parts) + self.write_macro_expansions( + mac.name, mac.parts, mac.cache_offset + ) case parsing.Pseudo(): pass case _: @@ -428,7 +434,9 @@ def add(name: str) -> None: if instr.kind == "op" and instr.is_viable_uop(): add(instr.name) - def write_macro_expansions(self, name: str, parts: MacroParts) -> None: + def write_macro_expansions( + self, name: str, parts: MacroParts, cache_offset: int + ) -> None: """Write the macro expansions for a macro-instruction.""" # TODO: Refactor to share code with write_cody(), is_viaible_uop(), etc. 
offset = 0 # Cache effect offset @@ -448,7 +456,10 @@ def write_macro_expansions(self, name: str, parts: MacroParts) -> None: ) return if not part.active_caches: - size, offset = OPARG_SIZES["OPARG_FULL"], 0 + if part.instr.name == "SAVE_IP": + size, offset = OPARG_SIZES["OPARG_SAVE_IP"], cache_offset + else: + size, offset = OPARG_SIZES["OPARG_FULL"], 0 else: # If this assert triggers, is_viable_uops() lied assert len(part.active_caches) == 1, (name, part.instr.name) @@ -551,7 +562,9 @@ def write_instructions( case parsing.Macro(): n_macros += 1 mac = self.macro_instrs[thing.name] - stacking.write_macro_instr(mac, self.out, self.families.get(mac.name)) + stacking.write_macro_instr( + mac, self.out, self.families.get(mac.name) + ) # self.write_macro(self.macro_instrs[thing.name]) case parsing.Pseudo(): pass @@ -587,7 +600,9 @@ def write_executor_instructions( n_instrs += 1 self.out.emit("") with self.out.block(f"case {thing.name}:"): - instr.write(self.out, tier=TIER_TWO) + stacking.write_single_instr( + instr, self.out, tier=TIER_TWO + ) if instr.check_eval_breaker: self.out.emit("CHECK_EVAL_BREAKER();") self.out.emit("break;") @@ -620,8 +635,13 @@ def write_instr(self, instr: Instruction) -> None: with self.out.block(f"TARGET({name})"): if instr.predicted: self.out.emit(f"PREDICTED({name});") - instr.write(self.out) + self.out.static_assert_family_size( + instr.name, instr.family, instr.cache_offset + ) + stacking.write_single_instr(instr, self.out, tier=TIER_ONE) if not instr.always_exits: + if instr.cache_offset: + self.out.emit(f"next_instr += {instr.cache_offset};") if instr.check_eval_breaker: self.out.emit("CHECK_EVAL_BREAKER();") self.out.emit(f"DISPATCH();") diff --git a/Tools/cases_generator/instructions.py b/Tools/cases_generator/instructions.py index aa94dbb07ea1c0..260494ec612f8f 100644 --- a/Tools/cases_generator/instructions.py +++ b/Tools/cases_generator/instructions.py @@ -59,7 +59,8 @@ class Instruction: block_line: int # First line of block in 
original code # Computed by constructor - always_exits: bool + always_exits: str # If the block always exits, its last line; else "" + save_frame_state: bool # Whether the instruction uses SAVE_FRAME_STATE() has_deopt: bool cache_offset: int cache_effects: list[parsing.CacheEffect] @@ -83,6 +84,7 @@ def __init__(self, inst: parsing.InstDef): self.block ) self.always_exits = always_exits(self.block_text) + self.save_frame_state = variable_used(self.inst, "SAVE_FRAME_STATE") self.has_deopt = variable_used(self.inst, "DEOPT_IF") self.cache_effects = [ effect for effect in inst.inputs if isinstance(effect, parsing.CacheEffect) @@ -120,13 +122,13 @@ def __init__(self, inst: parsing.InstDef): def is_viable_uop(self) -> bool: """Whether this instruction is viable as a uop.""" dprint: typing.Callable[..., None] = lambda *args, **kwargs: None - # if self.name.startswith("CALL"): + # if "RESUME" in self.name: # dprint = print if self.name == "EXIT_TRACE": return True # This has 'return frame' but it's okay if self.always_exits: - dprint(f"Skipping {self.name} because it always exits") + dprint(f"Skipping {self.name} because it always exits: {self.always_exits}") return False if len(self.active_caches) > 1: # print(f"Skipping {self.name} because it has >1 cache entries") @@ -140,23 +142,6 @@ def is_viable_uop(self) -> bool: res = False return res - def write(self, out: Formatter, tier: Tiers = TIER_ONE) -> None: - """Write one instruction, sans prologue and epilogue.""" - - # Write a static assertion that a family's cache size is correct - out.static_assert_family_size(self.name, self.family, self.cache_offset) - - # Write input stack effect variable declarations and initializations - stacking.write_single_instr(self, out, tier) - - # Skip the rest if the block always exits - if self.always_exits: - return - - # Write cache effect - if tier == TIER_ONE and self.cache_offset: - out.emit(f"next_instr += {self.cache_offset};") - def write_body( self, out: Formatter, @@ -322,16 
+307,16 @@ def extract_block_text(block: parsing.Block) -> tuple[list[str], bool, int]: return blocklines, check_eval_breaker, block_line -def always_exits(lines: list[str]) -> bool: +def always_exits(lines: list[str]) -> str: """Determine whether a block always ends in a return/goto/etc.""" if not lines: - return False + return "" line = lines[-1].rstrip() # Indent must match exactly (TODO: Do something better) if line[:12] != " " * 12: - return False + return "" line = line[12:] - return line.startswith( + if line.startswith( ( "goto ", "return ", @@ -340,4 +325,6 @@ def always_exits(lines: list[str]) -> bool: "Py_UNREACHABLE()", "ERROR_IF(true, ", ) - ) + ): + return line + return "" diff --git a/Tools/cases_generator/stacking.py b/Tools/cases_generator/stacking.py index d457ce01a8f438..7154d8fbab454f 100644 --- a/Tools/cases_generator/stacking.py +++ b/Tools/cases_generator/stacking.py @@ -1,6 +1,7 @@ import dataclasses import typing +from flags import variable_used_unspecialized from formatting import ( Formatter, UNUSED, @@ -146,6 +147,8 @@ class EffectManager: # Track offsets from stack pointer min_offset: StackOffset final_offset: StackOffset + # Link to previous manager + pred: "EffectManager | None" = None def __init__( self, @@ -167,7 +170,8 @@ def __init__( self.pokes.append(StackItem(offset=self.final_offset.clone(), effect=eff)) self.final_offset.higher(eff) - if pred: + self.pred = pred + while pred: # Replace push(x) + pop(y) with copy(x, y). # Check that the sources and destinations are disjoint. sources: set[str] = set() @@ -192,6 +196,11 @@ def __init__( sources, destinations, ) + # See if we can get more copies of a earlier predecessor. 
+ if self.peeks and not pred.pokes and not pred.peeks: + pred = pred.pred + else: + pred = None # Break def adjust_deeper(self, eff: StackEffect) -> None: for peek in self.peeks: @@ -295,6 +304,7 @@ def write_single_instr( [Component(instr, instr.active_caches)], out, tier, + 0, ) except AssertionError as err: raise AssertionError(f"Error writing instruction {instr.name}") from err @@ -303,36 +313,31 @@ def write_single_instr( def write_macro_instr( mac: MacroInstruction, out: Formatter, family: Family | None ) -> None: - parts = [part for part in mac.parts if isinstance(part, Component)] - - cache_adjust = 0 - for part in mac.parts: - match part: - case CacheEffect(size=size): - cache_adjust += size - case Component(instr=instr): - cache_adjust += instr.cache_offset - case _: - typing.assert_never(part) - + parts = [ + part + for part in mac.parts + if isinstance(part, Component) and part.instr.name != "SAVE_IP" + ] out.emit("") with out.block(f"TARGET({mac.name})"): if mac.predicted: out.emit(f"PREDICTED({mac.name});") - out.static_assert_family_size(mac.name, family, cache_adjust) + out.static_assert_family_size(mac.name, family, mac.cache_offset) try: - write_components(parts, out, TIER_ONE) + write_components(parts, out, TIER_ONE, mac.cache_offset) except AssertionError as err: raise AssertionError(f"Error writing macro {mac.name}") from err - if cache_adjust: - out.emit(f"next_instr += {cache_adjust};") - out.emit("DISPATCH();") + if not parts[-1].instr.always_exits and not parts[-1].instr.save_frame_state: + if mac.cache_offset: + out.emit(f"next_instr += {mac.cache_offset};") + out.emit("DISPATCH();") def write_components( parts: list[Component], out: Formatter, tier: Tiers, + cache_offset: int, ) -> None: managers = get_managers(parts) @@ -374,13 +379,22 @@ def write_components( poke.as_stack_effect(lax=True), ) + if mgr.instr.save_frame_state: + # Adjust stack to min_offset (input effects materialized) + out.stack_adjust(mgr.min_offset.deep, 
mgr.min_offset.high) + # Use clone() since adjust_inverse() mutates final_offset + mgr.adjust_inverse(mgr.final_offset.clone()) + if cache_offset: + out.emit(f"next_instr += {cache_offset};") + if len(parts) == 1: mgr.instr.write_body(out, 0, mgr.active_caches, tier) else: with out.block(""): mgr.instr.write_body(out, -4, mgr.active_caches, tier) - if mgr is managers[-1]: + if mgr is managers[-1] and not mgr.instr.save_frame_state: + # TODO: Explain why this adjustment is needed. out.stack_adjust(mgr.final_offset.deep, mgr.final_offset.high) # Use clone() since adjust_inverse() mutates final_offset mgr.adjust_inverse(mgr.final_offset.clone())