From ddb794abf2b9896a9fe038c9a52d80e145646afb Mon Sep 17 00:00:00 2001
From: Thomas Wouters
Date: Fri, 20 Dec 2024 14:45:52 +0100
Subject: [PATCH 1/3] Add a fast path to (single-mutex) critical section
 locking _iff_ the mutex is already held by the currently active, top-most
 critical section of this thread.

This can matter a lot for indirectly recursive critical sections without
intervening critical sections.
---
 Include/internal/pycore_critical_section.h | 26 +++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/Include/internal/pycore_critical_section.h b/Include/internal/pycore_critical_section.h
index 9ba2fce56d3c9c..5646ccfb572da4 100644
--- a/Include/internal/pycore_critical_section.h
+++ b/Include/internal/pycore_critical_section.h
@@ -109,8 +109,18 @@ _PyCriticalSection_IsActive(uintptr_t tag)
 static inline void
 _PyCriticalSection_BeginMutex(PyCriticalSection *c, PyMutex *m)
 {
+    // As an optimisation for locking the same object recursively, skip
+    // locking if the mutex is already held by the topmost, active,
+    // single-mutex critical section of this thread.
+    PyThreadState *tstate = _PyThreadState_GET();
+    if (tstate->critical_section &&
+        !(tstate->critical_section & _Py_CRITICAL_SECTION_MASK) &&
+        ((PyCriticalSection *)tstate->critical_section)->_cs_mutex == m) {
+        c->_cs_mutex = NULL;
+        c->_cs_prev = 0;
+        return;
+    }
     if (PyMutex_LockFast(m)) {
-        PyThreadState *tstate = _PyThreadState_GET();
         c->_cs_mutex = m;
         c->_cs_prev = tstate->critical_section;
         tstate->critical_section = (uintptr_t)c;
@@ -145,6 +155,12 @@ _PyCriticalSection_Pop(PyCriticalSection *c)
 static inline void
 _PyCriticalSection_End(PyCriticalSection *c)
 {
+    // If the mutex is NULL, we used the fast path in
+    // _PyCriticalSection_BeginMutex for locks already held in the top-most
+    // critical section, and we shouldn't unlock or pop this critical section.
+    if (c->_cs_mutex == NULL) {
+        return;
+    }
     PyMutex_Unlock(c->_cs_mutex);
     _PyCriticalSection_Pop(c);
 }
@@ -199,6 +215,14 @@ _PyCriticalSection2_Begin(PyCriticalSection2 *c, PyObject *a, PyObject *b)
 static inline void
 _PyCriticalSection2_End(PyCriticalSection2 *c)
 {
+    // if mutex1 is NULL, we used the fast path in
+    // _PyCriticalSection_BeginMutex for mutexes that are already held,
+    // which should only happen when mutex1 and mutex2 were the same mutex,
+    // and mutex2 should also be NULL.
+    if (c->_cs_base._cs_mutex == NULL) {
+        assert(c->_cs_mutex2 == NULL);
+        return;
+    }
     if (c->_cs_mutex2) {
         PyMutex_Unlock(c->_cs_mutex2);
     }
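
For context, the following is a hedged sketch of the "indirectly recursive"
pattern the commit message refers to. The functions mything_update() and
mything_clear() are hypothetical, invented for illustration; only
Py_BEGIN_CRITICAL_SECTION and Py_END_CRITICAL_SECTION are existing
CPython-internal macros (no-ops in builds with the GIL), and the sketch
assumes CPython's internal headers (Py_BUILD_CORE). With this patch, the
inner begin notices that the thread's top-most critical section already
holds the object's mutex and skips locking instead of taking the slow path.

    /* Hypothetical illustration, not part of the patch. */
    static void
    mything_clear(PyObject *self)
    {
        // Same object as the caller: the fast path added above recognises
        // that the top-most critical section already holds self's mutex,
        // so this begin/end pair does no locking work at all.
        Py_BEGIN_CRITICAL_SECTION(self);
        // ... mutate self ...
        Py_END_CRITICAL_SECTION();
    }

    static void
    mything_update(PyObject *self, PyObject *arg)
    {
        // Locks self's mutex and becomes this thread's top-most critical
        // section.
        Py_BEGIN_CRITICAL_SECTION(self);
        if (arg == NULL) {
            mything_clear(self);    // indirectly recursive begin on self
        }
        // ... otherwise update self using arg ...
        Py_END_CRITICAL_SECTION();
    }
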
From ecd12ace5ee105f7ef6651d3162af3b9b17adf9b Mon Sep 17 00:00:00 2001
From: Thomas Wouters
Date: Fri, 20 Dec 2024 23:57:25 +0100
Subject: [PATCH 2/3] Move the newly added fast path to the start of the slow
 path, since it should be relatively rare and we don't want to burden the
 fastest path.

---
 Include/internal/pycore_critical_section.h | 16 +++------------
 Python/critical_section.c                  | 24 +++++++++++++++-------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/Include/internal/pycore_critical_section.h b/Include/internal/pycore_critical_section.h
index 5646ccfb572da4..e66d6d805c1b3b 100644
--- a/Include/internal/pycore_critical_section.h
+++ b/Include/internal/pycore_critical_section.h
@@ -109,18 +109,8 @@ _PyCriticalSection_IsActive(uintptr_t tag)
 static inline void
 _PyCriticalSection_BeginMutex(PyCriticalSection *c, PyMutex *m)
 {
-    // As an optimisation for locking the same object recursively, skip
-    // locking if the mutex is already held by the topmost, active,
-    // single-mutex critical section of this thread.
-    PyThreadState *tstate = _PyThreadState_GET();
-    if (tstate->critical_section &&
-        !(tstate->critical_section & _Py_CRITICAL_SECTION_MASK) &&
-        ((PyCriticalSection *)tstate->critical_section)->_cs_mutex == m) {
-        c->_cs_mutex = NULL;
-        c->_cs_prev = 0;
-        return;
-    }
     if (PyMutex_LockFast(m)) {
+        PyThreadState *tstate = _PyThreadState_GET();
         c->_cs_mutex = m;
         c->_cs_prev = tstate->critical_section;
         tstate->critical_section = (uintptr_t)c;
@@ -156,7 +146,7 @@ static inline void
 _PyCriticalSection_End(PyCriticalSection *c)
 {
     // If the mutex is NULL, we used the fast path in
-    // _PyCriticalSection_BeginMutex for locks already held in the top-most
+    // _PyCriticalSection_BeginSlow for locks already held in the top-most
     // critical section, and we shouldn't unlock or pop this critical section.
     if (c->_cs_mutex == NULL) {
         return;
@@ -216,7 +206,7 @@ static inline void
 _PyCriticalSection2_End(PyCriticalSection2 *c)
 {
     // if mutex1 is NULL, we used the fast path in
-    // _PyCriticalSection_BeginMutex for mutexes that are already held,
+    // _PyCriticalSection_BeginSlow for mutexes that are already held,
     // which should only happen when mutex1 and mutex2 were the same mutex,
     // and mutex2 should also be NULL.
     if (c->_cs_base._cs_mutex == NULL) {
diff --git a/Python/critical_section.c b/Python/critical_section.c
index 62ed25523fd6dc..73857b85496316 100644
--- a/Python/critical_section.c
+++ b/Python/critical_section.c
@@ -8,11 +8,28 @@ static_assert(_Alignof(PyCriticalSection) >= 4,
               "critical section must be aligned to at least 4 bytes");
 #endif
 
+#ifdef Py_GIL_DISABLED
+static PyCriticalSection *
+untag_critical_section(uintptr_t tag)
+{
+    return (PyCriticalSection *)(tag & ~_Py_CRITICAL_SECTION_MASK);
+}
+#endif
+
 void
 _PyCriticalSection_BeginSlow(PyCriticalSection *c, PyMutex *m)
 {
 #ifdef Py_GIL_DISABLED
     PyThreadState *tstate = _PyThreadState_GET();
+    // As an optimisation for locking the same object recursively, skip
+    // locking if the mutex is currently locked by the top-most critical
+    // section.
+    if (tstate->critical_section &&
+        untag_critical_section(tstate->critical_section)->_cs_mutex == m) {
+        c->_cs_mutex = NULL;
+        c->_cs_prev = 0;
+        return;
+    }
     c->_cs_mutex = NULL;
     c->_cs_prev = (uintptr_t)tstate->critical_section;
     tstate->critical_section = (uintptr_t)c;
@@ -42,13 +59,6 @@ _PyCriticalSection2_BeginSlow(PyCriticalSection2 *c, PyMutex *m1, PyMutex *m2,
 #endif
 }
 
-#ifdef Py_GIL_DISABLED
-static PyCriticalSection *
-untag_critical_section(uintptr_t tag)
-{
-    return (PyCriticalSection *)(tag & ~_Py_CRITICAL_SECTION_MASK);
-}
-#endif
 
 // Release all locks held by critical sections. This is called by
 // _PyThreadState_Detach.
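
To make the mechanism concrete outside of CPython, here is a minimal
standalone model of the same sentinel trick, assuming plain pthreads and C11
_Thread_local instead of PyMutex and the tagged tstate->critical_section
pointer (the active/single-mutex tag checks are therefore omitted). It is a
sketch of the idea, not this patch's code: a recursive begin on the mutex
already held by the top-most section records a NULL mutex, which tells end()
to skip both the unlock and the pop.

    // Simplified model: thread-local stack of critical sections guarding
    // pthread mutexes, with the recursive fast path from this patch series.
    #include <assert.h>
    #include <pthread.h>
    #include <stddef.h>

    typedef struct critsec {
        pthread_mutex_t *mutex;   // NULL: borrowed from an outer section
        struct critsec *prev;     // previously top-most section, if any
    } critsec;

    static _Thread_local critsec *topmost;   // innermost active section

    static void
    critsec_begin(critsec *cs, pthread_mutex_t *m)
    {
        if (topmost != NULL && topmost->mutex == m) {
            // Fast path: the top-most section already holds m, so don't
            // lock again; mark this section so critsec_end() does nothing.
            cs->mutex = NULL;
            cs->prev = NULL;
            return;
        }
        pthread_mutex_lock(m);
        cs->mutex = m;
        cs->prev = topmost;
        topmost = cs;
    }

    static void
    critsec_end(critsec *cs)
    {
        if (cs->mutex == NULL) {
            return;                      // no-op section: nothing was locked
        }
        pthread_mutex_unlock(cs->mutex);
        topmost = cs->prev;
    }

    int
    main(void)
    {
        pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
        critsec outer, inner;

        critsec_begin(&outer, &m);   // locks m, becomes top-most
        critsec_begin(&inner, &m);   // recursive: takes the fast path
        critsec_end(&inner);         // no unlock, no pop
        critsec_end(&outer);         // unlocks m
        int rc = pthread_mutex_trylock(&m);   // succeeds: m was released once
        assert(rc == 0);
        (void)rc;
        pthread_mutex_unlock(&m);
        return 0;
    }

Built with e.g. cc -std=c11 -pthread, the program exits cleanly because only
the outer section actually releases the mutex, mirroring the NULL-mutex check
added to _PyCriticalSection_End above.
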
From b28153d016d45e4d3036f4024152c375a92fd6fe Mon Sep 17 00:00:00 2001
From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com>
Date: Fri, 20 Dec 2024 23:07:38 +0000
Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?=
 =?UTF-8?q?rb=5Fit.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../2024-12-20-23-07-33.gh-issue-114203.84NgoW.rst | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2024-12-20-23-07-33.gh-issue-114203.84NgoW.rst

diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-12-20-23-07-33.gh-issue-114203.84NgoW.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-12-20-23-07-33.gh-issue-114203.84NgoW.rst
new file mode 100644
index 00000000000000..6a9856e90c32bc
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-12-20-23-07-33.gh-issue-114203.84NgoW.rst
@@ -0,0 +1 @@
+Optimize ``Py_BEGIN_CRITICAL_SECTION`` for simple recursive calls.