gh-105766: Add Locking Around Custom Allocators #105619

5 changes: 5 additions & 0 deletions Include/internal/pycore_pymem.h
@@ -42,6 +42,11 @@ struct _pymem_allocators {
debug_alloc_api_t obj;
} debug;
PyObjectArenaAllocator obj_arena;
unsigned int num_gils;
struct {
PyMemAllocatorEx mem;
PyMemAllocatorEx obj;
} wrapped_with_lock;
};


234 changes: 227 additions & 7 deletions Objects/obmalloc.c
@@ -1,6 +1,7 @@
/* Python's malloc wrappers (see pymem.h) */

#include "Python.h"
#include "pycore_ceval.h" // _PyEval_AcquireLock()
#include "pycore_code.h" // stats
#include "pycore_object.h" // _PyDebugAllocatorStats() definition
#include "pycore_obmalloc.h"
@@ -210,6 +211,8 @@ _PyMem_ArenaFree(void *Py_UNUSED(ctx), void *ptr,
#define _PyObject (_PyRuntime.allocators.standard.obj)
#define _PyMem_Debug (_PyRuntime.allocators.debug)
#define _PyObject_Arena (_PyRuntime.allocators.obj_arena)
#define _PyMem_Wrapped (_PyRuntime.allocators.wrapped_with_lock.mem)
#define _PyObject_Wrapped (_PyRuntime.allocators.wrapped_with_lock.obj)


/***************************/
@@ -531,14 +534,32 @@ PyMem_SetupDebugHooks(void)
PyThread_release_lock(ALLOCATORS_MUTEX);
}

static int has_locking_wrapper(PyMemAllocatorEx *allocator);

static void
get_allocator_unlocked(PyMemAllocatorDomain domain, PyMemAllocatorEx *allocator)
{
switch(domain)
{
case PYMEM_DOMAIN_RAW: *allocator = _PyMem_Raw; break;
case PYMEM_DOMAIN_MEM: *allocator = _PyMem; break;
case PYMEM_DOMAIN_OBJ: *allocator = _PyObject; break;
case PYMEM_DOMAIN_RAW:
*allocator = _PyMem_Raw;
break;
case PYMEM_DOMAIN_MEM:
if (has_locking_wrapper(&_PyMem)) {
*allocator = *(PyMemAllocatorEx *)_PyMem.ctx;
}
else {
*allocator = _PyMem;
}
break;
case PYMEM_DOMAIN_OBJ:
if (has_locking_wrapper(&_PyObject)) {
*allocator = *(PyMemAllocatorEx *)_PyObject.ctx;
}
else {
*allocator = _PyObject;
}
break;
default:
/* unknown domain: set all attributes to NULL */
allocator->ctx = NULL;
@@ -549,15 +570,28 @@ get_allocator_unlocked(PyMemAllocatorDomain domain, PyMemAllocatorEx *allocator)
}
}

static void maybe_add_locking_wrapper(
PyMemAllocatorDomain, PyMemAllocatorEx *, PyMemAllocatorEx *);

static void
set_allocator_unlocked(PyMemAllocatorDomain domain, PyMemAllocatorEx *allocator)
{
switch(domain)
{
case PYMEM_DOMAIN_RAW: _PyMem_Raw = *allocator; break;
case PYMEM_DOMAIN_MEM: _PyMem = *allocator; break;
case PYMEM_DOMAIN_OBJ: _PyObject = *allocator; break;
/* ignore unknown domain */
case PYMEM_DOMAIN_RAW:
_PyMem_Raw = *allocator;
break;
case PYMEM_DOMAIN_MEM:
_PyMem = *allocator;
maybe_add_locking_wrapper(domain, &_PyMem, &_PyMem_Wrapped);
break;
case PYMEM_DOMAIN_OBJ:
_PyObject = *allocator;
maybe_add_locking_wrapper(domain, &_PyObject, &_PyObject_Wrapped);
break;
default:
/* ignore unknown domain */
return;
}
}

@@ -628,6 +662,192 @@ PyObject_SetArenaAllocator(PyObjectArenaAllocator *allocator)
* around these uses of the runtime-global allocators state. */


/************************************/
/* locking around custom allocators */
/************************************/

static inline int
should_lock(PyInterpreterState *interp)
{
return interp->ceval.gil != _PyInterpreterState_Main()->ceval.gil;
}

static PyThreadState *
get_tstate_for_main_gil(void)
{
PyThreadState *tstate = _PyRuntimeState_GetFinalizing(&_PyRuntime);
if (tstate == NULL) {
/* To use its GIL, we only need the pointer and one field. */
static const PyThreadState _main_tstate = {
.interp = &_PyRuntime._main_interpreter,
};
tstate = (PyThreadState *)&_main_tstate;
}
return tstate;
}

static inline void
acquire_custom_allocator_lock(PyThreadState *tstate)
{
_PyEval_AcquireLock(tstate);
}

static inline void
release_custom_allocator_lock(PyThreadState *tstate)
{
_PyEval_ReleaseLock(tstate->interp, tstate);
}

static void *
_PyMem_MallocLocked(void *ctx, size_t size)
{
PyMemAllocatorEx *wrapped = (PyMemAllocatorEx *)ctx;
if (_PyRuntime.allocators.num_gils > 1) {
Review comment (Member):

You lock updates (writes) to this value. Is it safe to use without atomic access or a lock? (requiring an atomic read here would presumably be a performance hit?).

Rather than always checking... could the logic that does num_gils++ with the (main runtime) lock held also do:

... num_gils++;
if (num_gils == 2 && !has_locking_wrapper(...)) {
    maybe_add_locking_wrapper(...);
}

such that the wrapped pointer switch happens upon first creation of an additional gil while the main runtime gil (the only gil prior to the current code running) is still held.

I'm not sure it is worth doing the opposite during finalization. Once wrapped due to per subinterpreter GILs, just stay wrapped. At least the wrappers won't have this conditional anymore.

I suspect this thought is either overoptimization... or actually necessary to avoid locking/atomic access to num_gils.

Reply (Member Author; a sketch of the suggested shape follows this wrapper function):

That's a great idea. I'll try it out.

PyThreadState *tstate = _PyThreadState_GET();
if (should_lock(tstate->interp)) {
tstate = get_tstate_for_main_gil();
acquire_custom_allocator_lock(tstate);
void *ptr = wrapped->malloc(wrapped->ctx, size);
release_custom_allocator_lock(tstate);
return ptr;
}
}
return wrapped->malloc(wrapped->ctx, size);
}
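
Not part of the diff: a minimal sketch of the restructuring suggested in the review thread above. It assumes a hypothetical helper in obmalloc.c (the _PyMem/_PyObject macros are file-local, so the num_gils++ path in pylifecycle.c would need to call into it) and leaves aside lock ordering between HEAD_LOCK and ALLOCATORS_MUTEX, which would still need checking.

/* Hypothetical helper: install the locking wrappers exactly once, when
 * the second GIL is created, so the per-call num_gils check in the
 * wrappers above could be dropped ("once wrapped, stay wrapped"). */
void
_PyMem_WrapAllocatorsForMultipleGILs(void)   /* hypothetical name */
{
    PyThread_acquire_lock(ALLOCATORS_MUTEX, WAIT_LOCK);
    if (!has_locking_wrapper(&_PyMem)) {
        maybe_add_locking_wrapper(PYMEM_DOMAIN_MEM, &_PyMem, &_PyMem_Wrapped);
    }
    if (!has_locking_wrapper(&_PyObject)) {
        maybe_add_locking_wrapper(PYMEM_DOMAIN_OBJ, &_PyObject, &_PyObject_Wrapped);
    }
    PyThread_release_lock(ALLOCATORS_MUTEX);
}

/* Hypothetical call site, in init_interp_create_gil() while HEAD_LOCK(runtime)
 * is held (see the pylifecycle.c hunk below):
 *
 *     runtime->allocators.num_gils++;
 *     if (runtime->allocators.num_gils == 2) {
 *         _PyMem_WrapAllocatorsForMultipleGILs();
 *     }
 */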

static void *
_PyMem_CallocLocked(void *ctx, size_t nelem, size_t elsize)
{
PyMemAllocatorEx *wrapped = (PyMemAllocatorEx *)ctx;
if (_PyRuntime.allocators.num_gils > 1) {
PyThreadState *tstate = _PyThreadState_GET();
if (should_lock(tstate->interp)) {
tstate = get_tstate_for_main_gil();
acquire_custom_allocator_lock(tstate);
void *ptr = wrapped->calloc(wrapped->ctx, nelem, elsize);
release_custom_allocator_lock(tstate);
return ptr;
}
}
return wrapped->calloc(wrapped->ctx, nelem, elsize);
}

static void *
_PyMem_ReallocLocked(void *ctx, void *ptr, size_t new_size)
{
PyMemAllocatorEx *wrapped = (PyMemAllocatorEx *)ctx;
if (_PyRuntime.allocators.num_gils > 1) {
PyThreadState *tstate = _PyThreadState_GET();
if (should_lock(tstate->interp)) {
tstate = get_tstate_for_main_gil();
acquire_custom_allocator_lock(tstate);
ptr = wrapped->realloc(wrapped->ctx, ptr, new_size);
release_custom_allocator_lock(tstate);
return ptr;
}
}
return wrapped->realloc(wrapped->ctx, ptr, new_size);
}

static void
_PyMem_FreeLocked(void *ctx, void *ptr)
{
PyMemAllocatorEx *wrapped = (PyMemAllocatorEx *)ctx;
if (_PyRuntime.allocators.num_gils > 1) {
PyThreadState *tstate = _PyThreadState_GET();
if (should_lock(tstate->interp)) {
tstate = get_tstate_for_main_gil();
acquire_custom_allocator_lock(tstate);
wrapped->free(wrapped->ctx, ptr);
release_custom_allocator_lock(tstate);
return;
}
}
wrapped->free(wrapped->ctx, ptr);
}

static int
has_locking_wrapper(PyMemAllocatorEx *allocator)
{
if (allocator->ctx == NULL) {
return 0;
}
return (allocator->malloc == _PyMem_MallocLocked
|| allocator->calloc == _PyMem_CallocLocked
|| allocator->realloc == _PyMem_ReallocLocked
|| allocator->free == _PyMem_FreeLocked);
}

static void
maybe_add_locking_wrapper(PyMemAllocatorDomain domain,
PyMemAllocatorEx *allocator,
PyMemAllocatorEx *wrapped)
{
assert(domain == PYMEM_DOMAIN_MEM || domain == PYMEM_DOMAIN_OBJ);

*wrapped = (PyMemAllocatorEx){0};

if (allocator->malloc == _PyMem_DebugMalloc) {
/* The debug allocator only wraps an already set allocator,
* which would have gone through this function already. */
return;
}

void *ctx = allocator->ctx;

/* What is the likelihood of reentrancy with the wrapper funcs?
* For now we assume it is effectively zero. */

if (allocator->malloc != _PyMem_RawMalloc
#ifdef WITH_PYMALLOC
&& allocator->malloc != _PyObject_Malloc
#endif
&& allocator->malloc != _PyMem_MallocLocked)
{
wrapped->ctx = ctx;
wrapped->malloc = allocator->malloc;
allocator->ctx = wrapped;
allocator->malloc = _PyMem_MallocLocked;
}

if (allocator->calloc != _PyMem_RawCalloc
#ifdef WITH_PYMALLOC
&& allocator->calloc != _PyObject_Calloc
#endif
&& allocator->calloc != _PyMem_CallocLocked)
{
wrapped->ctx = ctx;
wrapped->calloc = allocator->calloc;
allocator->ctx = wrapped;
allocator->calloc = _PyMem_CallocLocked;
}

if (allocator->realloc != _PyMem_RawRealloc
#ifdef WITH_PYMALLOC
&& allocator->realloc != _PyObject_Realloc
#endif
&& allocator->realloc != _PyMem_ReallocLocked)
{
wrapped->ctx = ctx;
wrapped->realloc = allocator->realloc;
allocator->ctx = wrapped;
allocator->realloc = _PyMem_ReallocLocked;
}

if (allocator->free != _PyMem_RawFree
#ifdef WITH_PYMALLOC
&& allocator->free != _PyObject_Free
#endif
&& allocator->free != _PyMem_FreeLocked)
{
wrapped->ctx = ctx;
wrapped->free = allocator->free;
allocator->ctx = wrapped;
allocator->free = _PyMem_FreeLocked;
}
}


/*************************/
/* the "arena" allocator */
/*************************/
23 changes: 22 additions & 1 deletion Python/pylifecycle.c
@@ -592,6 +592,7 @@ static PyStatus
init_interp_create_gil(PyThreadState *tstate, int gil)
{
PyStatus status;
_PyRuntimeState *runtime = tstate->interp->runtime;

/* finalize_interp_delete() comment explains why _PyEval_FiniGIL() is
only called here. */
@@ -619,7 +620,19 @@ init_interp_create_gil(PyThreadState *tstate, int gil)
return status;
}

return _PyStatus_OK();
if (own_gil) {
HEAD_LOCK(runtime);
if (runtime->allocators.num_gils == INT_MAX) {
status = _PyStatus_ERR("allocators GIL overflow");
}
else {
status = _PyStatus_OK();
runtime->allocators.num_gils++;
}
HEAD_UNLOCK(runtime);
}

return status;
}


@@ -1742,6 +1755,14 @@ finalize_interp_delete(PyInterpreterState *interp)
/* Cleanup auto-thread-state */
_PyGILState_Fini(interp);

_PyRuntimeState *runtime = interp->runtime;
if (interp->ceval.own_gil) {
HEAD_LOCK(runtime);
assert(runtime->allocators.num_gils > 0);
runtime->allocators.num_gils--;
HEAD_UNLOCK(runtime);
}

/* We can't call _PyEval_FiniGIL() here because destroying the GIL lock can
fail when it is being awaited by another running daemon thread (see
bpo-9901). Instead pycore_create_interpreter() destroys the previously