Skip to content

Commit 95e8f89

Browse files
b-passpre-commit-ci[bot]henryiiirwgk
authored
Support for sub-interpreters (#5564)
* Allow per-interpreter internals/local_internals * Significant rewrite to avoid using thread_locals as much as possible. Since we can avoid them by checking this atomic, the cmake config conditional shouldn't be necessary. The slower path (with thread_locals and extra checks) only comes in when a second interpreter is actually instanciated. * Add a test for per-interpreter GIL Uses two extra threads to demonstrate that neither shares a GIL. * Fix for nonconforming std::atomic constructors on some compilers * style: pre-commit fixes * Fix initializer to make MSVC happy. * Switch to gil_scoped_acquire_simple, get rid of old copy of it from internals.h * Use the PyThreadState's interp member rather than the thread state itself. * Be more explicit about the type of the internalspp * Suggested renamings and rewordings * Rename find_internals_pp and change it to take in the state dict reference * Use the old raise_from instead of pybind11_fail * Move most of the internals initialization into its constructor. * Move round_up_to_next_pow2 function upwards * Remove redundant forward decl * Add a python-driven subinterpreter test * Disable the python subinterpreter test on emscripten Can't load the native-built cpp modules. * Switch the internals pointer pointer to a unique_ptr pointer * Spelling * Fix clang-tidy warning, compare pointer to nullptr * Rename get_interpreter_counter to get_num_interpreters_seen * Try simplifying the test's cmake set_target_properties * Replace mod_* tags with a single tag w/enum Update tests accordingly * Add a test for shared-GIL (legacy) subinterpreters * Update test to work around differences in the various versions of interpreters modules * Fix unused parameter * Rename tests and associated test modules. * Switch get_internals_pp to a template function * Rename curtstate to cur_tstate * refactor: use simpler names Signed-off-by: Henry Schreiner <[email protected]> * style: pre-commit fixes * fix: return class, not enum Co-authored-by: Ralf W. Grosse-Kunstleve <[email protected]> Signed-off-by: Henry Schreiner <[email protected]> * Have to join these threads to make sure they are totally done before the test returns. * Wrap module_def initialization in a static so it only happens once. If it happens concurrently in multiple threads, badness ensues.... * style: pre-commit fixes --------- Signed-off-by: Henry Schreiner <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Henry Schreiner <[email protected]> Co-authored-by: Ralf W. Grosse-Kunstleve <[email protected]>
1 parent 05a6a03 commit 95e8f89

11 files changed

+690
-136
lines changed

include/pybind11/detail/common.h

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,13 @@
232232
# define PYBIND11_ASSERT_GIL_HELD_INCREF_DECREF
233233
#endif
234234

235+
// Slightly faster code paths are available when PYBIND11_SUBINTERPRETER_SUPPORT is *not* defined,
236+
// so avoid defining it for implementations that do not support subinterpreters.
237+
// However, defining it unnecessarily is not expected to break anything.
238+
#if PY_VERSION_HEX >= 0x030C0000 && !defined(PYPY_VERSION) && !defined(GRAALVM_PYTHON)
239+
# define PYBIND11_SUBINTERPRETER_SUPPORT
240+
#endif
241+
235242
// #define PYBIND11_STR_LEGACY_PERMISSIVE
236243
// If DEFINED, pybind11::str can hold PyUnicodeObject or PyBytesObject
237244
// (probably surprising and never documented, but this was the
@@ -394,19 +401,22 @@ PYBIND11_WARNING_DISABLE_CLANG("-Wgnu-zero-variadic-macro-arguments")
394401
PYBIND11_PLUGIN_IMPL(name) { \
395402
PYBIND11_CHECK_PYTHON_VERSION \
396403
PYBIND11_ENSURE_INTERNALS_READY \
397-
auto &slots = PYBIND11_CONCAT(pybind11_module_slots_, name); \
398-
slots[0] \
399-
= {Py_mod_exec, reinterpret_cast<void *>(&PYBIND11_CONCAT(pybind11_exec_, name))}; \
400-
slots[1] = {0, nullptr}; \
401-
auto m = ::pybind11::module_::initialize_multiphase_module_def( \
402-
PYBIND11_TOSTRING(name), \
403-
nullptr, \
404-
&PYBIND11_CONCAT(pybind11_module_def_, name), \
405-
slots, \
406-
##__VA_ARGS__); \
407-
return m.ptr(); \
404+
static auto result = []() { \
405+
auto &slots = PYBIND11_CONCAT(pybind11_module_slots_, name); \
406+
slots[0] = {Py_mod_exec, \
407+
reinterpret_cast<void *>(&PYBIND11_CONCAT(pybind11_exec_, name))}; \
408+
slots[1] = {0, nullptr}; \
409+
return ::pybind11::module_::initialize_multiphase_module_def( \
410+
PYBIND11_TOSTRING(name), \
411+
nullptr, \
412+
&PYBIND11_CONCAT(pybind11_module_def_, name), \
413+
slots, \
414+
##__VA_ARGS__); \
415+
}(); \
416+
return result.ptr(); \
408417
} \
409418
int PYBIND11_CONCAT(pybind11_exec_, name)(PyObject * pm) { \
419+
pybind11::detail::get_num_interpreters_seen() += 1; \
410420
try { \
411421
auto m = pybind11::reinterpret_borrow<::pybind11::module_>(pm); \
412422
PYBIND11_CONCAT(pybind11_init_, name)(m); \

include/pybind11/detail/internals.h

Lines changed: 171 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#include "common.h"
1717

18+
#include <atomic>
1819
#include <exception>
1920
#include <mutex>
2021
#include <thread>
@@ -53,6 +54,7 @@ constexpr const char *internals_function_record_capsule_name = "pybind11_functio
5354
inline PyTypeObject *make_static_property_type();
5455
inline PyTypeObject *make_default_metaclass();
5556
inline PyObject *make_object_base_type(PyTypeObject *metaclass);
57+
inline void translate_exception(std::exception_ptr p);
5658

5759
// The old Python Thread Local Storage (TLS) API is deprecated in Python 3.7 in favor of the new
5860
// Thread Specific Storage (TSS) API.
@@ -149,6 +151,20 @@ struct instance_map_shard {
149151

150152
static_assert(sizeof(instance_map_shard) % 64 == 0,
151153
"instance_map_shard size is not a multiple of 64 bytes");
154+
155+
inline uint64_t round_up_to_next_pow2(uint64_t x) {
156+
// Round-up to the next power of two.
157+
// See https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
158+
x--;
159+
x |= (x >> 1);
160+
x |= (x >> 2);
161+
x |= (x >> 4);
162+
x |= (x >> 8);
163+
x |= (x >> 16);
164+
x |= (x >> 32);
165+
x++;
166+
return x;
167+
}
152168
#endif
153169

154170
/// Internal data structure used to track registered instances and types.
@@ -178,9 +194,9 @@ struct internals {
178194
// extensions
179195
std::forward_list<std::string> static_strings; // Stores the std::strings backing
180196
// detail::c_str()
181-
PyTypeObject *static_property_type;
182-
PyTypeObject *default_metaclass;
183-
PyObject *instance_base;
197+
PyTypeObject *static_property_type = nullptr;
198+
PyTypeObject *default_metaclass = nullptr;
199+
PyObject *instance_base = nullptr;
184200
// Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined:
185201
PYBIND11_TLS_KEY_INIT(tstate)
186202
PYBIND11_TLS_KEY_INIT(loader_life_support_tls_key)
@@ -189,7 +205,36 @@ struct internals {
189205

190206
type_map<PyObject *> native_enum_type_map;
191207

192-
internals() = default;
208+
internals() {
209+
PyThreadState *cur_tstate = PyThreadState_Get();
210+
// NOLINTNEXTLINE(bugprone-assignment-in-if-condition)
211+
if (!PYBIND11_TLS_KEY_CREATE(tstate)) {
212+
pybind11_fail(
213+
"internals constructor: could not successfully initialize the tstate TSS key!");
214+
}
215+
PYBIND11_TLS_REPLACE_VALUE(tstate, cur_tstate);
216+
217+
// NOLINTNEXTLINE(bugprone-assignment-in-if-condition)
218+
if (!PYBIND11_TLS_KEY_CREATE(loader_life_support_tls_key)) {
219+
pybind11_fail("internals constructor: could not successfully initialize the "
220+
"loader_life_support TSS key!");
221+
}
222+
223+
istate = cur_tstate->interp;
224+
registered_exception_translators.push_front(&translate_exception);
225+
static_property_type = make_static_property_type();
226+
default_metaclass = make_default_metaclass();
227+
#ifdef Py_GIL_DISABLED
228+
// Scale proportional to the number of cores. 2x is a heuristic to reduce contention.
229+
auto num_shards
230+
= static_cast<size_t>(round_up_to_next_pow2(2 * std::thread::hardware_concurrency()));
231+
if (num_shards == 0) {
232+
num_shards = 1;
233+
}
234+
instance_shards.reset(new instance_map_shard[num_shards]);
235+
instance_shards_mask = num_shards - 1;
236+
#endif
237+
}
193238
internals(const internals &other) = delete;
194239
internals &operator=(const internals &other) = delete;
195240
~internals() {
@@ -206,6 +251,17 @@ struct internals {
206251
}
207252
};
208253

254+
// the internals struct (above) is shared between all the modules. local_internals are only
255+
// for a single module. Any changes made to internals may require an update to
256+
// PYBIND11_INTERNALS_VERSION, breaking backwards compatibility. local_internals is, by design,
257+
// restricted to a single module. Whether a module has local internals or not should not
258+
// impact any other modules, because the only things accessing the local internals is the
259+
// module that contains them.
260+
struct local_internals {
261+
type_map<type_info *> registered_types_cpp;
262+
std::forward_list<ExceptionTranslator> registered_exception_translators;
263+
};
264+
209265
enum class holder_enum_t : uint8_t {
210266
undefined,
211267
std_unique_ptr, // Default, lacking interop with std::shared_ptr.
@@ -249,15 +305,49 @@ struct type_info {
249305
"__pybind11_module_local_v" PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) \
250306
PYBIND11_COMPILER_TYPE_LEADING_UNDERSCORE PYBIND11_PLATFORM_ABI_ID "__"
251307

252-
/// Each module locally stores a pointer to the `internals` data. The data
253-
/// itself is shared among modules with the same `PYBIND11_INTERNALS_ID`.
254-
inline internals **&get_internals_pp() {
255-
static internals **internals_pp = nullptr;
256-
return internals_pp;
308+
inline PyThreadState *get_thread_state_unchecked() {
309+
#if defined(PYPY_VERSION) || defined(GRAALVM_PYTHON)
310+
return PyThreadState_GET();
311+
#elif PY_VERSION_HEX < 0x030D0000
312+
return _PyThreadState_UncheckedGet();
313+
#else
314+
return PyThreadState_GetUnchecked();
315+
#endif
257316
}
258317

259-
// forward decl
260-
inline void translate_exception(std::exception_ptr);
318+
/// We use this counter to figure out if there are or have been multiple subinterpreters active at
319+
/// any point. This must never decrease while any interpreter may be running in any thread!
320+
inline std::atomic<int> &get_num_interpreters_seen() {
321+
static std::atomic<int> counter(0);
322+
return counter;
323+
}
324+
325+
template <typename InternalsType>
326+
inline std::unique_ptr<InternalsType> *&get_internals_pp() {
327+
#ifdef PYBIND11_SUBINTERPRETER_SUPPORT
328+
if (get_num_interpreters_seen() > 1) {
329+
// Internals is one per interpreter. When multiple interpreters are alive in different
330+
// threads we have to allow them to have different internals, so we need a thread_local.
331+
static thread_local std::unique_ptr<InternalsType> *t_internals_pp = nullptr;
332+
static thread_local PyInterpreterState *istate_cached = nullptr;
333+
// Whenever the interpreter changes on the current thread we need to invalidate the
334+
// internals_pp so that it can be pulled from the interpreter's state dict. That is slow,
335+
// so we use the current PyThreadState to check if it is necessary. The caller will see a
336+
// null return and do the fetch from the state dict or create a new one (as needed).
337+
auto *tstate = get_thread_state_unchecked();
338+
if (!tstate) {
339+
istate_cached = nullptr;
340+
t_internals_pp = nullptr;
341+
} else if (tstate->interp != istate_cached) {
342+
istate_cached = tstate->interp;
343+
t_internals_pp = nullptr;
344+
}
345+
return t_internals_pp;
346+
}
347+
#endif
348+
static std::unique_ptr<InternalsType> *s_internals_pp = nullptr;
349+
return s_internals_pp;
350+
}
261351

262352
template <class T,
263353
enable_if_t<std::is_same<std::nested_exception, remove_cvref_t<T>>::value, int> = 0>
@@ -384,49 +474,46 @@ inline object get_python_state_dict() {
384474
return state_dict;
385475
}
386476

387-
inline object get_internals_obj_from_state_dict(handle state_dict) {
388-
return reinterpret_steal<object>(
389-
dict_getitemstringref(state_dict.ptr(), PYBIND11_INTERNALS_ID));
390-
}
391-
392-
inline internals **get_internals_pp_from_capsule(handle obj) {
393-
void *raw_ptr = PyCapsule_GetPointer(obj.ptr(), /*name=*/nullptr);
394-
if (raw_ptr == nullptr) {
395-
raise_from(PyExc_SystemError, "pybind11::detail::get_internals_pp_from_capsule() FAILED");
396-
throw error_already_set();
477+
template <typename InternalsType>
478+
inline std::unique_ptr<InternalsType> *
479+
get_internals_pp_from_capsule_in_state_dict(dict &state_dict, char const *state_dict_key) {
480+
auto internals_obj
481+
= reinterpret_steal<object>(dict_getitemstringref(state_dict.ptr(), state_dict_key));
482+
if (internals_obj) {
483+
void *raw_ptr = PyCapsule_GetPointer(internals_obj.ptr(), /*name=*/nullptr);
484+
if (!raw_ptr) {
485+
raise_from(PyExc_SystemError,
486+
"pybind11::detail::get_internals_pp_from_capsule_in_state_dict() FAILED");
487+
throw error_already_set();
488+
}
489+
return reinterpret_cast<std::unique_ptr<InternalsType> *>(raw_ptr);
397490
}
398-
return static_cast<internals **>(raw_ptr);
399-
}
400-
401-
inline uint64_t round_up_to_next_pow2(uint64_t x) {
402-
// Round-up to the next power of two.
403-
// See https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
404-
x--;
405-
x |= (x >> 1);
406-
x |= (x >> 2);
407-
x |= (x >> 4);
408-
x |= (x >> 8);
409-
x |= (x >> 16);
410-
x |= (x >> 32);
411-
x++;
412-
return x;
491+
return nullptr;
413492
}
414493

415494
/// Return a reference to the current `internals` data
416495
PYBIND11_NOINLINE internals &get_internals() {
417-
auto **&internals_pp = get_internals_pp();
496+
auto *&internals_pp = get_internals_pp<internals>();
418497
if (internals_pp && *internals_pp) {
498+
// This is the fast path, everything is already setup, just return it
419499
return **internals_pp;
420500
}
421501

502+
// Slow path, something needs fetched from the state dict or created
503+
504+
// Cannot use py::gil_scoped_acquire inside get_internals since that calls get_internals.
422505
gil_scoped_acquire_simple gil;
423506
error_scope err_scope;
424507

425508
dict state_dict = get_python_state_dict();
426-
if (object internals_obj = get_internals_obj_from_state_dict(state_dict)) {
427-
internals_pp = get_internals_pp_from_capsule(internals_obj);
509+
internals_pp = get_internals_pp_from_capsule_in_state_dict<internals>(state_dict,
510+
PYBIND11_INTERNALS_ID);
511+
if (!internals_pp) {
512+
internals_pp = new std::unique_ptr<internals>;
513+
state_dict[PYBIND11_INTERNALS_ID] = capsule(reinterpret_cast<void *>(internals_pp));
428514
}
429-
if (internals_pp && *internals_pp) {
515+
516+
if (*internals_pp) {
430517
// We loaded the internals through `state_dict`, which means that our `error_already_set`
431518
// and `builtin_exception` may be different local classes than the ones set up in the
432519
// initial exception translator, below, so add another for our local exception classes.
@@ -435,68 +522,61 @@ PYBIND11_NOINLINE internals &get_internals() {
435522
// libc++ with CPython doesn't require this (types are explicitly exported)
436523
// libc++ with PyPy still need it, awaiting further investigation
437524
#if !defined(__GLIBCXX__)
438-
(*internals_pp)->registered_exception_translators.push_front(&translate_local_exception);
525+
if ((*internals_pp)->registered_exception_translators.empty()
526+
|| (*internals_pp)->registered_exception_translators.front()
527+
!= &translate_local_exception) {
528+
(*internals_pp)
529+
->registered_exception_translators.push_front(&translate_local_exception);
530+
}
439531
#endif
440532
} else {
441-
if (!internals_pp) {
442-
internals_pp = new internals *();
443-
}
444-
auto *&internals_ptr = *internals_pp;
445-
internals_ptr = new internals();
533+
auto &internals_ptr = *internals_pp;
534+
internals_ptr.reset(new internals());
446535

447-
PyThreadState *tstate = PyThreadState_Get();
448-
// NOLINTNEXTLINE(bugprone-assignment-in-if-condition)
449-
if (!PYBIND11_TLS_KEY_CREATE(internals_ptr->tstate)) {
450-
pybind11_fail("get_internals: could not successfully initialize the tstate TSS key!");
451-
}
452-
PYBIND11_TLS_REPLACE_VALUE(internals_ptr->tstate, tstate);
453-
454-
// NOLINTNEXTLINE(bugprone-assignment-in-if-condition)
455-
if (!PYBIND11_TLS_KEY_CREATE(internals_ptr->loader_life_support_tls_key)) {
456-
pybind11_fail("get_internals: could not successfully initialize the "
457-
"loader_life_support TSS key!");
458-
}
459-
460-
internals_ptr->istate = tstate->interp;
461-
state_dict[PYBIND11_INTERNALS_ID] = capsule(reinterpret_cast<void *>(internals_pp));
462-
internals_ptr->registered_exception_translators.push_front(&translate_exception);
463-
internals_ptr->static_property_type = make_static_property_type();
464-
internals_ptr->default_metaclass = make_default_metaclass();
465-
internals_ptr->instance_base = make_object_base_type(internals_ptr->default_metaclass);
466-
#ifdef Py_GIL_DISABLED
467-
// Scale proportional to the number of cores. 2x is a heuristic to reduce contention.
468-
auto num_shards
469-
= static_cast<size_t>(round_up_to_next_pow2(2 * std::thread::hardware_concurrency()));
470-
if (num_shards == 0) {
471-
num_shards = 1;
536+
if (!internals_ptr->instance_base) {
537+
// This calls get_internals, so cannot be called from within the internals constructor
538+
// called above because internals_ptr must be set before get_internals is called again
539+
internals_ptr->instance_base = make_object_base_type(internals_ptr->default_metaclass);
472540
}
473-
internals_ptr->instance_shards.reset(new instance_map_shard[num_shards]);
474-
internals_ptr->instance_shards_mask = num_shards - 1;
475-
#endif // Py_GIL_DISABLED
476541
}
542+
477543
return **internals_pp;
478544
}
479545

480-
// the internals struct (above) is shared between all the modules. local_internals are only
481-
// for a single module. Any changes made to internals may require an update to
482-
// PYBIND11_INTERNALS_VERSION, breaking backwards compatibility. local_internals is, by design,
483-
// restricted to a single module. Whether a module has local internals or not should not
484-
// impact any other modules, because the only things accessing the local internals is the
485-
// module that contains them.
486-
struct local_internals {
487-
type_map<type_info *> registered_types_cpp;
488-
std::forward_list<ExceptionTranslator> registered_exception_translators;
489-
};
546+
/// A string key uniquely describing this module
547+
inline char const *get_local_internals_id() {
548+
// Use the address of this static itself as part of the key, so that the value is uniquely tied
549+
// to where the module is loaded in memory
550+
static const std::string this_module_idstr
551+
= PYBIND11_MODULE_LOCAL_ID
552+
+ std::to_string(reinterpret_cast<uintptr_t>(&this_module_idstr));
553+
return this_module_idstr.c_str();
554+
}
490555

491556
/// Works like `get_internals`, but for things which are locally registered.
492557
inline local_internals &get_local_internals() {
493-
// Current static can be created in the interpreter finalization routine. If the later will be
494-
// destroyed in another static variable destructor, creation of this static there will cause
495-
// static deinitialization fiasco. In order to avoid it we avoid destruction of the
496-
// local_internals static. One can read more about the problem and current solution here:
497-
// https://google.github.io/styleguide/cppguide.html#Static_and_Global_Variables
498-
static auto *locals = new local_internals();
499-
return *locals;
558+
auto *&local_internals_pp = get_internals_pp<local_internals>();
559+
if (local_internals_pp && *local_internals_pp) {
560+
return **local_internals_pp;
561+
}
562+
563+
// Cannot use py::gil_scoped_acquire inside get_internals since that calls get_internals.
564+
gil_scoped_acquire_simple gil;
565+
error_scope err_scope;
566+
567+
dict state_dict = get_python_state_dict();
568+
local_internals_pp = get_internals_pp_from_capsule_in_state_dict<local_internals>(
569+
state_dict, get_local_internals_id());
570+
if (!local_internals_pp) {
571+
local_internals_pp = new std::unique_ptr<local_internals>;
572+
state_dict[get_local_internals_id()]
573+
= capsule(reinterpret_cast<void *>(local_internals_pp));
574+
}
575+
if (!*local_internals_pp) {
576+
local_internals_pp->reset(new local_internals());
577+
}
578+
579+
return **local_internals_pp;
500580
}
501581

502582
#ifdef Py_GIL_DISABLED

0 commit comments

Comments
 (0)