From 729654cf6f1b472f47acc98c3ab4890afbf3f08e Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sat, 13 Dec 2025 20:06:35 +0800 Subject: [PATCH 01/16] Add new argument to `gil_safe_call_once_and_store::call_once_and_store_result` --- include/pybind11/gil_safe_call_once.h | 29 +++++++++++++++++---------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 2abd8fc326..06fb9ef75e 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -8,8 +8,12 @@ #include #include -#ifdef Py_GIL_DISABLED +#if defined(Py_GIL_DISABLED) || defined(PYBIND11_HAS_SUBINTERPRETER_SUPPORT) # include + +using atomic_bool = std::atomic_bool; +#else +using atomic_bool = bool; #endif PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) @@ -53,7 +57,8 @@ class gil_safe_call_once_and_store { public: // PRECONDITION: The GIL must be held when `call_once_and_store_result()` is called. template - gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn) { + gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, + void (*finalize_fn)(T &) = nullptr) { if (!is_initialized_) { // This read is guarded by the GIL. // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. @@ -61,8 +66,9 @@ class gil_safe_call_once_and_store { std::call_once(once_flag_, [&] { // Only one thread will ever enter here. gil_scoped_acquire gil_acq; - ::new (storage_) T(fn()); // fn may release, but will reacquire, the GIL. - is_initialized_ = true; // This write is guarded by the GIL. + ::new (storage_) T(fn()); // fn may release, but will reacquire, the GIL. + finalize_fn_ = finalize_fn; // Store the finalizer. + is_initialized_ = true; // This write is guarded by the GIL. }); // All threads will observe `is_initialized_` as true here. } @@ -83,20 +89,21 @@ class gil_safe_call_once_and_store { } constexpr gil_safe_call_once_and_store() = default; - PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; + PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() { + if (is_initialized_ && finalize_fn_ != nullptr) { + finalize_fn_(*reinterpret_cast(storage_)); + } + } private: alignas(T) char storage_[sizeof(T)] = {}; std::once_flag once_flag_; -#ifdef Py_GIL_DISABLED - std::atomic_bool -#else - bool -#endif - is_initialized_{false}; + void (*finalize_fn_)(T &) = nullptr; + // The `is_initialized_`-`storage_` pair is very similar to `std::optional`, // but the latter does not have the triviality properties of former, // therefore `std::optional` is not a viable alternative here. + atomic_bool is_initialized_{false}; }; PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) From d2b76050a11ed5284903c69f9d9e01054d5754f6 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 14 Dec 2025 01:03:04 +0800 Subject: [PATCH 02/16] Add per-interpreter storage for `gil_safe_call_once_and_store` --- include/pybind11/detail/internals.h | 39 +++++++++++- include/pybind11/gil_safe_call_once.h | 91 ++++++++++++++++++++++++++- 2 files changed, 126 insertions(+), 4 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 858de67525..d5c4da1acf 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -39,7 +39,7 @@ /// further ABI-incompatible changes may be made before the ABI is officially /// changed to the new version. #ifndef PYBIND11_INTERNALS_VERSION -# define PYBIND11_INTERNALS_VERSION 11 +# define PYBIND11_INTERNALS_VERSION 12 #endif #if PYBIND11_INTERNALS_VERSION < 11 @@ -234,6 +234,34 @@ inline uint64_t round_up_to_next_pow2(uint64_t x) { class loader_life_support; +struct call_once_storage_base { + call_once_storage_base() = default; + virtual ~call_once_storage_base() = default; + call_once_storage_base(const call_once_storage_base &) = delete; + call_once_storage_base(call_once_storage_base &&) = delete; + call_once_storage_base &operator=(const call_once_storage_base &) = delete; + call_once_storage_base &operator=(call_once_storage_base &&) = delete; +}; + +template +struct call_once_storage : call_once_storage_base { + void (*finalize)(T &) = nullptr; + alignas(T) char storage[sizeof(T)] = {0}; + + call_once_storage() = default; + ~call_once_storage() override { + if (finalize != nullptr) { + finalize(*reinterpret_cast(storage)); + } + memset(storage, 0, sizeof(T)); + finalize = nullptr; + }; + call_once_storage(const call_once_storage &) = delete; + call_once_storage(call_once_storage &&) = delete; + call_once_storage &operator=(const call_once_storage &) = delete; + call_once_storage &operator=(call_once_storage &&) = delete; +}; + /// Internal data structure used to track registered instances and types. /// Whenever binary incompatible changes are made to this structure, /// `PYBIND11_INTERNALS_VERSION` must be incremented. @@ -283,6 +311,8 @@ struct internals { type_map native_enum_type_map; + std::unordered_map call_once_storage_map; + internals() : static_property_type(make_static_property_type()), default_metaclass(make_default_metaclass()) { @@ -308,7 +338,12 @@ struct internals { internals(internals &&other) = delete; internals &operator=(const internals &other) = delete; internals &operator=(internals &&other) = delete; - ~internals() = default; + ~internals() { + for (auto &[_, storage_ptr] : call_once_storage_map) { + delete storage_ptr; + } + call_once_storage_map.clear(); + } }; // the internals struct (above) is shared between all the modules. local_internals are only diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 06fb9ef75e..a848404eaf 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -3,6 +3,7 @@ #pragma once #include "detail/common.h" +#include "detail/internals.h" #include "gil.h" #include @@ -52,6 +53,7 @@ PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) // functions, which is usually the case. // // For in-depth background, see docs/advanced/deadlock.md +#ifndef PYBIND11_HAS_SUBINTERPRETER_SUPPORT template class gil_safe_call_once_and_store { public: @@ -59,6 +61,7 @@ class gil_safe_call_once_and_store { template gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, void (*finalize_fn)(T &) = nullptr) { + if (!is_initialized_) { // This read is guarded by the GIL. // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. @@ -80,10 +83,10 @@ class gil_safe_call_once_and_store { T &get_stored() { assert(is_initialized_); PYBIND11_WARNING_PUSH -#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 +# if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 // Needed for gcc 4.8.5 PYBIND11_WARNING_DISABLE_GCC("-Wstrict-aliasing") -#endif +# endif return *reinterpret_cast(storage_); PYBIND11_WARNING_POP } @@ -96,6 +99,7 @@ class gil_safe_call_once_and_store { } private: + // Global static storage (per process) when subinterpreter support is disabled. alignas(T) char storage_[sizeof(T)] = {}; std::once_flag once_flag_; void (*finalize_fn_)(T &) = nullptr; @@ -105,5 +109,88 @@ class gil_safe_call_once_and_store { // therefore `std::optional` is not a viable alternative here. atomic_bool is_initialized_{false}; }; +#else +// Subinterpreter support is enabled. +// In this case, we should store the result per-interpreter instead of globally, because +// each subinterpreter has its own separate state. The cached object may not shareable +// across interpreters (e.g., imported modules and their members). +template +class gil_safe_call_once_and_store { +public: + // PRECONDITION: The GIL must be held when `call_once_and_store_result()` is called. + template + gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, + void (*finalize_fn)(T &) = nullptr) { + if (!is_initialized_by_atleast_one_interpreter_ + || detail::get_num_interpreters_seen() > 1) { + detail::with_internals([&](detail::internals &internals) { + const void *key = reinterpret_cast(this); + auto &storage_map = internals.call_once_storage_map; + auto it = storage_map.find(key); + if (it == storage_map.end()) { + gil_scoped_release gil_rel; // Needed to establish lock ordering. + { + // Only one thread will ever enter here. + gil_scoped_acquire gil_acq; + auto s = new detail::call_once_storage{}; + ::new (s->storage) T(fn()); // fn may release, but will reacquire, the GIL. + s->finalize = finalize_fn; + last_storage_ = reinterpret_cast(s->storage); + storage_map.emplace(key, s); + }; + } + is_initialized_by_atleast_one_interpreter_ = true; + }); + // All threads will observe `is_initialized_by_atleast_one_interp_` as true here. + } + // Intentionally not returning `T &` to ensure the calling code is self-documenting. + return *this; + } + // This must only be called after `call_once_and_store_result()` was called. + T &get_stored() { + T *result = last_storage_; + if (!is_initialized_by_atleast_one_interpreter_ + || detail::get_num_interpreters_seen() > 1) { + detail::with_internals([&](detail::internals &internals) { + const void *key = reinterpret_cast(this); + auto &storage_map = internals.call_once_storage_map; + auto it = storage_map.find(key); + assert(it != storage_map.end()); + auto *s = static_cast *>(it->second); + result = last_storage_ = reinterpret_cast(s->storage); + }); + } + assert(result != nullptr); + return *result; + } + + constexpr gil_safe_call_once_and_store() = default; + PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() { + if (is_initialized_by_atleast_one_interpreter_) { + detail::with_internals([&](detail::internals &internals) { + const void *key = reinterpret_cast(this); + auto &storage_map = internals.call_once_storage_map; + auto it = storage_map.find(key); + if (it != storage_map.end()) { + delete it->second; + storage_map.erase(it); + } + }); + } + } + +private: + // No storage needed when subinterpreter support is enabled. + // The actual storage is stored in the per-interpreter state dict in + // `internals.call_once_storage_map`. + + // Fast local cache to avoid repeated lookups when there are no multiple interpreters. + // This is only valid if there is a single interpreter. Otherwise, it is not used. + T *last_storage_ = nullptr; + // This flag is true if the value has been initialized by any interpreter (may not be the + // current one). + atomic_bool is_initialized_by_atleast_one_interpreter_{false}; +}; +#endif PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) From e7417606e8979f948be47d0512bfaf9a21d2953f Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 14 Dec 2025 10:56:19 +0800 Subject: [PATCH 03/16] Make `~gil_safe_call_once_and_store` a no-op --- include/pybind11/gil_safe_call_once.h | 57 ++++++++++----------------- 1 file changed, 21 insertions(+), 36 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index a848404eaf..10ba995dcc 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -60,8 +60,7 @@ class gil_safe_call_once_and_store { // PRECONDITION: The GIL must be held when `call_once_and_store_result()` is called. template gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, - void (*finalize_fn)(T &) = nullptr) { - + void (*)(T &) /*unused*/ = nullptr) { if (!is_initialized_) { // This read is guarded by the GIL. // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. @@ -69,9 +68,8 @@ class gil_safe_call_once_and_store { std::call_once(once_flag_, [&] { // Only one thread will ever enter here. gil_scoped_acquire gil_acq; - ::new (storage_) T(fn()); // fn may release, but will reacquire, the GIL. - finalize_fn_ = finalize_fn; // Store the finalizer. - is_initialized_ = true; // This write is guarded by the GIL. + ::new (storage_) T(fn()); // fn may release, but will reacquire, the GIL. + is_initialized_ = true; // This write is guarded by the GIL. }); // All threads will observe `is_initialized_` as true here. } @@ -92,17 +90,15 @@ class gil_safe_call_once_and_store { } constexpr gil_safe_call_once_and_store() = default; - PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() { - if (is_initialized_ && finalize_fn_ != nullptr) { - finalize_fn_(*reinterpret_cast(storage_)); - } - } + // The instance is a global static, so its destructor runs when the process + // is terminating. Therefore, do nothing here because the Python interpreter + // may have been finalized already. + PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; private: // Global static storage (per process) when subinterpreter support is disabled. alignas(T) char storage_[sizeof(T)] = {}; std::once_flag once_flag_; - void (*finalize_fn_)(T &) = nullptr; // The `is_initialized_`-`storage_` pair is very similar to `std::optional`, // but the latter does not have the triviality properties of former, @@ -124,19 +120,19 @@ class gil_safe_call_once_and_store { if (!is_initialized_by_atleast_one_interpreter_ || detail::get_num_interpreters_seen() > 1) { detail::with_internals([&](detail::internals &internals) { - const void *key = reinterpret_cast(this); + const void *k = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; - auto it = storage_map.find(key); + auto it = storage_map.find(k); if (it == storage_map.end()) { gil_scoped_release gil_rel; // Needed to establish lock ordering. { // Only one thread will ever enter here. gil_scoped_acquire gil_acq; - auto s = new detail::call_once_storage{}; - ::new (s->storage) T(fn()); // fn may release, but will reacquire, the GIL. - s->finalize = finalize_fn; - last_storage_ = reinterpret_cast(s->storage); - storage_map.emplace(key, s); + auto v = new detail::call_once_storage{}; + ::new (v->storage) T(fn()); // fn may release, but will reacquire, the GIL. + v->finalize = finalize_fn; + last_storage_ = reinterpret_cast(v->storage); + storage_map.emplace(k, v); }; } is_initialized_by_atleast_one_interpreter_ = true; @@ -153,12 +149,10 @@ class gil_safe_call_once_and_store { if (!is_initialized_by_atleast_one_interpreter_ || detail::get_num_interpreters_seen() > 1) { detail::with_internals([&](detail::internals &internals) { - const void *key = reinterpret_cast(this); + const void *k = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; - auto it = storage_map.find(key); - assert(it != storage_map.end()); - auto *s = static_cast *>(it->second); - result = last_storage_ = reinterpret_cast(s->storage); + auto *v = static_cast *>(storage_map.at(k)); + result = last_storage_ = reinterpret_cast(v->storage); }); } assert(result != nullptr); @@ -166,19 +160,10 @@ class gil_safe_call_once_and_store { } constexpr gil_safe_call_once_and_store() = default; - PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() { - if (is_initialized_by_atleast_one_interpreter_) { - detail::with_internals([&](detail::internals &internals) { - const void *key = reinterpret_cast(this); - auto &storage_map = internals.call_once_storage_map; - auto it = storage_map.find(key); - if (it != storage_map.end()) { - delete it->second; - storage_map.erase(it); - } - }); - } - } + // The instance is a global static, so its destructor runs when the process + // is terminating. Therefore, do nothing here because the Python interpreter + // may have been finalized already. + PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; private: // No storage needed when subinterpreter support is enabled. From 5d1d6782b9fa7cc6f705b6adad1828ac2c66ec5a Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 14 Dec 2025 12:15:30 +0800 Subject: [PATCH 04/16] Fix C++11 compatibility --- include/pybind11/detail/internals.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index d5c4da1acf..046e47314f 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -339,8 +339,8 @@ struct internals { internals &operator=(const internals &other) = delete; internals &operator=(internals &&other) = delete; ~internals() { - for (auto &[_, storage_ptr] : call_once_storage_map) { - delete storage_ptr; + for (auto &entry : call_once_storage_map) { + delete entry.second; } call_once_storage_map.clear(); } From 0bac82df687e2bdd919c653b9ee0d1fecd155fa5 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 14 Dec 2025 13:06:15 +0800 Subject: [PATCH 05/16] Improve thread-safety and add default finalizer --- include/pybind11/detail/internals.h | 11 +++-- include/pybind11/gil_safe_call_once.h | 60 +++++++++++++++++---------- 2 files changed, 46 insertions(+), 25 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 046e47314f..dd0c2af957 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -247,14 +247,17 @@ template struct call_once_storage : call_once_storage_base { void (*finalize)(T &) = nullptr; alignas(T) char storage[sizeof(T)] = {0}; + std::atomic_bool is_initialized{false}; call_once_storage() = default; ~call_once_storage() override { - if (finalize != nullptr) { - finalize(*reinterpret_cast(storage)); + if (is_initialized) { + if (finalize != nullptr) { + finalize(*reinterpret_cast(storage)); + } else { + reinterpret_cast(storage)->~T(); + } } - memset(storage, 0, sizeof(T)); - finalize = nullptr; }; call_once_storage(const call_once_storage &) = delete; call_once_storage(call_once_storage &&) = delete; diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 10ba995dcc..5904f97ba4 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -54,6 +54,11 @@ PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) // // For in-depth background, see docs/advanced/deadlock.md #ifndef PYBIND11_HAS_SUBINTERPRETER_SUPPORT +// Subinterpreter support is disabled. +// In this case, we can store the result globally, because there is only a single interpreter. +// +// The life span of the stored result is the entire process lifetime. It is leaked on process +// termination to avoid destructor calls after the Python interpreter was finalized. template class gil_safe_call_once_and_store { public: @@ -107,9 +112,12 @@ class gil_safe_call_once_and_store { }; #else // Subinterpreter support is enabled. -// In this case, we should store the result per-interpreter instead of globally, because -// each subinterpreter has its own separate state. The cached object may not shareable -// across interpreters (e.g., imported modules and their members). +// In this case, we should store the result per-interpreter instead of globally, because each +// subinterpreter has its own separate state. The cached result may not shareable across +// interpreters (e.g., imported modules and their members). +// +// The life span of the stored result is the entire interpreter lifetime. An additional +// `finalize_fn` can be provided to clean up the stored result when the interpreter is destroyed. template class gil_safe_call_once_and_store { public: @@ -117,26 +125,32 @@ class gil_safe_call_once_and_store { template gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn, void (*finalize_fn)(T &) = nullptr) { - if (!is_initialized_by_atleast_one_interpreter_ - || detail::get_num_interpreters_seen() > 1) { - detail::with_internals([&](detail::internals &internals) { - const void *k = reinterpret_cast(this); - auto &storage_map = internals.call_once_storage_map; - auto it = storage_map.find(k); - if (it == storage_map.end()) { - gil_scoped_release gil_rel; // Needed to establish lock ordering. - { - // Only one thread will ever enter here. - gil_scoped_acquire gil_acq; + if (!is_last_storage_valid()) { + // Multiple threads may enter here, because the GIL is released in the next line and + // CPython API calls in the `fn()` call below may release and reacquire the GIL. + gil_scoped_release gil_rel; // Needed to establish lock ordering. + { + gil_scoped_acquire gil_acq; + detail::with_internals([&](detail::internals &internals) { + // The concurrency control is done inside `detail::with_internals`. + // At most one thread will enter here at a time. + const void *k = reinterpret_cast(this); + auto &storage_map = internals.call_once_storage_map; + // There can be multiple threads going through here, but only one each at a + // time. So only one thread will create the storage. Other threads will find it + // already created. + auto it = storage_map.find(k); + if (it == storage_map.end()) { auto v = new detail::call_once_storage{}; ::new (v->storage) T(fn()); // fn may release, but will reacquire, the GIL. v->finalize = finalize_fn; last_storage_ = reinterpret_cast(v->storage); + v->is_initialized = true; storage_map.emplace(k, v); - }; - } - is_initialized_by_atleast_one_interpreter_ = true; - }); + } + is_initialized_by_atleast_one_interpreter_ = true; + }); + } // All threads will observe `is_initialized_by_atleast_one_interp_` as true here. } // Intentionally not returning `T &` to ensure the calling code is self-documenting. @@ -146,8 +160,7 @@ class gil_safe_call_once_and_store { // This must only be called after `call_once_and_store_result()` was called. T &get_stored() { T *result = last_storage_; - if (!is_initialized_by_atleast_one_interpreter_ - || detail::get_num_interpreters_seen() > 1) { + if (!is_last_storage_valid()) { detail::with_internals([&](detail::internals &internals) { const void *k = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; @@ -159,13 +172,18 @@ class gil_safe_call_once_and_store { return *result; } - constexpr gil_safe_call_once_and_store() = default; + gil_safe_call_once_and_store() = default; // The instance is a global static, so its destructor runs when the process // is terminating. Therefore, do nothing here because the Python interpreter // may have been finalized already. PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; private: + bool is_last_storage_valid() const { + return is_initialized_by_atleast_one_interpreter_ + && detail::get_num_interpreters_seen() <= 1 && last_storage_ != nullptr; + } + // No storage needed when subinterpreter support is enabled. // The actual storage is stored in the per-interpreter state dict in // `internals.call_once_storage_map`. From be971103aad809575d22db6bcc5aa56c8215b2c4 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Sun, 14 Dec 2025 16:24:06 +0800 Subject: [PATCH 06/16] Try fix thread-safety --- include/pybind11/gil_safe_call_once.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 5904f97ba4..2bedb6d665 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -130,7 +130,6 @@ class gil_safe_call_once_and_store { // CPython API calls in the `fn()` call below may release and reacquire the GIL. gil_scoped_release gil_rel; // Needed to establish lock ordering. { - gil_scoped_acquire gil_acq; detail::with_internals([&](detail::internals &internals) { // The concurrency control is done inside `detail::with_internals`. // At most one thread will enter here at a time. @@ -141,10 +140,11 @@ class gil_safe_call_once_and_store { // already created. auto it = storage_map.find(k); if (it == storage_map.end()) { + gil_scoped_acquire gil_acq; auto v = new detail::call_once_storage{}; ::new (v->storage) T(fn()); // fn may release, but will reacquire, the GIL. v->finalize = finalize_fn; - last_storage_ = reinterpret_cast(v->storage); + last_storage_ptr_ = reinterpret_cast(v->storage); v->is_initialized = true; storage_map.emplace(k, v); } @@ -159,13 +159,13 @@ class gil_safe_call_once_and_store { // This must only be called after `call_once_and_store_result()` was called. T &get_stored() { - T *result = last_storage_; + T *result = last_storage_ptr_; if (!is_last_storage_valid()) { detail::with_internals([&](detail::internals &internals) { const void *k = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; auto *v = static_cast *>(storage_map.at(k)); - result = last_storage_ = reinterpret_cast(v->storage); + result = last_storage_ptr_ = reinterpret_cast(v->storage); }); } assert(result != nullptr); @@ -181,7 +181,7 @@ class gil_safe_call_once_and_store { private: bool is_last_storage_valid() const { return is_initialized_by_atleast_one_interpreter_ - && detail::get_num_interpreters_seen() <= 1 && last_storage_ != nullptr; + && detail::get_num_interpreters_seen() <= 1; } // No storage needed when subinterpreter support is enabled. @@ -190,7 +190,7 @@ class gil_safe_call_once_and_store { // Fast local cache to avoid repeated lookups when there are no multiple interpreters. // This is only valid if there is a single interpreter. Otherwise, it is not used. - T *last_storage_ = nullptr; + T *last_storage_ptr_ = nullptr; // This flag is true if the value has been initialized by any interpreter (may not be the // current one). atomic_bool is_initialized_by_atleast_one_interpreter_{false}; From 3e77ce953a740fe2182af686723901bde05cc2a5 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 15 Dec 2025 01:15:46 +0800 Subject: [PATCH 07/16] Try fix thread-safety --- include/pybind11/detail/internals.h | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index dd0c2af957..b5e9d6eb7c 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -602,27 +602,26 @@ class internals_pp_manager { /// acquire the GIL. Will never return nullptr. std::unique_ptr *get_pp() { #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT - if (get_num_interpreters_seen() > 1) { - // Whenever the interpreter changes on the current thread we need to invalidate the - // internals_pp so that it can be pulled from the interpreter's state dict. That is - // slow, so we use the current PyThreadState to check if it is necessary. - auto *tstate = get_thread_state_unchecked(); - if (!tstate || tstate->interp != last_istate_tls()) { - gil_scoped_acquire_simple gil; - if (!tstate) { - tstate = get_thread_state_unchecked(); - } - last_istate_tls() = tstate->interp; - internals_p_tls() = get_or_create_pp_in_state_dict(); + // Whenever the interpreter changes on the current thread we need to invalidate the + // internals_pp so that it can be pulled from the interpreter's state dict. That is + // slow, so we use the current PyThreadState to check if it is necessary. + auto *tstate = get_thread_state_unchecked(); + if (!tstate || tstate->interp != last_istate_tls()) { + gil_scoped_acquire_simple gil; + if (!tstate) { + tstate = get_thread_state_unchecked(); } - return internals_p_tls(); + last_istate_tls() = tstate->interp; + internals_p_tls() = get_or_create_pp_in_state_dict(); } -#endif + return internals_p_tls(); +#else if (!internals_singleton_pp_) { gil_scoped_acquire_simple gil; internals_singleton_pp_ = get_or_create_pp_in_state_dict(); } return internals_singleton_pp_; +#endif } /// Drop all the references we're currently holding. From d5b8813a66f2b66dcc7419e87d16401356033159 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 15 Dec 2025 14:36:35 +0800 Subject: [PATCH 08/16] Add a warning comment --- include/pybind11/detail/internals.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index b5e9d6eb7c..e22e94ffe5 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -602,6 +602,19 @@ class internals_pp_manager { /// acquire the GIL. Will never return nullptr. std::unique_ptr *get_pp() { #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT + // WARNING: We cannot use `get_num_interpreters_seen() > 1` here to create a fast path for + // the single-interpreter case. + // + // For multi-interpreter support, the subinterpreters can be initialized concurrently, and + // the first time this function may not be called in the main interpreter. + // For example, a clean main interpreter that does not import any pybind11 module and then + // spawns multiple subinterpreters using `InterpreterPoolExecutor` that each imports a + // pybind11 module concurrently. + // + // Multiple subinterpreters may observe `get_num_interpreters_seen() <= 1` at the same + // time, while `get_num_interpreters_seen() += 1` in `PYBIND11_MODULE(...)` is called + // later. + // Whenever the interpreter changes on the current thread we need to invalidate the // internals_pp so that it can be pulled from the interpreter's state dict. That is // slow, so we use the current PyThreadState to check if it is necessary. From f6d0f88bd6a29858e92da1503362bfdab2a86c39 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 15 Dec 2025 18:16:16 +0800 Subject: [PATCH 09/16] Simplify `PYBIND11_INTERNALS_VERSION >= 12` --- include/pybind11/detail/class.h | 2 -- include/pybind11/detail/internals.h | 11 ++--------- include/pybind11/detail/type_caster_base.h | 10 ++-------- include/pybind11/gil_safe_call_once.h | 1 + include/pybind11/pybind11.h | 4 ---- 5 files changed, 5 insertions(+), 23 deletions(-) diff --git a/include/pybind11/detail/class.h b/include/pybind11/detail/class.h index 21e966cfea..1cd9af0bd1 100644 --- a/include/pybind11/detail/class.h +++ b/include/pybind11/detail/class.h @@ -226,14 +226,12 @@ extern "C" inline void pybind11_meta_dealloc(PyObject *obj) { local_internals.registered_types_cpp.erase(tinfo->cpptype); } else { internals.registered_types_cpp.erase(tindex); -#if PYBIND11_INTERNALS_VERSION >= 12 internals.registered_types_cpp_fast.erase(tinfo->cpptype); for (const std::type_info *alias : tinfo->alias_chain) { auto num_erased = internals.registered_types_cpp_fast.erase(alias); (void) num_erased; assert(num_erased > 0); } -#endif } internals.registered_types_py.erase(tinfo->type); diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index e22e94ffe5..b67b9ce6d4 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -42,8 +42,8 @@ # define PYBIND11_INTERNALS_VERSION 12 #endif -#if PYBIND11_INTERNALS_VERSION < 11 -# error "PYBIND11_INTERNALS_VERSION 11 is the minimum for all platforms for pybind11v3." +#if PYBIND11_INTERNALS_VERSION < 12 +# error "PYBIND11_INTERNALS_VERSION 12 is the minimum for all platforms for pybind11v3." #endif PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) @@ -273,14 +273,12 @@ struct internals { pymutex mutex; pymutex exception_translator_mutex; #endif -#if PYBIND11_INTERNALS_VERSION >= 12 // non-normative but fast "hint" for registered_types_cpp. Meant // to be used as the first level of a two-level lookup: successful // lookups are correct, but unsuccessful lookups need to try // registered_types_cpp and then backfill this map if they find // anything. fast_type_map registered_types_cpp_fast; -#endif // std::type_index -> pybind11's type information type_map registered_types_cpp; @@ -306,9 +304,6 @@ struct internals { PyObject *instance_base = nullptr; // Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined: thread_specific_storage tstate; -#if PYBIND11_INTERNALS_VERSION <= 11 - thread_specific_storage loader_life_support_tls; // OBSOLETE (PR #5830) -#endif // Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined: PyInterpreterState *istate = nullptr; @@ -396,7 +391,6 @@ struct type_info { void *(*module_local_load)(PyObject *, const type_info *) = nullptr; holder_enum_t holder_enum_v = holder_enum_t::undefined; -#if PYBIND11_INTERNALS_VERSION >= 12 // When a type appears in multiple DSOs, // internals::registered_types_cpp_fast will have multiple distinct // keys (the std::type_info from each DSO) mapped to the same @@ -407,7 +401,6 @@ struct type_info { // nb_alias_chain` added in // https://github.com/wjakob/nanobind/commit/b515b1f7f2f4ecc0357818e6201c94a9f4cbfdc2 std::forward_list alias_chain; -#endif /* A simple type never occurs as a (direct or indirect) parent * of a class that makes use of multiple inheritance. diff --git a/include/pybind11/detail/type_caster_base.h b/include/pybind11/detail/type_caster_base.h index b0c59e1138..21b7f0950e 100644 --- a/include/pybind11/detail/type_caster_base.h +++ b/include/pybind11/detail/type_caster_base.h @@ -227,32 +227,26 @@ inline detail::type_info *get_global_type_info_lock_held(const std::type_info &t // next time. detail::type_info *type_info = nullptr; auto &internals = get_internals(); -#if PYBIND11_INTERNALS_VERSION >= 12 auto &fast_types = internals.registered_types_cpp_fast; -#endif auto &types = internals.registered_types_cpp; -#if PYBIND11_INTERNALS_VERSION >= 12 auto fast_it = fast_types.find(&tp); if (fast_it != fast_types.end()) { -# ifndef NDEBUG +#ifndef NDEBUG auto types_it = types.find(std::type_index(tp)); assert(types_it != types.end()); assert(types_it->second == fast_it->second); -# endif +#endif return fast_it->second; } -#endif // PYBIND11_INTERNALS_VERSION >= 12 auto it = types.find(std::type_index(tp)); if (it != types.end()) { -#if PYBIND11_INTERNALS_VERSION >= 12 // We found the type in the slow map but not the fast one, so // some other DSO added it (otherwise it would be in the fast // map under &tp) and therefore we must be an alias. Record // that. it->second->alias_chain.push_front(&tp); fast_types.emplace(&tp, it->second); -#endif type_info = it->second; } return type_info; diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 2bedb6d665..a0d74bc6f3 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -196,4 +196,5 @@ class gil_safe_call_once_and_store { atomic_bool is_initialized_by_atleast_one_interpreter_{false}; }; #endif + PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/include/pybind11/pybind11.h b/include/pybind11/pybind11.h index 91b38d91ed..8bd62c85c9 100644 --- a/include/pybind11/pybind11.h +++ b/include/pybind11/pybind11.h @@ -1692,9 +1692,7 @@ class generic_type : public object { local_internals.registered_types_cpp[rec.type] = tinfo; } else { internals.registered_types_cpp[tindex] = tinfo; -#if PYBIND11_INTERNALS_VERSION >= 12 internals.registered_types_cpp_fast[rec.type] = tinfo; -#endif } PYBIND11_WARNING_PUSH @@ -2201,9 +2199,7 @@ class class_ : public detail::generic_type { type_info *const val = internals.registered_types_cpp[std::type_index(typeid(type))]; internals.registered_types_cpp[std::type_index(typeid(type_alias))] = val; -#if PYBIND11_INTERNALS_VERSION >= 12 internals.registered_types_cpp_fast[&typeid(type_alias)] = val; -#endif } }); } From 7d8339eff5998b33c5455c3f6937756e3168d6fa Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 15 Dec 2025 18:45:23 +0800 Subject: [PATCH 10/16] Try fix thread-safety --- include/pybind11/detail/internals.h | 4 ++-- include/pybind11/gil_safe_call_once.h | 28 +++++++++++++-------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index b67b9ce6d4..4abf7d41df 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -245,8 +245,8 @@ struct call_once_storage_base { template struct call_once_storage : call_once_storage_base { - void (*finalize)(T &) = nullptr; alignas(T) char storage[sizeof(T)] = {0}; + void (*finalize)(T &) = nullptr; std::atomic_bool is_initialized{false}; call_once_storage() = default; @@ -337,7 +337,7 @@ struct internals { internals &operator=(const internals &other) = delete; internals &operator=(internals &&other) = delete; ~internals() { - for (auto &entry : call_once_storage_map) { + for (const auto &entry : call_once_storage_map) { delete entry.second; } call_once_storage_map.clear(); diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index a0d74bc6f3..98e7149947 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -131,22 +131,22 @@ class gil_safe_call_once_and_store { gil_scoped_release gil_rel; // Needed to establish lock ordering. { detail::with_internals([&](detail::internals &internals) { - // The concurrency control is done inside `detail::with_internals`. - // At most one thread will enter here at a time. - const void *k = reinterpret_cast(this); + const void *key = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; - // There can be multiple threads going through here, but only one each at a - // time. So only one thread will create the storage. Other threads will find it - // already created. - auto it = storage_map.find(k); - if (it == storage_map.end()) { + // There can be multiple threads going through here. + if (storage_map.find(key) == storage_map.end()) { gil_scoped_acquire gil_acq; - auto v = new detail::call_once_storage{}; - ::new (v->storage) T(fn()); // fn may release, but will reacquire, the GIL. - v->finalize = finalize_fn; - last_storage_ptr_ = reinterpret_cast(v->storage); - v->is_initialized = true; - storage_map.emplace(k, v); + // Only one thread will enter here at a time. + // Fast recheck to avoid double work. + if (storage_map.find(key) == storage_map.end()) { + auto value = new detail::call_once_storage{}; + // fn may release, but will reacquire, the GIL. + ::new (value->storage) T(fn()); + value->finalize = finalize_fn; + value->is_initialized = true; + storage_map.emplace(key, value); + last_storage_ptr_ = reinterpret_cast(value->storage); + } } is_initialized_by_atleast_one_interpreter_ = true; }); From 1920f4345a61acbd444f0d3309a124ac7dee895d Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Mon, 15 Dec 2025 19:48:42 +0800 Subject: [PATCH 11/16] Try fix thread-safety --- include/pybind11/detail/internals.h | 3 +- include/pybind11/gil_safe_call_once.h | 50 +++++++++++++++------------ 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 4abf7d41df..802a57e3e5 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -245,7 +245,8 @@ struct call_once_storage_base { template struct call_once_storage : call_once_storage_base { - alignas(T) char storage[sizeof(T)] = {0}; + alignas(T) char storage[sizeof(T)] = {}; + std::once_flag once_flag; void (*finalize)(T &) = nullptr; std::atomic_bool is_initialized{false}; diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 98e7149947..e00bbb9f06 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -129,28 +129,34 @@ class gil_safe_call_once_and_store { // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. gil_scoped_release gil_rel; // Needed to establish lock ordering. - { - detail::with_internals([&](detail::internals &internals) { - const void *key = reinterpret_cast(this); - auto &storage_map = internals.call_once_storage_map; - // There can be multiple threads going through here. - if (storage_map.find(key) == storage_map.end()) { - gil_scoped_acquire gil_acq; - // Only one thread will enter here at a time. - // Fast recheck to avoid double work. - if (storage_map.find(key) == storage_map.end()) { - auto value = new detail::call_once_storage{}; - // fn may release, but will reacquire, the GIL. - ::new (value->storage) T(fn()); - value->finalize = finalize_fn; - value->is_initialized = true; - storage_map.emplace(key, value); - last_storage_ptr_ = reinterpret_cast(value->storage); - } + detail::with_internals([&](detail::internals &internals) { + const void *key = reinterpret_cast(this); + auto &storage_map = internals.call_once_storage_map; + // There can be multiple threads going through here. + detail::call_once_storage *value = nullptr; + { + gil_scoped_acquire gil_acq; + // Only one thread will enter here at a time. + const auto it = storage_map.find(key); + if (it != storage_map.end()) { + value = static_cast *>(it->second); + } else { + value = new detail::call_once_storage{}; + storage_map.emplace(key, value); } + } + assert(value != nullptr); + std::call_once(value->once_flag, [&] { + // Only one thread will ever enter here. + gil_scoped_acquire gil_acq; + // fn may release, but will reacquire, the GIL. + ::new (value->storage) T(fn()); + value->finalize = finalize_fn; + value->is_initialized = true; + last_storage_ptr_ = reinterpret_cast(value->storage); is_initialized_by_atleast_one_interpreter_ = true; }); - } + }); // All threads will observe `is_initialized_by_atleast_one_interp_` as true here. } // Intentionally not returning `T &` to ensure the calling code is self-documenting. @@ -162,10 +168,10 @@ class gil_safe_call_once_and_store { T *result = last_storage_ptr_; if (!is_last_storage_valid()) { detail::with_internals([&](detail::internals &internals) { - const void *k = reinterpret_cast(this); + const void *key = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; - auto *v = static_cast *>(storage_map.at(k)); - result = last_storage_ptr_ = reinterpret_cast(v->storage); + auto *value = static_cast *>(storage_map.at(key)); + result = last_storage_ptr_ = reinterpret_cast(value->storage); }); } assert(result != nullptr); From a6754ba40d2326c3680984b52a6f893cd89d57bd Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Tue, 16 Dec 2025 15:57:09 +0800 Subject: [PATCH 12/16] Revert get_pp() --- include/pybind11/detail/internals.h | 56 +++++++++++++++++------------ 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 802a57e3e5..c157bf53cb 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -596,39 +596,46 @@ class internals_pp_manager { /// acquire the GIL. Will never return nullptr. std::unique_ptr *get_pp() { #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT - // WARNING: We cannot use `get_num_interpreters_seen() > 1` here to create a fast path for - // the single-interpreter case. + // FIXME: We cannot use `get_num_interpreters_seen() > 1` here to create a fast path for + // the multi-interpreter case. The singleton may be initialized by a subinterpreter not the + // main interpreter. // // For multi-interpreter support, the subinterpreters can be initialized concurrently, and // the first time this function may not be called in the main interpreter. // For example, a clean main interpreter that does not import any pybind11 module and then // spawns multiple subinterpreters using `InterpreterPoolExecutor` that each imports a // pybind11 module concurrently. - // - // Multiple subinterpreters may observe `get_num_interpreters_seen() <= 1` at the same - // time, while `get_num_interpreters_seen() += 1` in `PYBIND11_MODULE(...)` is called - // later. - - // Whenever the interpreter changes on the current thread we need to invalidate the - // internals_pp so that it can be pulled from the interpreter's state dict. That is - // slow, so we use the current PyThreadState to check if it is necessary. - auto *tstate = get_thread_state_unchecked(); - if (!tstate || tstate->interp != last_istate_tls()) { - gil_scoped_acquire_simple gil; - if (!tstate) { - tstate = get_thread_state_unchecked(); + if (get_num_interpreters_seen() > 1) { + // Whenever the interpreter changes on the current thread we need to invalidate the + // internals_pp so that it can be pulled from the interpreter's state dict. That is + // slow, so we use the current PyThreadState to check if it is necessary. + auto *tstate = get_thread_state_unchecked(); + if (!tstate || tstate->interp != last_istate_tls()) { + gil_scoped_acquire_simple gil; + if (!tstate) { + tstate = get_thread_state_unchecked(); + } + last_istate_tls() = tstate->interp; + internals_p_tls() = get_or_create_pp_in_state_dict(); } - last_istate_tls() = tstate->interp; - internals_p_tls() = get_or_create_pp_in_state_dict(); + return internals_p_tls(); } - return internals_p_tls(); -#else - if (!internals_singleton_pp_) { - gil_scoped_acquire_simple gil; - internals_singleton_pp_ = get_or_create_pp_in_state_dict(); +#endif + return get_pp_for_main_interpreter(); + } + + /// Get the pointer-to-pointer for the main interpreter, allocating it if it does not already + /// exist. May acquire the GIL. Will never return nullptr. + std::unique_ptr *get_pp_for_main_interpreter() { + // This function **assumes** that the current thread is running in the main interpreter. + if (!seen_main_interpreter_) { + std::call_once(seen_main_interpreter_flag_, [&] { + gil_scoped_acquire_simple gil; + internals_singleton_pp_ = get_or_create_pp_in_state_dict(); + seen_main_interpreter_ = true; + }); } return internals_singleton_pp_; -#endif } /// Drop all the references we're currently holding. @@ -705,6 +712,9 @@ class internals_pp_manager { char const *holder_id_ = nullptr; on_fetch_function *on_fetch_ = nullptr; std::unique_ptr *internals_singleton_pp_; + + std::once_flag seen_main_interpreter_flag_; + std::atomic_bool seen_main_interpreter_{false}; }; // If We loaded the internals through `state_dict`, our `error_already_set` From 1aed3ab1b4682ab61cd41a00284d5a6f1b63e1d1 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Tue, 16 Dec 2025 16:33:14 +0800 Subject: [PATCH 13/16] Update comments --- include/pybind11/detail/internals.h | 9 +++++++-- include/pybind11/gil_safe_call_once.h | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index c157bf53cb..4ff904607a 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -627,19 +627,23 @@ class internals_pp_manager { /// Get the pointer-to-pointer for the main interpreter, allocating it if it does not already /// exist. May acquire the GIL. Will never return nullptr. std::unique_ptr *get_pp_for_main_interpreter() { - // This function **assumes** that the current thread is running in the main interpreter. if (!seen_main_interpreter_) { + // The first call to this function **MUST** be from the main interpreter. + // Here we **ASSUME** that the current thread is running in the main interpreter. + // The caller is responsible for ensuring this. std::call_once(seen_main_interpreter_flag_, [&] { gil_scoped_acquire_simple gil; internals_singleton_pp_ = get_or_create_pp_in_state_dict(); seen_main_interpreter_ = true; }); } + // This is shared between all threads and all interpreters. return internals_singleton_pp_; } /// Drop all the references we're currently holding. void unref() { + // See comment in get_pp() above. #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT if (get_num_interpreters_seen() > 1) { last_istate_tls() = nullptr; @@ -651,6 +655,7 @@ class internals_pp_manager { } void destroy() { + // See comment in get_pp() above. #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT if (get_num_interpreters_seen() > 1) { auto *tstate = get_thread_state_unchecked(); @@ -711,8 +716,8 @@ class internals_pp_manager { char const *holder_id_ = nullptr; on_fetch_function *on_fetch_ = nullptr; + // Pointer to the singleton internals for the main interpreter std::unique_ptr *internals_singleton_pp_; - std::once_flag seen_main_interpreter_flag_; std::atomic_bool seen_main_interpreter_{false}; }; diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index e00bbb9f06..68314c6f8d 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -130,7 +130,7 @@ class gil_safe_call_once_and_store { // CPython API calls in the `fn()` call below may release and reacquire the GIL. gil_scoped_release gil_rel; // Needed to establish lock ordering. detail::with_internals([&](detail::internals &internals) { - const void *key = reinterpret_cast(this); + const void *const key = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; // There can be multiple threads going through here. detail::call_once_storage *value = nullptr; @@ -168,7 +168,7 @@ class gil_safe_call_once_and_store { T *result = last_storage_ptr_; if (!is_last_storage_valid()) { detail::with_internals([&](detail::internals &internals) { - const void *key = reinterpret_cast(this); + const void *const key = reinterpret_cast(this); auto &storage_map = internals.call_once_storage_map; auto *value = static_cast *>(storage_map.at(key)); result = last_storage_ptr_ = reinterpret_cast(value->storage); From b61e902dce793e2b82b1a0f6e9ba8ffb5c875894 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Wed, 17 Dec 2025 12:02:29 +0800 Subject: [PATCH 14/16] Move call-once storage out of internals --- include/pybind11/detail/internals.h | 41 +------- include/pybind11/gil_safe_call_once.h | 140 +++++++++++++++++++------- 2 files changed, 104 insertions(+), 77 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 4ff904607a..11a2ee4c92 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -234,38 +234,6 @@ inline uint64_t round_up_to_next_pow2(uint64_t x) { class loader_life_support; -struct call_once_storage_base { - call_once_storage_base() = default; - virtual ~call_once_storage_base() = default; - call_once_storage_base(const call_once_storage_base &) = delete; - call_once_storage_base(call_once_storage_base &&) = delete; - call_once_storage_base &operator=(const call_once_storage_base &) = delete; - call_once_storage_base &operator=(call_once_storage_base &&) = delete; -}; - -template -struct call_once_storage : call_once_storage_base { - alignas(T) char storage[sizeof(T)] = {}; - std::once_flag once_flag; - void (*finalize)(T &) = nullptr; - std::atomic_bool is_initialized{false}; - - call_once_storage() = default; - ~call_once_storage() override { - if (is_initialized) { - if (finalize != nullptr) { - finalize(*reinterpret_cast(storage)); - } else { - reinterpret_cast(storage)->~T(); - } - } - }; - call_once_storage(const call_once_storage &) = delete; - call_once_storage(call_once_storage &&) = delete; - call_once_storage &operator=(const call_once_storage &) = delete; - call_once_storage &operator=(call_once_storage &&) = delete; -}; - /// Internal data structure used to track registered instances and types. /// Whenever binary incompatible changes are made to this structure, /// `PYBIND11_INTERNALS_VERSION` must be incremented. @@ -310,8 +278,6 @@ struct internals { type_map native_enum_type_map; - std::unordered_map call_once_storage_map; - internals() : static_property_type(make_static_property_type()), default_metaclass(make_default_metaclass()) { @@ -337,12 +303,7 @@ struct internals { internals(internals &&other) = delete; internals &operator=(const internals &other) = delete; internals &operator=(internals &&other) = delete; - ~internals() { - for (const auto &entry : call_once_storage_map) { - delete entry.second; - } - call_once_storage_map.clear(); - } + ~internals() = default; }; // the internals struct (above) is shared between all the modules. local_internals are only diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h index 68314c6f8d..2268ca3ac7 100644 --- a/include/pybind11/gil_safe_call_once.h +++ b/include/pybind11/gil_safe_call_once.h @@ -115,7 +115,45 @@ class gil_safe_call_once_and_store { // In this case, we should store the result per-interpreter instead of globally, because each // subinterpreter has its own separate state. The cached result may not shareable across // interpreters (e.g., imported modules and their members). -// + +struct call_once_storage_base { + call_once_storage_base() = default; + virtual ~call_once_storage_base() = default; + call_once_storage_base(const call_once_storage_base &) = delete; + call_once_storage_base(call_once_storage_base &&) = delete; + call_once_storage_base &operator=(const call_once_storage_base &) = delete; + call_once_storage_base &operator=(call_once_storage_base &&) = delete; +}; + +template +struct call_once_storage : call_once_storage_base { + alignas(T) char storage[sizeof(T)] = {}; + std::once_flag once_flag; + void (*finalize)(T &) = nullptr; + std::atomic_bool is_initialized{false}; + + call_once_storage() = default; + ~call_once_storage() override { + if (is_initialized) { + if (finalize != nullptr) { + finalize(*reinterpret_cast(storage)); + } else { + reinterpret_cast(storage)->~T(); + } + } + }; + call_once_storage(const call_once_storage &) = delete; + call_once_storage(call_once_storage &&) = delete; + call_once_storage &operator=(const call_once_storage &) = delete; + call_once_storage &operator=(call_once_storage &&) = delete; +}; + +/// Storage map for `gil_safe_call_once_and_store`. Stored in a capsule in the interpreter's state +/// dict with proper destructor to ensure cleanup when the interpreter is destroyed. +using call_once_storage_map_type = std::unordered_map; + +# define PYBIND11_CALL_ONCE_STORAGE_MAP_ID PYBIND11_INTERNALS_ID "_call_once_storage_map__" + // The life span of the stored result is the entire interpreter lifetime. An additional // `finalize_fn` can be provided to clean up the stored result when the interpreter is destroyed. template @@ -129,35 +167,33 @@ class gil_safe_call_once_and_store { // Multiple threads may enter here, because the GIL is released in the next line and // CPython API calls in the `fn()` call below may release and reacquire the GIL. gil_scoped_release gil_rel; // Needed to establish lock ordering. - detail::with_internals([&](detail::internals &internals) { - const void *const key = reinterpret_cast(this); - auto &storage_map = internals.call_once_storage_map; - // There can be multiple threads going through here. - detail::call_once_storage *value = nullptr; - { - gil_scoped_acquire gil_acq; - // Only one thread will enter here at a time. - const auto it = storage_map.find(key); - if (it != storage_map.end()) { - value = static_cast *>(it->second); - } else { - value = new detail::call_once_storage{}; - storage_map.emplace(key, value); - } + const void *const key = reinterpret_cast(this); + // There can be multiple threads going through here. + call_once_storage *value = nullptr; + { + gil_scoped_acquire gil_acq; + // Only one thread will enter here at a time. + auto &storage_map = *get_or_create_call_once_storage_map(); + const auto it = storage_map.find(key); + if (it != storage_map.end()) { + value = static_cast *>(it->second); + } else { + value = new call_once_storage{}; + storage_map.emplace(key, value); } - assert(value != nullptr); - std::call_once(value->once_flag, [&] { - // Only one thread will ever enter here. - gil_scoped_acquire gil_acq; - // fn may release, but will reacquire, the GIL. - ::new (value->storage) T(fn()); - value->finalize = finalize_fn; - value->is_initialized = true; - last_storage_ptr_ = reinterpret_cast(value->storage); - is_initialized_by_atleast_one_interpreter_ = true; - }); + } + assert(value != nullptr); + std::call_once(value->once_flag, [&] { + // Only one thread will ever enter here. + gil_scoped_acquire gil_acq; + // fn may release, but will reacquire, the GIL. + ::new (value->storage) T(fn()); + value->finalize = finalize_fn; + value->is_initialized = true; + last_storage_ptr_ = reinterpret_cast(value->storage); + is_initialized_by_atleast_one_interpreter_ = true; }); - // All threads will observe `is_initialized_by_atleast_one_interp_` as true here. + // All threads will observe `is_initialized_by_atleast_one_interpreter_` as true here. } // Intentionally not returning `T &` to ensure the calling code is self-documenting. return *this; @@ -167,12 +203,11 @@ class gil_safe_call_once_and_store { T &get_stored() { T *result = last_storage_ptr_; if (!is_last_storage_valid()) { - detail::with_internals([&](detail::internals &internals) { - const void *const key = reinterpret_cast(this); - auto &storage_map = internals.call_once_storage_map; - auto *value = static_cast *>(storage_map.at(key)); - result = last_storage_ptr_ = reinterpret_cast(value->storage); - }); + gil_scoped_acquire gil_acq; + const void *const key = reinterpret_cast(this); + auto &storage_map = *get_or_create_call_once_storage_map(); + auto *value = static_cast *>(storage_map.at(key)); + result = last_storage_ptr_ = reinterpret_cast(value->storage); } assert(result != nullptr); return *result; @@ -187,12 +222,43 @@ class gil_safe_call_once_and_store { private: bool is_last_storage_valid() const { return is_initialized_by_atleast_one_interpreter_ - && detail::get_num_interpreters_seen() <= 1; + && detail::get_num_interpreters_seen() == 1; + } + + static call_once_storage_map_type *get_or_create_call_once_storage_map() { + error_scope err_scope; + dict state_dict = detail::get_python_state_dict(); + auto storage_map_obj = reinterpret_steal( + detail::dict_getitemstringref(state_dict.ptr(), PYBIND11_CALL_ONCE_STORAGE_MAP_ID)); + call_once_storage_map_type *storage_map = nullptr; + if (storage_map_obj) { + void *raw_ptr = PyCapsule_GetPointer(storage_map_obj.ptr(), /*name=*/nullptr); + if (!raw_ptr) { + raise_from(PyExc_SystemError, + "pybind11::gil_safe_call_once_and_store::" + "get_or_create_call_once_storage_map() FAILED"); + throw error_already_set(); + } + storage_map = reinterpret_cast(raw_ptr); + } else { + storage_map = new call_once_storage_map_type(); + // Create capsule with destructor to clean up the storage map when the interpreter + // shuts down + state_dict[PYBIND11_CALL_ONCE_STORAGE_MAP_ID] + = capsule(storage_map, [](void *ptr) noexcept { + auto *map = reinterpret_cast(ptr); + for (const auto &entry : *map) { + delete entry.second; + } + delete map; + }); + } + return storage_map; } // No storage needed when subinterpreter support is enabled. - // The actual storage is stored in the per-interpreter state dict in - // `internals.call_once_storage_map`. + // The actual storage is stored in the per-interpreter state dict via + // `get_or_create_call_once_storage_map()`. // Fast local cache to avoid repeated lookups when there are no multiple interpreters. // This is only valid if there is a single interpreter. Otherwise, it is not used. From b72cd4162baf14472f37c66144aca55df7c9fa74 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Wed, 17 Dec 2025 11:22:54 +0800 Subject: [PATCH 15/16] Revert internal version bump --- include/pybind11/detail/class.h | 2 ++ include/pybind11/detail/internals.h | 13 ++++++++++--- include/pybind11/detail/type_caster_base.h | 10 ++++++++-- include/pybind11/pybind11.h | 4 ++++ 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/include/pybind11/detail/class.h b/include/pybind11/detail/class.h index 1cd9af0bd1..21e966cfea 100644 --- a/include/pybind11/detail/class.h +++ b/include/pybind11/detail/class.h @@ -226,12 +226,14 @@ extern "C" inline void pybind11_meta_dealloc(PyObject *obj) { local_internals.registered_types_cpp.erase(tinfo->cpptype); } else { internals.registered_types_cpp.erase(tindex); +#if PYBIND11_INTERNALS_VERSION >= 12 internals.registered_types_cpp_fast.erase(tinfo->cpptype); for (const std::type_info *alias : tinfo->alias_chain) { auto num_erased = internals.registered_types_cpp_fast.erase(alias); (void) num_erased; assert(num_erased > 0); } +#endif } internals.registered_types_py.erase(tinfo->type); diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 11a2ee4c92..5347511538 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -39,11 +39,11 @@ /// further ABI-incompatible changes may be made before the ABI is officially /// changed to the new version. #ifndef PYBIND11_INTERNALS_VERSION -# define PYBIND11_INTERNALS_VERSION 12 +# define PYBIND11_INTERNALS_VERSION 11 #endif -#if PYBIND11_INTERNALS_VERSION < 12 -# error "PYBIND11_INTERNALS_VERSION 12 is the minimum for all platforms for pybind11v3." +#if PYBIND11_INTERNALS_VERSION < 11 +# error "PYBIND11_INTERNALS_VERSION 11 is the minimum for all platforms for pybind11v3." #endif PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) @@ -242,12 +242,14 @@ struct internals { pymutex mutex; pymutex exception_translator_mutex; #endif +#if PYBIND11_INTERNALS_VERSION >= 12 // non-normative but fast "hint" for registered_types_cpp. Meant // to be used as the first level of a two-level lookup: successful // lookups are correct, but unsuccessful lookups need to try // registered_types_cpp and then backfill this map if they find // anything. fast_type_map registered_types_cpp_fast; +#endif // std::type_index -> pybind11's type information type_map registered_types_cpp; @@ -273,6 +275,9 @@ struct internals { PyObject *instance_base = nullptr; // Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined: thread_specific_storage tstate; +#if PYBIND11_INTERNALS_VERSION <= 11 + thread_specific_storage loader_life_support_tls; // OBSOLETE (PR #5830) +#endif // Unused if PYBIND11_SIMPLE_GIL_MANAGEMENT is defined: PyInterpreterState *istate = nullptr; @@ -353,6 +358,7 @@ struct type_info { void *(*module_local_load)(PyObject *, const type_info *) = nullptr; holder_enum_t holder_enum_v = holder_enum_t::undefined; +#if PYBIND11_INTERNALS_VERSION >= 12 // When a type appears in multiple DSOs, // internals::registered_types_cpp_fast will have multiple distinct // keys (the std::type_info from each DSO) mapped to the same @@ -363,6 +369,7 @@ struct type_info { // nb_alias_chain` added in // https://github.com/wjakob/nanobind/commit/b515b1f7f2f4ecc0357818e6201c94a9f4cbfdc2 std::forward_list alias_chain; +#endif /* A simple type never occurs as a (direct or indirect) parent * of a class that makes use of multiple inheritance. diff --git a/include/pybind11/detail/type_caster_base.h b/include/pybind11/detail/type_caster_base.h index 21b7f0950e..b0c59e1138 100644 --- a/include/pybind11/detail/type_caster_base.h +++ b/include/pybind11/detail/type_caster_base.h @@ -227,26 +227,32 @@ inline detail::type_info *get_global_type_info_lock_held(const std::type_info &t // next time. detail::type_info *type_info = nullptr; auto &internals = get_internals(); +#if PYBIND11_INTERNALS_VERSION >= 12 auto &fast_types = internals.registered_types_cpp_fast; +#endif auto &types = internals.registered_types_cpp; +#if PYBIND11_INTERNALS_VERSION >= 12 auto fast_it = fast_types.find(&tp); if (fast_it != fast_types.end()) { -#ifndef NDEBUG +# ifndef NDEBUG auto types_it = types.find(std::type_index(tp)); assert(types_it != types.end()); assert(types_it->second == fast_it->second); -#endif +# endif return fast_it->second; } +#endif // PYBIND11_INTERNALS_VERSION >= 12 auto it = types.find(std::type_index(tp)); if (it != types.end()) { +#if PYBIND11_INTERNALS_VERSION >= 12 // We found the type in the slow map but not the fast one, so // some other DSO added it (otherwise it would be in the fast // map under &tp) and therefore we must be an alias. Record // that. it->second->alias_chain.push_front(&tp); fast_types.emplace(&tp, it->second); +#endif type_info = it->second; } return type_info; diff --git a/include/pybind11/pybind11.h b/include/pybind11/pybind11.h index 8bd62c85c9..91b38d91ed 100644 --- a/include/pybind11/pybind11.h +++ b/include/pybind11/pybind11.h @@ -1692,7 +1692,9 @@ class generic_type : public object { local_internals.registered_types_cpp[rec.type] = tinfo; } else { internals.registered_types_cpp[tindex] = tinfo; +#if PYBIND11_INTERNALS_VERSION >= 12 internals.registered_types_cpp_fast[rec.type] = tinfo; +#endif } PYBIND11_WARNING_PUSH @@ -2199,7 +2201,9 @@ class class_ : public detail::generic_type { type_info *const val = internals.registered_types_cpp[std::type_index(typeid(type))]; internals.registered_types_cpp[std::type_index(typeid(type_alias))] = val; +#if PYBIND11_INTERNALS_VERSION >= 12 internals.registered_types_cpp_fast[&typeid(type_alias)] = val; +#endif } }); } From ac02a3208d4bd377059bb97bba4df5bb8f1b3923 Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Wed, 17 Dec 2025 12:07:48 +0800 Subject: [PATCH 16/16] Cleanup outdated comments --- include/pybind11/detail/internals.h | 35 +++++------------------------ 1 file changed, 5 insertions(+), 30 deletions(-) diff --git a/include/pybind11/detail/internals.h b/include/pybind11/detail/internals.h index 5347511538..5ccd4d18e5 100644 --- a/include/pybind11/detail/internals.h +++ b/include/pybind11/detail/internals.h @@ -564,15 +564,6 @@ class internals_pp_manager { /// acquire the GIL. Will never return nullptr. std::unique_ptr *get_pp() { #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT - // FIXME: We cannot use `get_num_interpreters_seen() > 1` here to create a fast path for - // the multi-interpreter case. The singleton may be initialized by a subinterpreter not the - // main interpreter. - // - // For multi-interpreter support, the subinterpreters can be initialized concurrently, and - // the first time this function may not be called in the main interpreter. - // For example, a clean main interpreter that does not import any pybind11 module and then - // spawns multiple subinterpreters using `InterpreterPoolExecutor` that each imports a - // pybind11 module concurrently. if (get_num_interpreters_seen() > 1) { // Whenever the interpreter changes on the current thread we need to invalidate the // internals_pp so that it can be pulled from the interpreter's state dict. That is @@ -589,29 +580,15 @@ class internals_pp_manager { return internals_p_tls(); } #endif - return get_pp_for_main_interpreter(); - } - - /// Get the pointer-to-pointer for the main interpreter, allocating it if it does not already - /// exist. May acquire the GIL. Will never return nullptr. - std::unique_ptr *get_pp_for_main_interpreter() { - if (!seen_main_interpreter_) { - // The first call to this function **MUST** be from the main interpreter. - // Here we **ASSUME** that the current thread is running in the main interpreter. - // The caller is responsible for ensuring this. - std::call_once(seen_main_interpreter_flag_, [&] { - gil_scoped_acquire_simple gil; - internals_singleton_pp_ = get_or_create_pp_in_state_dict(); - seen_main_interpreter_ = true; - }); + if (!internals_singleton_pp_) { + gil_scoped_acquire_simple gil; + internals_singleton_pp_ = get_or_create_pp_in_state_dict(); } - // This is shared between all threads and all interpreters. return internals_singleton_pp_; } /// Drop all the references we're currently holding. void unref() { - // See comment in get_pp() above. #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT if (get_num_interpreters_seen() > 1) { last_istate_tls() = nullptr; @@ -623,7 +600,6 @@ class internals_pp_manager { } void destroy() { - // See comment in get_pp() above. #ifdef PYBIND11_HAS_SUBINTERPRETER_SUPPORT if (get_num_interpreters_seen() > 1) { auto *tstate = get_thread_state_unchecked(); @@ -684,10 +660,9 @@ class internals_pp_manager { char const *holder_id_ = nullptr; on_fetch_function *on_fetch_ = nullptr; - // Pointer to the singleton internals for the main interpreter + // Pointer-to-pointer to the singleton internals for the first seen interpreter (may not be the + // main interpreter) std::unique_ptr *internals_singleton_pp_; - std::once_flag seen_main_interpreter_flag_; - std::atomic_bool seen_main_interpreter_{false}; }; // If We loaded the internals through `state_dict`, our `error_already_set`