From 44cb1f30800cdf196a8a53072e7ba3122894c1d1 Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Mon, 21 Jul 2025 13:11:51 +0530 Subject: [PATCH 1/8] Update --- src/stub_launcher.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index 828228e6..84999205 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -301,7 +301,8 @@ StubLauncher::Launch() // monitoring thread may take longer which can make the server process think // that the stub process is unhealthy and return early. Waiting until the // health thread is spawn would make sure would prevent this issue. - parent_message_queue_->Pop(); + bi::managed_external_buffer::handle_t message; + RETURN_IF_ERROR(ReceiveMessageFromStub(message)); if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { try { @@ -458,7 +459,8 @@ StubLauncher::Launch() // monitoring thread may take longer which can make the server process think // that the stub process is unhealthy and return early. Waiting until the // health thread is spawn would prevent this issue. 
- parent_message_queue_->Pop(); + bi::managed_external_buffer::handle_t message; + RETURN_IF_ERROR(ReceiveMessageFromStub(message)); if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { try { From e5b20885cccf3856d28ddf7fe816ef023f60189e Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Mon, 21 Jul 2025 21:49:45 +0530 Subject: [PATCH 2/8] Update --- src/stub_launcher.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index 84999205..8b1409c9 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -288,6 +288,7 @@ StubLauncher::Launch() parent_message_queue_.reset(); memory_manager_.reset(); WaitForStubProcess(); + shm_pool_.reset(); } }); @@ -444,6 +445,7 @@ StubLauncher::Launch() parent_message_queue_.reset(); memory_manager_.reset(); WaitForStubProcess(); + shm_pool_.reset(); } }); From 76aebcf6a623f3cc90a789425ab343898c1cce55 Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Wed, 23 Jul 2025 08:55:10 +0530 Subject: [PATCH 3/8] Undo shm changes --- src/stub_launcher.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index 8b1409c9..84999205 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -288,7 +288,6 @@ StubLauncher::Launch() parent_message_queue_.reset(); memory_manager_.reset(); WaitForStubProcess(); - shm_pool_.reset(); } }); @@ -445,7 +444,6 @@ StubLauncher::Launch() parent_message_queue_.reset(); memory_manager_.reset(); WaitForStubProcess(); - shm_pool_.reset(); } }); From e62e5826be97a2e6ec225626448616f89e3044d8 Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Sun, 3 Aug 2025 20:25:04 +0530 Subject: [PATCH 4/8] Fix shm issue --- src/stub_launcher.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index 84999205..7ce07a1d 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -302,9 +302,12 @@ StubLauncher::Launch() // that the stub 
process is unhealthy and return early. Waiting until the // health thread is spawn would make sure would prevent this issue. bi::managed_external_buffer::handle_t message; - RETURN_IF_ERROR(ReceiveMessageFromStub(message)); + auto err = ReceiveMessageFromStub(message); if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { + if (err != nullptr) { + throw BackendModelException(err); + } try { AutocompleteStubProcess(); } @@ -315,6 +318,7 @@ StubLauncher::Launch() TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, ex.what())); } } else if (stub_process_kind_ == "MODEL_INSTANCE_STUB") { + RETURN_IF_ERROR(err); RETURN_IF_ERROR(ModelInstanceStubProcess()); } else { return TRITONSERVER_ErrorNew( @@ -460,9 +464,12 @@ StubLauncher::Launch() // that the stub process is unhealthy and return early. Waiting until the // health thread is spawn would prevent this issue. bi::managed_external_buffer::handle_t message; - RETURN_IF_ERROR(ReceiveMessageFromStub(message)); + auto err = ReceiveMessageFromStub(message); if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { + if (err != nullptr) { + throw BackendModelException(err); + } try { AutocompleteStubProcess(); } @@ -473,6 +480,7 @@ StubLauncher::Launch() TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, ex.what())); } } else if (stub_process_kind_ == "MODEL_INSTANCE_STUB") { + RETURN_IF_ERROR(err); RETURN_IF_ERROR(ModelInstanceStubProcess()); } else { return TRITONSERVER_ErrorNew( From a960608bb5b20dc7f865b8c5f897ab9dce3a888e Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Tue, 5 Aug 2025 00:03:32 +0530 Subject: [PATCH 5/8] Update --- src/stub_launcher.cc | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index 7ce07a1d..33c197d6 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -280,7 +280,9 @@ StubLauncher::Launch() // Push a dummy message to the message queue so that the stub // process is notified that it can release the object stored in 
// shared memory. - stub_message_queue_->Push(DUMMY_MESSAGE); + if (stub_message_queue_) { + stub_message_queue_->Push(DUMMY_MESSAGE); + } // If the model is not initialized, wait for the stub process to exit. if (!is_initialized_) { @@ -303,6 +305,9 @@ StubLauncher::Launch() // health thread is spawn would make sure would prevent this issue. bi::managed_external_buffer::handle_t message; auto err = ReceiveMessageFromStub(message); + if (err != nullptr) { + KillStubProcess(); + } if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { if (err != nullptr) { @@ -440,7 +445,9 @@ StubLauncher::Launch() // Push a dummy message to the message queue so that the stub // process is notified that it can release the object stored in // shared memory. - stub_message_queue_->Push(DUMMY_MESSAGE); + if (stub_message_queue_) { + stub_message_queue_->Push(DUMMY_MESSAGE); + } // If the model is not initialized, wait for the stub process to exit. if (!is_initialized_) { @@ -465,6 +472,9 @@ StubLauncher::Launch() // health thread is spawn would prevent this issue. bi::managed_external_buffer::handle_t message; auto err = ReceiveMessageFromStub(message); + if (err != nullptr) { + KillStubProcess(); + } if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { if (err != nullptr) { From 4fdb468485a373b9f970bb0fbac6715f25c304d9 Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Tue, 5 Aug 2025 16:35:21 +0530 Subject: [PATCH 6/8] Update timeout --- src/stub_launcher.cc | 41 ++++++++++++++++++++++++++--------------- src/stub_launcher.h | 3 ++- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index 33c197d6..faee0528 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -301,18 +301,21 @@ StubLauncher::Launch() // // The reason it is broken into two steps is that creation of the health // monitoring thread may take longer which can make the server process think - // that the stub process is unhealthy and return early. 
Waiting until the - // health thread is spawn would make sure would prevent this issue. + // that the stub process is unhealthy and return early. Waiting with a longer + // timeout prevents this issue. + const uint64_t initialization_timeout_ms = 5000; // 5 sec + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + "Waiting for the stub health monitoring thread to start"); + bi::managed_external_buffer::handle_t message; - auto err = ReceiveMessageFromStub(message); + auto err = ReceiveMessageFromStub(message, initialization_timeout_ms); if (err != nullptr) { KillStubProcess(); } if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { - if (err != nullptr) { - throw BackendModelException(err); - } + THROW_IF_BACKEND_MODEL_ERROR(err); try { AutocompleteStubProcess(); } @@ -468,18 +471,21 @@ StubLauncher::Launch() // // The reason it is broken into two steps is that creation of the health // monitoring thread may take longer which can make the server process think - // that the stub process is unhealthy and return early. Waiting until the - // health thread is spawn would prevent this issue. + // that the stub process is unhealthy and return early. Waiting with a + // longer timeout prevents this issue. 
+ const uint64_t initialization_timeout_ms = 5000; // 5 sec + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + "Waiting for the stub health monitoring thread to start"); + bi::managed_external_buffer::handle_t message; - auto err = ReceiveMessageFromStub(message); + auto err = ReceiveMessageFromStub(message, initialization_timeout_ms); if (err != nullptr) { KillStubProcess(); } if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { - if (err != nullptr) { - throw BackendModelException(err); - } + THROW_IF_BACKEND_MODEL_ERROR(err); try { AutocompleteStubProcess(); } @@ -612,8 +618,13 @@ StubLauncher::ModelInstanceStubProcess() initialize_message->Args() = initialize_map_handle; stub_message_queue_->Push(initialize_message->ShmHandle()); + const uint64_t initialization_timeout_ms = 5000; // 5 sec + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + "Waiting for the stub process initialization response"); + bi::managed_external_buffer::handle_t message; - RETURN_IF_ERROR(ReceiveMessageFromStub(message)); + RETURN_IF_ERROR(ReceiveMessageFromStub(message, initialization_timeout_ms)); std::unique_ptr initialize_response_message = IPCMessage::LoadFromSharedMemory(shm_pool_, message); @@ -746,11 +757,11 @@ StubLauncher::KillStubProcess() TRITONSERVER_Error* StubLauncher::ReceiveMessageFromStub( - bi::managed_external_buffer::handle_t& message) + bi::managed_external_buffer::handle_t& message, + uint64_t timeout_miliseconds) { bool success = false; while (!success) { - uint64_t timeout_miliseconds = 1000; { boost::posix_time::ptime timeout = boost::get_system_time() + diff --git a/src/stub_launcher.h b/src/stub_launcher.h index 6c8dd910..58cdcc61 100644 --- a/src/stub_launcher.h +++ b/src/stub_launcher.h @@ -147,7 +147,8 @@ class StubLauncher { // Get a message from the stub process TRITONSERVER_Error* ReceiveMessageFromStub( - bi::managed_external_buffer::handle_t& message); + bi::managed_external_buffer::handle_t& message, + uint64_t timeout_miliseconds = 1000); // Wait for stub process void 
WaitForStubProcess(); From a26c9284b65e438f95e3a82e35c23a80717cbb09 Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Wed, 6 Aug 2025 14:11:59 +0530 Subject: [PATCH 7/8] Update --- src/pb_stub.cc | 18 +++++++++++------- src/stub_launcher.cc | 8 ++++++-- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/pb_stub.cc b/src/pb_stub.cc index 76130f94..56048d78 100644 --- a/src/pb_stub.cc +++ b/src/pb_stub.cc @@ -1040,11 +1040,13 @@ Stub::~Stub() { #ifdef TRITON_ENABLE_GPU try { - CUDAHandler& cuda_api = CUDAHandler::getInstance(); - for (auto& m : - shm_pool_->GetCUDAMemoryPoolManager()->CUDAPoolAddressMap()) { - if (m.second != nullptr) { - cuda_api.CloseCudaHandle(m.first, m.second); + if (shm_pool_ != nullptr) { + CUDAHandler& cuda_api = CUDAHandler::getInstance(); + for (auto& m : + shm_pool_->GetCUDAMemoryPoolManager()->CUDAPoolAddressMap()) { + if (m.second != nullptr) { + cuda_api.CloseCudaHandle(m.first, m.second); + } } } } @@ -1053,13 +1055,14 @@ Stub::~Stub() } #endif - { + // Ensure the interpreter is active before trying to clean up. 
+ if (Py_IsInitialized()) { py::gil_scoped_acquire acquire; py::object async_event_loop_local(std::move(async_event_loop_)); py::object background_futures_local(std::move(background_futures_)); py::object model_instance_local(std::move(model_instance_)); } - stub_instance_.reset(); + stub_message_queue_.reset(); parent_message_queue_.reset(); stub_to_parent_mq_.reset(); @@ -2030,6 +2033,7 @@ main(int argc, char** argv) catch (const PythonBackendException& pb_exception) { LOG_INFO << "Failed to preinitialize Python stub: " << pb_exception.what(); logger.reset(); + stub.reset(); exit(1); } diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index faee0528..dcefc51e 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -315,7 +315,9 @@ StubLauncher::Launch() } if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { - THROW_IF_BACKEND_MODEL_ERROR(err); + if (err != nullptr) { + throw BackendModelException(err); + } try { AutocompleteStubProcess(); } @@ -485,7 +487,9 @@ StubLauncher::Launch() } if (stub_process_kind_ == "AUTOCOMPLETE_STUB") { - THROW_IF_BACKEND_MODEL_ERROR(err); + if (err != nullptr) { + throw BackendModelException(err); + } try { AutocompleteStubProcess(); } From 386f27ad4bb6ca55b03261ea4696d652c4fefc53 Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Wed, 6 Aug 2025 18:49:49 +0530 Subject: [PATCH 8/8] Update --- src/stub_launcher.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stub_launcher.cc b/src/stub_launcher.cc index dcefc51e..3bd01321 100644 --- a/src/stub_launcher.cc +++ b/src/stub_launcher.cc @@ -303,7 +303,7 @@ StubLauncher::Launch() // monitoring thread may take longer which can make the server process think // that the stub process is unhealthy and return early. Waiting with a longer // timeout prevents this issue. 
- const uint64_t initialization_timeout_ms = 5000; // 5 sec + const uint64_t initialization_timeout_ms = 10000; // 10 sec LOG_MESSAGE( TRITONSERVER_LOG_VERBOSE, "Waiting for the stub health monitoring thread to start"); @@ -475,7 +475,7 @@ StubLauncher::Launch() // monitoring thread may take longer which can make the server process think // that the stub process is unhealthy and return early. Waiting with a // longer timeout prevents this issue. - const uint64_t initialization_timeout_ms = 5000; // 5 sec + const uint64_t initialization_timeout_ms = 10000; // 10 sec LOG_MESSAGE( TRITONSERVER_LOG_VERBOSE, "Waiting for the stub health monitoring thread to start");