Skip to content

Commit 395a780

Browse files
authored
fix: Enable detection of unresponsive or crashed Python backend stub process (#423)
1 parent 4b15926 commit 395a780

File tree

2 files changed

+48
-4
lines changed

2 files changed

+48
-4
lines changed

src/python_be.cc

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2416,6 +2416,26 @@ TRITONBACKEND_ModelInstanceExecute(
24162416
return nullptr;
24172417
}
24182418

2419+
// Readiness check for a single model instance.
//
// Looks up the ModelInstanceState attached to `instance` and asks its stub
// launcher whether the stub process is still active.
//
// Returns nullptr when the stub reports active. Otherwise returns a new
// TRITONSERVER_ERROR_INTERNAL error whose message names the instance.
// A failure of TRITONBACKEND_ModelInstanceState is handled by RETURN_IF_ERROR
// (presumably by returning that error to the caller -- confirm against the
// macro definition).
TRITONBACKEND_ISPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReady(TRITONBACKEND_ModelInstance* instance)
{
  void* raw_state;
  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &raw_state));
  auto* state = reinterpret_cast<ModelInstanceState*>(raw_state);

  // Healthy path first: an active stub means there is nothing to report.
  if (state->Stub()->StubActive()) {
    return nullptr;
  }

  // The stub is not active; build the diagnostic only on this path so that
  // Name() is queried exactly when an error is produced.
  const std::string message =
      std::string("Stub process '") + state->Name() + "' is not healthy.";
  return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, message.c_str());
}
2438+
24192439
TRITONBACKEND_ISPEC TRITONSERVER_Error*
24202440
TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)
24212441
{

src/stub_launcher.cc

Lines changed: 28 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -743,7 +743,29 @@ StubLauncher::StubActive()
743743
GetExitCodeProcess(stub_pid_.hProcess, &ec);
744744
return (ec == STILL_ACTIVE);
745745
#else
746-
return (stub_pid_ != 0);
746+
if (stub_pid_ == 0) {
747+
return false;
748+
}
749+
750+
int status;
751+
pid_t return_pid = waitpid(stub_pid_, &status, WNOHANG);
752+
if (return_pid == -1) {
753+
// If waitpid fails, it likely means the process no longer exists (ECHILD)
754+
if (errno != ECHILD) {
755+
LOG_MESSAGE(
756+
TRITONSERVER_LOG_VERBOSE,
757+
(std::string("waitpid failed for stub process ") +
758+
std::to_string(stub_pid_) + ": " + strerror(errno))
759+
.c_str());
760+
}
761+
return false;
762+
} else if (return_pid == stub_pid_) {
763+
// Process has exited and has been reaped
764+
return false;
765+
}
766+
767+
// return_pid == 0 means the process is still running
768+
return true;
747769
#endif
748770
}
749771

@@ -824,9 +846,11 @@ StubLauncher::KillStubProcess()
824846
CloseHandle(stub_pid_.hProcess);
825847
CloseHandle(stub_pid_.hThread);
826848
#else
827-
kill(stub_pid_, SIGKILL);
828-
WaitForStubProcess();
829-
stub_pid_ = 0;
849+
if (stub_pid_ != 0) {
850+
kill(stub_pid_, SIGKILL);
851+
WaitForStubProcess();
852+
stub_pid_ = 0;
853+
}
830854
#endif
831855
}
832856

0 commit comments

Comments
 (0)