-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Open
Description
My version
A custom DeepSeek-like model, TP4, FP8, served through the Executor API (started with `mpirun -n 4`).
nccl version: 2.22.3
I use a batched logits post-processor to control generation:
Main rank: batched logits processor config (this one performs the logit modification):
tensorrt_llm::executor::LogitsPostProcessorConfig logitsPostProcessorConfig(
std::nullopt, LogitsProcessorBatchedAdaptor{mMainRankState->logitsProcessor, *this}, false);
executorConfig.setLogitsPostProcessorConfig(logitsPostProcessorConfig);
Other ranks: config with a no-op callback (it does nothing):
tensorrt_llm::executor::LogitsPostProcessorConfig logitsPostProcessorConfig(
std::nullopt, [](auto&&...) {}, false);
executorConfig.setLogitsPostProcessorConfig(logitsPostProcessorConfig);
When running the stability test, everything works fine for the first 20 hours, but after that point (about 2 million requests) the server hangs.
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07 Driver Version: 550.90.07 CUDA Version: 12.4 |
|=========================================+========================+======================|
| 0 NVIDIA H100 80GB HBM3 On | 00000000:04:00.0 Off | 0 |
| N/A 38C P0 152W / 700W | 75370MiB / 81559MiB | 100% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 1 NVIDIA H100 80GB HBM3 On | 00000000:23:00.0 Off | 0 |
| N/A 34C P0 129W / 700W | 75368MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 2 NVIDIA H100 80GB HBM3 On | 00000000:43:00.0 Off | 0 |
| N/A 37C P0 137W / 700W | 75368MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 3 NVIDIA H100 80GB HBM3 On | 00000000:64:00.0 Off | 0 |
| N/A 36C P0 135W / 700W | 75368MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| 0 N/A N/A 2211609 C /opt/wmcore/executor_server 75358MiB |
| 1 N/A N/A 2211610 C /opt/wmcore/executor_server 75358MiB |
| 2 N/A N/A 2211611 C /opt/wmcore/executor_server 75358MiB |
| 3 N/A N/A 2211612 C /opt/wmcore/executor_server 75358MiB |
I attached cuda-gdb to the main rank's PID to determine the active kernel (the other ranks do not have active kernels):
(cuda-gdb) info stack
#0 0x00007f64a9c1f480 in ncclDevFunc_AllGather_RING_SIMPLE() ()
#1 0x00007f601be4e600 in ncclDevKernel_AllGather_RING_LL(ncclDevKernelArgsStorage<4096ul>)<<<(24,1,1),(544,1,1)>>> ()
(cuda-gdb) info cuda kernels
Kernel Parent Dev Grid Status SMs Mask GridDim BlockDim Invocation
* 0 - 0 5059392810 Active 0x000000000000000000000000000fff0fff (24,1,1) (544,1,1) ncclDevKernel_AllGather_RING_LL()
Some meaningful backtraces:
Thread 16 (Thread 0x7f5ff8ffd000 (LWP 2791236) "executor_server"):
#0 0x00007f6b84ad47aa in pthread_cond_timedwait@@GLIBC_2.3.2 () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#1 0x00007f6bf41a13ee in tensorrt_llm::executor::Executor::Impl::awaitResponses(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > > const&) () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#2 0x00007f6bf4193ead in tensorrt_llm::executor::Executor::awaitResponses(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > > const&) () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#3 0x000000000045d85b in modules::executor_server::Executor::ExecutorImpl::awaitRoutine (this=0x24c3180, interruptToken=...) at /sources/contrib/tensorrt-llm/modules/executor_server/src/serverImpl.cpp:330
#4 0x00000000004639c0 in operator() (__closure=<optimized out>, __closure=<optimized out>, stopToken=...) at /sources/contrib/tensorrt-llm/modules/executor_server/src/serverImpl.cpp:252
#5 __invoke_impl<void, modules::executor_server::Executor::ExecutorImpl::ExecutorImpl(modules::executor_server::MainRank, const std::filesystem::__cxx11::path&, const modules::executor_server::ExecutorConfig&, const modules::executor_server::LogitsProcessorStaticConfig&)::<lambda(std::stop_token)>, std::stop_token> (__f=...) at /opt/rh/gcc-toolset-10/root/usr/include/c++/10/bits/invoke.h:60
#6 __invoke<modules::executor_server::Executor::ExecutorImpl::ExecutorImpl(modules::executor_server::MainRank, const std::filesystem::__cxx11::path&, const modules::executor_server::ExecutorConfig&, const modules::executor_server::LogitsProcessorStaticConfig&)::<lambda(std::stop_token)>, std::stop_token> (__fn=...) at /opt/rh/gcc-toolset-10/root/usr/include/c++/10/bits/invoke.h:95
#7 _M_invoke<0, 1> (this=<optimized out>) at /opt/rh/gcc-toolset-10/root/usr/include/c++/10/thread:264
#8 operator() (this=<optimized out>) at /opt/rh/gcc-toolset-10/root/usr/include/c++/10/thread:271
#9 _M_run (this=<optimized out>) at /opt/rh/gcc-toolset-10/root/usr/include/c++/10/thread:215
#10 0x00007f6bcdb2ca80 in execute_native_thread_routine () from /home/askhoroshev/wmcore/lib/libtensorrt_llm_nvrtc_wrapper.so
#11 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#12 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 15 (Thread 0x7f5ff97fe000 (LWP 2791234) "executionLoop"):
#0 0x00007f6b86894e88 in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#1 0x00007f6b86631833 in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#2 0x00007f6b8699fb3f in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#3 0x00007f6b8699fed5 in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#4 0x00007f6b866382cc in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#5 0x00007f6b8670117a in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#6 0x00007f6b86970459 in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#7 0x00007f6b867a58fd in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#8 0x000000000049f575 in libcudart_static_141dba5462e92d2cffd1abc474df476c510a3a8c ()
#9 0x0000000000504248 in cudaStreamSynchronize ()
#10 0x00000000004606c7 in tensorrt_llm::runtime::CudaStream::synchronize (this=<optimized out>) at /sources/contrib/tensorrt-llm/cpp/include/tensorrt_llm/runtime/cudaStream.h:84
#11 modules::executor_server::FusedLogitsProcessor::process (bufferManager=..., stream=..., beamTokens=..., tensorPtrs=<synthetic pointer>..., logitsRequestStates=..., this=0x24c31c0) at /sources/contrib/tensorrt-llm/modules/executor_server/src/logitsProcessor.cpp:658
#12 modules::executor_server::Executor::ExecutorImpl::LogitsProcessorBatchedAdaptor::operator() (this=0x13cac3b0, internalIds=..., logitTensors=..., beamTokens=..., stream=..., userIds=...) at /sources/contrib/tensorrt-llm/modules/executor_server/src/serverImpl.cpp:238
#13 0x00007f6bf419b315 in std::_Function_handler<void (std::vector<unsigned long, std::allocator<unsigned long> > const&, std::vector<std::shared_ptr<tensorrt_llm::runtime::ITensor>, std::allocator<std::shared_ptr<tensorrt_llm::runtime::ITensor> > >&, std::vector<std::reference_wrapper<std::vector<std::vector<int, std::allocator<int> >, std::allocator<std::vector<int, std::allocator<int> > > > const>, std::allocator<std::reference_wrapper<std::vector<std::vector<int, std::allocator<int> >, std::allocator<std::vector<int, std::allocator<int> > > > const> > > const&, std::shared_ptr<tensorrt_llm::runtime::CudaStream> const&, std::vector<std::optional<unsigned long>, std::allocator<std::optional<unsigned long> > > const&), tensorrt_llm::executor::Executor::Impl::initializeLogitsPostProcessorBatched(tensorrt_llm::executor::LogitsPostProcessorConfig const&)::{lambda(std::vector<unsigned long, std::allocator<unsigned long> > const&, std::vector<std::shared_ptr<tensorrt_llm::runtime::ITensor>, std::allocator<std::shared_ptr<tensorrt_llm::runtime::ITensor> > >&, std::vector<std::reference_wrapper<std::vector<std::vector<int, std::allocator<int> >, std::allocator<std::vector<int, std::allocator<int> > > > const>, std::allocator<std::reference_wrapper<std::vector<std::vector<int, std::allocator<int> >, std::allocator<std::vector<int, std::allocator<int> > > > const> > > const&, std::shared_ptr<tensorrt_llm::runtime::CudaStream> const&, std::vector<std::optional<unsigned long>, std::allocator<std::optional<unsigned long> > > const&)#1}>::_M_invoke(std::_Any_data const&, std::vector<unsigned long, std::allocator<unsigned long> > const&, std::vector<std::shared_ptr<tensorrt_llm::runtime::ITensor>, std::allocator<std::shared_ptr<tensorrt_llm::runtime::ITensor> > >&, std::vector<std::reference_wrapper<std::vector<std::vector<int, std::allocator<int> >, std::allocator<std::vector<int, std::allocator<int> > > > const>, std::allocator<std::reference_wrapper<std::vector<std::vector<int, std::allocator<int> >, std::allocator<std::vector<int, std::allocator<int> > > > const> > > const&, std::shared_ptr<tensorrt_llm::runtime::CudaStream> const&, std::vector<std::optional<unsigned long>, std::allocator<std::optional<unsigned long> > > const&) () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#14 0x00007f6bf41793c4 in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::decoderStepAsync(tensorrt_llm::batch_manager::ScheduledRequests const&) () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#15 0x00007f6bf417cde5 in tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&) () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#16 0x00007f6bf41a6a71 in tensorrt_llm::executor::Executor::Impl::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&) () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#17 0x00007f6bf41ab97f in tensorrt_llm::executor::Executor::Impl::executionLoop() () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#18 0x00007f6bcdb2ca80 in execute_native_thread_routine () from /home/askhoroshev/wmcore/lib/libtensorrt_llm_nvrtc_wrapper.so
#19 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#20 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 14 (Thread 0x7f5ff9fff000 (LWP 2791231) "dataTransResp"):
#0 0x00007f6b84ad445c in pthread_cond_wait@@GLIBC_2.3.2 () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#1 0x00007f6b847ed870 in std::condition_variable::wait(std::unique_lock<std::mutex>&) () from /home/askhoroshev/wmcore/lib/libstdc++.so.6
#2 0x00007f6bf4110d33 in tensorrt_llm::batch_manager::DataResponder::Impl::response() () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#3 0x00007f6bf410ecdd in std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<void>, std::__future_base::_Result_base::_Deleter>, std::thread::_Invoker<std::tuple<void (tensorrt_llm::batch_manager::DataResponder::Impl::*)(), tensorrt_llm::batch_manager::DataResponder::Impl*> >, void> >::_M_invoke(std::_Any_data const&) () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#4 0x00007f6bf410f2fb in std::__future_base::_State_baseV2::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*) () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#5 0x00007f6b84ad5e67 in __pthread_once_slow () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#6 0x00007f6bf410facd in std::thread::_State_impl<std::thread::_Invoker<std::tuple<std::__future_base::_Async_state_impl<std::thread::_Invoker<std::tuple<void (tensorrt_llm::batch_manager::DataResponder::Impl::*)(), tensorrt_llm::batch_manager::DataResponder::Impl*> >, void>::_Async_state_impl(std::thread::_Invoker<std::tuple<void (tensorrt_llm::batch_manager::DataResponder::Impl::*)(), tensorrt_llm::batch_manager::DataResponder::Impl*> >&&)::{lambda()#1}> > >::_M_run() () from /home/askhoroshev/wmcore/lib/libtensorrt_llm.so
#7 0x00007f6bcdb2ca80 in execute_native_thread_routine () from /home/askhoroshev/wmcore/lib/libtensorrt_llm_nvrtc_wrapper.so
#8 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#9 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 13 (Thread 0x7f601d51d000 (LWP 2791219) "executor_server"):
#0 0x00007f6b83ef5f41 in poll () from /home/askhoroshev/wmcore/lib/libc.so.6
#1 0x00007f6b880ed3ea in ncclProxyServiceUDS(void*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#2 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#3 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 12 (Thread 0x7f694cee8000 (LWP 2791218) "executor_server"):
#0 0x00007f6b83ef5f41 in poll () from /home/askhoroshev/wmcore/lib/libc.so.6
#1 0x00007f6b880eed32 in ncclProxyService(void*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#2 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#3 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 11 (Thread 0x7f602cffd000 (LWP 2791205) "executor_server"):
#0 0x00007f6b84ad7ab4 in read () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#1 0x00007f6b72cabfa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f6b8811f042 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#3 0x00007f6b88141b54 in ncclIbAsyncThreadMain(void*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#4 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#5 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 10 (Thread 0x7f602d7fe000 (LWP 2791202) "executor_server"):
#0 0x00007f6b84ad7ab4 in read () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#1 0x00007f6b72cabfa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f6b8811f042 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#3 0x00007f6b88141b54 in ncclIbAsyncThreadMain(void*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#4 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#5 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 9 (Thread 0x7f602dfff000 (LWP 2791199) "executor_server"):
#0 0x00007f6b84ad7ab4 in read () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#1 0x00007f6b72cabfa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f6b8811f042 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#3 0x00007f6b88141b54 in ncclIbAsyncThreadMain(void*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#4 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#5 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 8 (Thread 0x7f64aaedb000 (LWP 2791197) "executor_server"):
#0 0x00007f6b84ad7ab4 in read () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#1 0x00007f6b72cabfa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f6b8811f042 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#3 0x00007f6b88141b54 in ncclIbAsyncThreadMain(void*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#4 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#5 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 7 (Thread 0x7f64ab6dc000 (LWP 2791195) "executor_server"):
#0 0x00007f6b84ad7ab4 in read () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#1 0x00007f6b72cabfa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f6b8811f042 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#3 0x00007f6b88141b54 in ncclIbAsyncThreadMain(void*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#4 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#5 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 6 (Thread 0x7f64abedd000 (LWP 2791192) "executor_server"):
#0 0x00007f6b84ad7ab4 in read () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#1 0x00007f6b72cabfa3 in ibv_get_async_event () from /lib64/libibverbs.so.1
#2 0x00007f6b8811f042 in wrap_ibv_get_async_event(ibv_context*, ibv_async_event*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#3 0x00007f6b88141b54 in ncclIbAsyncThreadMain(void*) () from /home/askhoroshev/wmcore/lib/libnccl.so.2
#4 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#5 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 5 (Thread 0x7f694efde000 (LWP 2790932) "cuda-EvtHandlr"):
#0 0x00007f6b84ad445c in pthread_cond_wait@@GLIBC_2.3.2 () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#1 0x00007f52129f9d6a in ?? () from /lib64/libcudadebugger.so.1
#2 0x00007f52129f7e60 in ?? () from /lib64/libcudadebugger.so.1
#3 0x00007f52125cece5 in ?? () from /lib64/libcudadebugger.so.1
#4 0x00007f5212623601 in ?? () from /lib64/libcudadebugger.so.1
#5 0x00007f52125b4e2c in ?? () from /lib64/libcudadebugger.so.1
#6 0x00007f5212711526 in ?? () from /lib64/libcudadebugger.so.1
#7 0x00007f6b8696cffb in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#8 0x00007f6b867ee6a4 in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#9 0x00007f6b86721ee3 in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#10 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#11 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Thread 4 (Thread 0x7f6b6eb7f000 (LWP 2790911) "cuda00006000019"):
#0 0x00007f6b83ef5f41 in poll () from /home/askhoroshev/wmcore/lib/libc.so.6
#1 0x00007f6b8672a1ef in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#2 0x00007f6b867ee64f in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#3 0x00007f6b86721ee3 in ?? () from /home/askhoroshev/wmcore/lib/libcuda.so.1
#4 0x00007f6b84ace1ca in start_thread () from /home/askhoroshev/wmcore/lib/libpthread.so.0
#5 0x00007f6b83e0be73 in clone () from /home/askhoroshev/wmcore/lib/libc.so.6
Is it related to NVIDIA/nccl#311?
Metadata
Metadata
Assignees
Labels
No labels