From 95f6a0f335d4eef2fea85e2d36ba61761e1f2449 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey
Date: Fri, 31 Mar 2017 10:18:46 -0700
Subject: [PATCH] common/cuda: Fix near-hang when remote side has exited

Ignore errors caused by remote side having exited when closing CUDA IPC mappings.

openmpi/ompi#3244

Signed-off-by: Sylvain Jeaugey
---
 opal/mca/common/cuda/common_cuda.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c
index 138ad7e658e..8482588de82 100644
--- a/opal/mca/common/cuda/common_cuda.c
+++ b/opal/mca/common/cuda/common_cuda.c
@@ -1157,10 +1157,10 @@ int cuda_closememhandle(void *reg_data, mca_rcache_base_registration_t *reg)
     if (ctx_ok) {
         result = cuFunc.cuIpcCloseMemHandle((CUdeviceptr)cuda_reg->base.alloc_base);
         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
-            opal_show_help("help-mpi-common-cuda.txt", "cuIpcCloseMemHandle failed",
-                           true, result, cuda_reg->base.alloc_base);
-            opal_output(0, "Sleep on %d", getpid());
-            sleep(20);
+            if (CUDA_ERROR_DEINITIALIZED != result) {
+                opal_show_help("help-mpi-common-cuda.txt", "cuIpcCloseMemHandle failed",
+                               true, result, cuda_reg->base.alloc_base);
+            }
             /* We will just continue on and hope things continue to work. */
         } else {
             opal_output_verbose(10, mca_common_cuda_output,