From 1ebc19d155e10c43a3650897c7ac6418c6536074 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Mon, 21 Nov 2016 10:21:47 +0900 Subject: [PATCH 1/2] coll/libnbc: fix race condition with multi threaded apps protect the mca_coll_libnbc_component.active_requests list with the new mca_coll_libnbc_component.lock mutex. Thanks Jie Hu for the report Signed-off-by: Gilles Gouaillardet (back-ported from commit open-mpi/ompi@2c94a3a6f398e187dff67386e9e1a7fe4bc26f15) --- ompi/mca/coll/libnbc/coll_libnbc.h | 3 +++ ompi/mca/coll/libnbc/coll_libnbc_component.c | 8 ++++++++ ompi/mca/coll/libnbc/nbc.c | 2 ++ 3 files changed, 13 insertions(+) diff --git a/ompi/mca/coll/libnbc/coll_libnbc.h b/ompi/mca/coll/libnbc/coll_libnbc.h index 7dd69ce940f..6d10aab45b2 100644 --- a/ompi/mca/coll/libnbc/coll_libnbc.h +++ b/ompi/mca/coll/libnbc/coll_libnbc.h @@ -13,6 +13,8 @@ * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2013 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -71,6 +73,7 @@ struct ompi_coll_libnbc_component_t { opal_list_t active_requests; int32_t active_comms; opal_atomic_lock_t progress_lock; + opal_mutex_t lock; }; typedef struct ompi_coll_libnbc_component_t ompi_coll_libnbc_component_t; diff --git a/ompi/mca/coll/libnbc/coll_libnbc_component.c b/ompi/mca/coll/libnbc/coll_libnbc_component.c index 2d7b82c2b84..ca3ea4c2671 100644 --- a/ompi/mca/coll/libnbc/coll_libnbc_component.c +++ b/ompi/mca/coll/libnbc/coll_libnbc_component.c @@ -89,6 +89,7 @@ libnbc_open(void) OBJ_CONSTRUCT(&mca_coll_libnbc_component.requests, ompi_free_list_t); OBJ_CONSTRUCT(&mca_coll_libnbc_component.active_requests, opal_list_t); + OBJ_CONSTRUCT(&mca_coll_libnbc_component.lock, opal_mutex_t); ret = ompi_free_list_init(&mca_coll_libnbc_component.requests, sizeof(ompi_coll_libnbc_request_t), OBJ_CLASS(ompi_coll_libnbc_request_t), @@ -116,6 +117,7 @@ libnbc_close(void) OBJ_DESTRUCT(&mca_coll_libnbc_component.requests); OBJ_DESTRUCT(&mca_coll_libnbc_component.active_requests); + OBJ_DESTRUCT(&mca_coll_libnbc_component.lock); return OMPI_SUCCESS; } @@ -242,19 +244,25 @@ ompi_coll_libnbc_progress(void) if (opal_atomic_trylock(&mca_coll_libnbc_component.progress_lock)) return 0; + OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock); OPAL_LIST_FOREACH_SAFE(request, next, &mca_coll_libnbc_component.active_requests, ompi_coll_libnbc_request_t) { + OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock); if (NBC_OK == NBC_Progress(request)) { /* done, remove and complete */ + OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock); opal_list_remove_item(&mca_coll_libnbc_component.active_requests, &request->super.super.super); + OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock); request->super.req_status.MPI_ERROR = OMPI_SUCCESS; OPAL_THREAD_LOCK(&ompi_request_lock); ompi_request_complete(&request->super, true); OPAL_THREAD_UNLOCK(&ompi_request_lock); } + OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock); } + OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock); opal_atomic_unlock(&mca_coll_libnbc_component.progress_lock); diff --git a/ompi/mca/coll/libnbc/nbc.c b/ompi/mca/coll/libnbc/nbc.c index 4a3f5222b37..ff72e7d2445 100644 --- a/ompi/mca/coll/libnbc/nbc.c +++ b/ompi/mca/coll/libnbc/nbc.c @@ -659,7 +659,9 @@ int NBC_Start(NBC_Handle *handle, NBC_Schedule *schedule) { res = NBC_Start_round(handle); if((NBC_OK != res)) { printf("Error in NBC_Start_round() (%i)\n", res); return res; } + OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock); opal_list_append(&mca_coll_libnbc_component.active_requests, &(handle->super.super.super)); + OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock); return NBC_OK; } From ca12d2c68155340deb381521e1f55c546ea051fb Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Wed, 30 Nov 2016 17:29:21 +0900 Subject: [PATCH 2/2] coll/libnbc: add some comments on how locks are used no code change Signed-off-by: Gilles Gouaillardet (cherry picked from commit open-mpi/ompi@15098161a331168c66b29a696522fe52c8b2d8f5) --- ompi/mca/coll/libnbc/coll_libnbc.h | 4 ++-- ompi/mca/coll/libnbc/coll_libnbc_component.c | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ompi/mca/coll/libnbc/coll_libnbc.h b/ompi/mca/coll/libnbc/coll_libnbc.h index 6d10aab45b2..b6b40532c58 100644 --- a/ompi/mca/coll/libnbc/coll_libnbc.h +++ b/ompi/mca/coll/libnbc/coll_libnbc.h @@ -72,8 +72,8 @@ struct ompi_coll_libnbc_component_t { ompi_free_list_t requests; opal_list_t active_requests; int32_t active_comms; - opal_atomic_lock_t progress_lock; - opal_mutex_t lock; + opal_atomic_lock_t progress_lock; /* protect from recursive calls */ + opal_mutex_t lock; /* protect access to the active_requests list */ }; typedef struct ompi_coll_libnbc_component_t ompi_coll_libnbc_component_t; diff --git a/ompi/mca/coll/libnbc/coll_libnbc_component.c b/ompi/mca/coll/libnbc/coll_libnbc_component.c index ca3ea4c2671..fdb70745978 100644 --- a/ompi/mca/coll/libnbc/coll_libnbc_component.c +++ b/ompi/mca/coll/libnbc/coll_libnbc_component.c @@ -242,8 +242,11 @@ ompi_coll_libnbc_progress(void) { ompi_coll_libnbc_request_t* request, *next; + /* return if invoked recursively */ if (opal_atomic_trylock(&mca_coll_libnbc_component.progress_lock)) return 0; + /* process active requests, and use mca_coll_libnbc_component.lock to access the + * mca_coll_libnbc_component.active_requests list */ OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock); OPAL_LIST_FOREACH_SAFE(request, next, &mca_coll_libnbc_component.active_requests, ompi_coll_libnbc_request_t) {