Skip to content

Commit 0b793b9

Browse files
author
rhc54
authored
Merge pull request #2443 from ggouaillardet/topic/v1.10/libnbc_mt
v1.10: coll/libnbc: fix race condition with multi-threaded apps
2 parents 884763a + ca12d2c commit 0b793b9

File tree

3 files changed

+17
-1
lines changed

3 files changed

+17
-1
lines changed

ompi/mca/coll/libnbc/coll_libnbc.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights
1515
* reserved.
16+
* Copyright (c) 2016 Research Organization for Information Science
17+
* and Technology (RIST). All rights reserved.
1618
* $COPYRIGHT$
1719
*
1820
* Additional copyrights may follow
@@ -70,7 +72,8 @@ struct ompi_coll_libnbc_component_t {
7072
ompi_free_list_t requests;
7173
opal_list_t active_requests;
7274
int32_t active_comms;
73-
opal_atomic_lock_t progress_lock;
75+
opal_atomic_lock_t progress_lock; /* protect from recursive calls */
76+
opal_mutex_t lock; /* protect access to the active_requests list */
7477
};
7578
typedef struct ompi_coll_libnbc_component_t ompi_coll_libnbc_component_t;
7679

ompi/mca/coll/libnbc/coll_libnbc_component.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ libnbc_open(void)
8989

9090
OBJ_CONSTRUCT(&mca_coll_libnbc_component.requests, ompi_free_list_t);
9191
OBJ_CONSTRUCT(&mca_coll_libnbc_component.active_requests, opal_list_t);
92+
OBJ_CONSTRUCT(&mca_coll_libnbc_component.lock, opal_mutex_t);
9293
ret = ompi_free_list_init(&mca_coll_libnbc_component.requests,
9394
sizeof(ompi_coll_libnbc_request_t),
9495
OBJ_CLASS(ompi_coll_libnbc_request_t),
@@ -116,6 +117,7 @@ libnbc_close(void)
116117

117118
OBJ_DESTRUCT(&mca_coll_libnbc_component.requests);
118119
OBJ_DESTRUCT(&mca_coll_libnbc_component.active_requests);
120+
OBJ_DESTRUCT(&mca_coll_libnbc_component.lock);
119121

120122
return OMPI_SUCCESS;
121123
}
@@ -240,21 +242,30 @@ ompi_coll_libnbc_progress(void)
240242
{
241243
ompi_coll_libnbc_request_t* request, *next;
242244

245+
/* return if invoked recursively */
243246
if (opal_atomic_trylock(&mca_coll_libnbc_component.progress_lock)) return 0;
244247

248+
/* process active requests, and use mca_coll_libnbc_component.lock to access the
249+
* mca_coll_libnbc_component.active_requests list */
250+
OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock);
245251
OPAL_LIST_FOREACH_SAFE(request, next, &mca_coll_libnbc_component.active_requests,
246252
ompi_coll_libnbc_request_t) {
253+
OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock);
247254
if (NBC_OK == NBC_Progress(request)) {
248255
/* done, remove and complete */
256+
OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock);
249257
opal_list_remove_item(&mca_coll_libnbc_component.active_requests,
250258
&request->super.super.super);
259+
OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock);
251260

252261
request->super.req_status.MPI_ERROR = OMPI_SUCCESS;
253262
OPAL_THREAD_LOCK(&ompi_request_lock);
254263
ompi_request_complete(&request->super, true);
255264
OPAL_THREAD_UNLOCK(&ompi_request_lock);
256265
}
266+
OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock);
257267
}
268+
OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock);
258269

259270
opal_atomic_unlock(&mca_coll_libnbc_component.progress_lock);
260271

ompi/mca/coll/libnbc/nbc.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -659,7 +659,9 @@ int NBC_Start(NBC_Handle *handle, NBC_Schedule *schedule) {
659659
res = NBC_Start_round(handle);
660660
if((NBC_OK != res)) { printf("Error in NBC_Start_round() (%i)\n", res); return res; }
661661

662+
OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock);
662663
opal_list_append(&mca_coll_libnbc_component.active_requests, &(handle->super.super.super));
664+
OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock);
663665

664666
return NBC_OK;
665667
}

0 commit comments

Comments
 (0)