Skip to content

Commit 7826fb8

Browse files
authored
Merge pull request #2117 from rhc54/topic/fnc
Fix fence timeout
2 parents 610c485 + 9b32b4b commit 7826fb8

File tree

3 files changed

+30
-15
lines changed

3 files changed

+30
-15
lines changed

src/server/pmix_server.c

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -3274,7 +3274,6 @@ static void _mdxcbfunc(int sd, short argc, void *cbdata)
32743274
PMIX_LIST_FOREACH_SAFE(cd, nxt, &tracker->local_cbs, pmix_server_caddy_t) {
32753275
reply = PMIX_NEW(pmix_buffer_t);
32763276
if (NULL == reply) {
3277-
rc = PMIX_ERR_NOMEM;
32783277
break;
32793278
}
32803279
/* setup the reply, starting with the returned status */
@@ -3286,8 +3285,8 @@ static void _mdxcbfunc(int sd, short argc, void *cbdata)
32863285
pmix_output_verbose(2, pmix_server_globals.base_output,
32873286
"server:modex_cbfunc reply being sent to %s:%u",
32883287
cd->peer->info->pname.nspace, cd->peer->info->pname.rank);
3289-
PMIX_SERVER_QUEUE_REPLY(rc, cd->peer, cd->hdr.tag, reply);
3290-
if (PMIX_SUCCESS != rc) {
3288+
PMIX_SERVER_QUEUE_REPLY(ret, cd->peer, cd->hdr.tag, reply);
3289+
if (PMIX_SUCCESS != ret) {
32913290
PMIX_RELEASE(reply);
32923291
}
32933292
/* remove this entry */

src/server/pmix_server_ops.c

Lines changed: 18 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -574,20 +574,18 @@ static pmix_server_trkr_t* new_tracker(char *id, pmix_proc_t *procs,
574574

575575
static void fence_timeout(int sd, short args, void *cbdata)
576576
{
577-
pmix_server_caddy_t *cd = (pmix_server_caddy_t*)cbdata;
577+
pmix_server_trkr_t *trk = (pmix_server_trkr_t*)cbdata;
578578

579579
pmix_output_verbose(2, pmix_server_globals.fence_output,
580580
"ALERT: fence timeout fired");
581581

582582
/* execute the provided callback function with the error */
583-
if (NULL != cd->trk->modexcbfunc) {
584-
cd->trk->modexcbfunc(PMIX_ERR_TIMEOUT, NULL, 0, cd->trk, NULL, NULL);
583+
if (NULL != trk->modexcbfunc) {
584+
trk->modexcbfunc(PMIX_ERR_TIMEOUT, NULL, 0, trk, NULL, NULL);
585585
return; // the cbfunc will have cleaned up the tracker
586586
}
587-
cd->event_active = false;
588-
/* remove it from the list */
589-
pmix_list_remove_item(&cd->trk->local_cbs, &cd->super);
590-
PMIX_RELEASE(cd);
587+
trk->event_active = false;
588+
PMIX_RELEASE(trk);
591589
}
592590

593591
static pmix_status_t _collect_data(pmix_server_trkr_t *trk,
@@ -1039,11 +1037,9 @@ pmix_status_t pmix_server_fence(pmix_server_caddy_t *cd,
10391037
* notified when we are done */
10401038
pmix_list_append(&trk->local_cbs, &cd->super);
10411039
/* if a timeout was specified, set it */
1042-
if (0 < tv.tv_sec) {
1043-
PMIX_RETAIN(trk);
1044-
cd->trk = trk;
1045-
PMIX_THREADSHIFT_DELAY(cd, fence_timeout, tv.tv_sec);
1046-
cd->event_active = true;
1040+
if (0 < tv.tv_sec && !trk->event_active) {
1041+
PMIX_THREADSHIFT_DELAY(trk, fence_timeout, tv.tv_sec);
1042+
trk->event_active = true;
10471043
}
10481044

10491045
/* if all local contributions have been received,
@@ -1054,6 +1050,16 @@ pmix_status_t pmix_server_fence(pmix_server_caddy_t *cd,
10541050
pmix_list_get_size(&trk->local_cbs) == trk->nlocal) {
10551051
pmix_output_verbose(2, pmix_server_globals.fence_output,
10561052
"fence LOCALLY complete");
1053+
/* if a timeout was set, then we delete it here as we can
1054+
* ONLY check for local completion. Otherwise, passing
1055+
* the tracker object up to the host can result in
1056+
* competing timeout events, and the host could return
1057+
* the tracker AFTER we released it due to our internal
1058+
* timeout firing */
1059+
if (trk->event_active) {
1060+
pmix_event_del(&trk->ev);
1061+
trk->event_active = false;
1062+
}
10571063
/* if this is a purely local fence (i.e., all participants are local),
10581064
* then it is done and we notify accordingly */
10591065
if (pmix_server_globals.fence_localonly_opt && trk->local) {

test/test_v2/server_callbacks.c

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
* Copyright (c) 2015-2018 Mellanox Technologies, Inc.
66
* All rights reserved.
77
* Copyright (c) 2016 IBM Corporation. All rights reserved.
8+
* Copyright (c) 2021 Nanook Consulting All rights reserved.
89
* $COPYRIGHT$
910
*
1011
* Additional copyrights may follow
@@ -124,9 +125,18 @@ pmix_status_t fencenb_fn(const pmix_proc_t procs[], size_t nprocs,
124125
char *data, size_t ndata,
125126
pmix_modex_cbfunc_t cbfunc, void *cbdata)
126127
{
128+
size_t n;
129+
127130
TEST_VERBOSE(("Getting data for %s:%d",
128131
procs[0].nspace, procs[0].rank));
129132

133+
/* see if we are asked to do something we don't support */
134+
for (n=0; n < ninfo; n++) {
135+
if (PMIX_CHECK_KEY(&info[n], PMIX_TIMEOUT)) {
136+
return PMIX_ERR_NOT_SUPPORTED;
137+
}
138+
}
139+
130140
if ((pmix_list_get_size(server_list) == 1) && (my_server_id == 0)) {
131141
if (NULL != cbfunc) {
132142
cbfunc(PMIX_SUCCESS, data, ndata, cbdata, NULL, NULL);

0 commit comments

Comments
 (0)