Skip to content

Commit d9aa3a0

Browse files
committed
Fix/implement the group invite support
We had pushed this off as the only users were working with the blocking forms of group construct... but now seems to be the time! Uncovered a few bugs in the basic code paths, so worth the effort. Fix a check in event notification that was looking at the wrong field for determining if a status had been included. Fix a bunch of places in client group functions that were using the wrong callback object or were simple errors. Ensure that fence uses the right group participants. Clean up the asyncgroup example as the non-leader group members were not waiting for the "group complete" event. Signed-off-by: Ralph Castain <[email protected]>
1 parent 813d8ba commit d9aa3a0

File tree

7 files changed

+238
-149
lines changed

7 files changed

+238
-149
lines changed

examples/asyncgroup.c

Lines changed: 41 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
* Copyright (c) 2019 Triad National Security, LLC. All rights
1919
* reserved.
2020
* Copyright (c) 2019 IBM Corporation. All rights reserved.
21-
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
21+
* Copyright (c) 2021-2022 Nanook Consulting. All rights reserved.
2222
* $COPYRIGHT$
2323
*
2424
* Additional copyrights may follow
@@ -48,7 +48,8 @@ static void notification_fn(size_t evhdlr_registration_id, pmix_status_t status,
4848
EXAMPLES_HIDE_UNUSED_PARAMS(evhdlr_registration_id, source,
4949
info, ninfo, results, nresults);
5050

51-
fprintf(stderr, "Client %s:%d NOTIFIED with status %d\n", myproc.nspace, myproc.rank, status);
51+
fprintf(stderr, "Client %s:%d NOTIFIED with status %s\n",
52+
myproc.nspace, myproc.rank, PMIx_Error_string(status));
5253
if (NULL != cbfunc) {
5354
cbfunc(PMIX_EVENT_ACTION_COMPLETE, NULL, 0, NULL, NULL, cbdata);
5455
}
@@ -72,12 +73,12 @@ static void errhandler_reg_callbk(pmix_status_t status, size_t errhandler_ref, v
7273
DEBUG_WAKEUP_THREAD(lock);
7374
}
7475

75-
static void grpcomplete(pmix_status_t status, pmix_info_t *info, size_t ninfo, void *cbdata,
76-
pmix_release_cbfunc_t release_fn, void *release_cbdata)
76+
static void grpcomplete(size_t evhdlr_registration_id, pmix_status_t status, const pmix_proc_t *source,
77+
pmix_info_t info[], size_t ninfo, pmix_info_t results[], size_t nresults,
78+
pmix_event_notification_cbfunc_fn_t cbfunc, void *cbdata)
7779
{
78-
EXAMPLES_HIDE_UNUSED_PARAMS(status, info, ninfo, cbdata, release_fn, release_cbdata);
80+
EXAMPLES_HIDE_UNUSED_PARAMS(evhdlr_registration_id, status, source, info, ninfo, results, nresults, cbfunc, cbdata);
7981

80-
fprintf(stderr, "%s:%d GRPCOMPLETE\n", myproc.nspace, myproc.rank);
8182
DEBUG_WAKEUP_THREAD(&invitedlock);
8283
}
8384

@@ -93,7 +94,6 @@ static void invitefn(size_t evhdlr_registration_id, pmix_status_t status, const
9394

9495
/* if I am the leader, I can ignore this event */
9596
if (PMIX_CHECK_PROCID(source, &myproc)) {
96-
fprintf(stderr, "%s:%d INVITED, BUT LEADER\n", myproc.nspace, myproc.rank);
9797
/* mark the event chain as complete */
9898
if (NULL != cbfunc) {
9999
cbfunc(PMIX_EVENT_ACTION_COMPLETE, NULL, 0, NULL, NULL, cbdata);
@@ -108,16 +108,12 @@ static void invitefn(size_t evhdlr_registration_id, pmix_status_t status, const
108108
break;
109109
}
110110
}
111-
fprintf(stderr, "Client %s:%d INVITED by source %s:%d\n", myproc.nspace, myproc.rank,
112-
source->nspace, source->rank);
113111
invitedlock.status = status;
114-
fprintf(stderr, "%s:%d ACCEPTING INVITE\n", myproc.nspace, myproc.rank);
115-
rc = PMIx_Group_join_nb(grp, source, PMIX_GROUP_ACCEPT, NULL, 0, grpcomplete, NULL);
112+
rc = PMIx_Group_join_nb(grp, source, PMIX_GROUP_ACCEPT, NULL, 0, NULL, NULL);
116113
if (PMIX_SUCCESS != rc) {
117114
fprintf(stderr, "%s:%d Error in Group_join_nb: %sn", myproc.nspace, myproc.rank,
118115
PMIx_Error_string(rc));
119116
}
120-
121117
/* mark the event chain as complete */
122118
if (NULL != cbfunc) {
123119
cbfunc(PMIX_EVENT_ACTION_COMPLETE, NULL, 0, NULL, NULL, cbdata);
@@ -134,25 +130,28 @@ int main(int argc, char **argv)
134130
pmix_status_t code;
135131
pmix_info_t *results;
136132
size_t nresults;
133+
char hostname[1024];
137134

138135
EXAMPLES_HIDE_UNUSED_PARAMS(argc, argv);
139136

137+
gethostname(hostname, sizeof(hostname));
138+
140139
/* init us */
141140
if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc, NULL, 0))) {
142141
fprintf(stderr, "Client ns %s rank %d: PMIx_Init failed: %s\n", myproc.nspace, myproc.rank,
143142
PMIx_Error_string(rc));
144143
exit(0);
145144
}
146-
fprintf(stderr, "[%d] Client ns %s rank %d: Running\n", (int) getpid(), myproc.nspace,
147-
myproc.rank);
145+
fprintf(stderr, "[%d] Client ns %s rank %d: Running on %s\n",
146+
(int) getpid(), myproc.nspace, myproc.rank, hostname);
148147

149148
DEBUG_CONSTRUCT_LOCK(&invitedlock);
150149

151150
PMIX_LOAD_PROCID(&proc, myproc.nspace, PMIX_RANK_WILDCARD);
152151

153-
/* get our universe size */
154-
if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, &val))) {
155-
fprintf(stderr, "Client ns %s rank %d: PMIx_Get universe size failed: %s\n", myproc.nspace,
152+
/* get our job size */
153+
if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_JOB_SIZE, NULL, 0, &val))) {
154+
fprintf(stderr, "Client ns %s rank %d: PMIx_Get job size failed: %s\n", myproc.nspace,
156155
myproc.rank, PMIx_Error_string(rc));
157156
goto done;
158157
}
@@ -164,7 +163,7 @@ int main(int argc, char **argv)
164163
}
165164
goto done;
166165
}
167-
fprintf(stderr, "Client %s:%d universe size %d\n", myproc.nspace, myproc.rank, nprocs);
166+
fprintf(stderr, "Client %s:%d job size %d\n", myproc.nspace, myproc.rank, nprocs);
168167

169168
/* register our default errhandler */
170169
DEBUG_CONSTRUCT_LOCK(&lock);
@@ -187,6 +186,18 @@ int main(int argc, char **argv)
187186
if (PMIX_SUCCESS != rc) {
188187
goto done;
189188
}
189+
if (2 == myproc.rank || 3 == myproc.rank) {
190+
/* need to register for group complete event */
191+
DEBUG_CONSTRUCT_LOCK(&lock);
192+
code = PMIX_GROUP_CONSTRUCT_COMPLETE;
193+
PMIx_Register_event_handler(&code, 1, NULL, 0, grpcomplete, errhandler_reg_callbk, (void *) &lock);
194+
DEBUG_WAIT_THREAD(&lock);
195+
rc = lock.status;
196+
DEBUG_DESTRUCT_LOCK(&lock);
197+
if (PMIX_SUCCESS != rc) {
198+
goto done;
199+
}
200+
}
190201

191202
/* call fence to sync */
192203
PMIX_LOAD_PROCID(&proc, myproc.nspace, PMIX_RANK_WILDCARD);
@@ -205,10 +216,10 @@ int main(int argc, char **argv)
205216
PMIX_PROC_LOAD(&procs[1], myproc.nspace, 2);
206217
PMIX_PROC_LOAD(&procs[2], myproc.nspace, 3);
207218
rc = PMIx_Group_invite("ourgroup", procs, nprocs, NULL, 0, &results, &nresults);
219+
fprintf(stderr, "Client ns %s rank %d: Group invite complete with status %s!\n",
220+
myproc.nspace, myproc.rank, PMIx_Error_string(rc));
208221
if (PMIX_SUCCESS != rc) {
209-
fprintf(stderr, "Client ns %s rank %d: PMIx_Group_invite failed: %s\n", myproc.nspace,
210-
myproc.rank, PMIx_Error_string(rc));
211-
goto done;
222+
exit(1);
212223
}
213224
PMIX_PROC_FREE(procs, nprocs);
214225
fprintf(stderr, "%s:%d Execute fence across group\n", myproc.nspace, myproc.rank);
@@ -217,29 +228,30 @@ int main(int argc, char **argv)
217228
if (PMIX_SUCCESS != rc) {
218229
fprintf(stderr, "Client ns %s rank %d: PMIx_Fence across group failed: %d\n",
219230
myproc.nspace, myproc.rank, rc);
220-
goto done;
231+
exit(1);
221232
}
222-
fprintf(stderr, "%d executing Group_destruct\n", myproc.rank);
233+
fprintf(stderr, "%d Executing Group_destruct\n", myproc.rank);
223234
rc = PMIx_Group_destruct("ourgroup", NULL, 0);
224235
if (PMIX_SUCCESS != rc) {
225-
fprintf(stderr, "Client ns %s rank %d: PMIx_Group_destruct failed: %s\n", myproc.nspace,
226-
myproc.rank, PMIx_Error_string(rc));
227-
goto done;
236+
fprintf(stderr, "Client ns %s rank %d: PMIx_Group_destruct failed: %s\n",
237+
myproc.nspace, myproc.rank, PMIx_Error_string(rc));
238+
exit(1);
228239
}
229240
} else if (2 == myproc.rank || 3 == myproc.rank) {
230241
/* wait to be invited */
231-
fprintf(stderr, "%s:%d waiting to be invited\n", myproc.nspace, myproc.rank);
242+
fprintf(stderr, "%s:%d waiting to join group\n", myproc.nspace, myproc.rank);
232243
DEBUG_WAIT_THREAD(&invitedlock);
233244
DEBUG_DESTRUCT_LOCK(&invitedlock);
234-
fprintf(stderr, "%s:%d Execute fence across group\n", myproc.nspace, myproc.rank);
245+
fprintf(stderr, "%s:%d Group complete - executing fence across group\n",
246+
myproc.nspace, myproc.rank);
235247
PMIX_PROC_LOAD(&proc, "ourgroup", PMIX_RANK_WILDCARD);
236248
rc = PMIx_Fence(&proc, 1, NULL, 0);
237249
if (PMIX_SUCCESS != rc) {
238250
fprintf(stderr, "Client ns %s rank %d: PMIx_Fence across group failed: %d\n",
239251
myproc.nspace, myproc.rank, rc);
240252
goto done;
241253
}
242-
fprintf(stderr, "%d executing Group_destruct\n", myproc.rank);
254+
fprintf(stderr, "%d Executing Group_destruct\n", myproc.rank);
243255
rc = PMIx_Group_destruct("ourgroup", NULL, 0);
244256
if (PMIX_SUCCESS != rc) {
245257
fprintf(stderr, "Client ns %s rank %d: PMIx_Group_destruct failed: %s\n", myproc.nspace,

src/client/pmix_client_fence.c

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,8 @@ PMIX_EXPORT pmix_status_t PMIx_Fence(const pmix_proc_t procs[], size_t nprocs,
6969

7070
PMIX_ACQUIRE_THREAD(&pmix_global_lock);
7171

72-
pmix_output_verbose(2, pmix_client_globals.fence_output, "pmix: executing fence");
72+
pmix_output_verbose(2, pmix_client_globals.fence_output,
73+
"pmix: executing fence");
7374

7475
if (pmix_globals.init_cntr <= 0) {
7576
PMIX_RELEASE_THREAD(&pmix_global_lock);
@@ -106,7 +107,8 @@ PMIX_EXPORT pmix_status_t PMIx_Fence(const pmix_proc_t procs[], size_t nprocs,
106107
rc = cb->status;
107108
PMIX_RELEASE(cb);
108109

109-
pmix_output_verbose(2, pmix_client_globals.fence_output, "pmix: fence released");
110+
pmix_output_verbose(2, pmix_client_globals.fence_output,
111+
"pmix: fence released");
110112

111113
return rc;
112114
}
@@ -124,7 +126,8 @@ PMIX_EXPORT pmix_status_t PMIx_Fence_nb(const pmix_proc_t procs[], size_t nprocs
124126

125127
PMIX_ACQUIRE_THREAD(&pmix_global_lock);
126128

127-
pmix_output_verbose(2, pmix_client_globals.fence_output, "pmix: fence_nb called");
129+
pmix_output_verbose(2, pmix_client_globals.fence_output,
130+
"pmix: fence_nb called");
128131

129132
if (pmix_globals.init_cntr <= 0) {
130133
PMIX_RELEASE_THREAD(&pmix_global_lock);

0 commit comments

Comments
 (0)