Commit bf267b3

coll/han/alltoallv: Fix logic around waitany
The logic for waitany was flawed. While we could wait for either a send or a receive, we cannot consume receives in any order, and likewise for sends. Fix this by simply ping-ponging between waiting for sends or receives and cycling when there is nothing to wait on.

Signed-off-by: Luke Robison <[email protected]>
1 parent 161fd69 commit bf267b3
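
For context, the wait pattern the commit moves to can be sketched in isolation. The following is a minimal illustration against the public MPI API (MPI_Wait) rather than the internal ompi_request_* calls used in the diff below; the names nreqs, requests, jloop and have_completion mirror the patch, while the helper wait_in_order itself is purely hypothetical.

#include <mpi.h>

/* Illustrative only: instead of MPI_Waitany(), cycle through the request
 * array in a fixed order, skipping slots that hold MPI_REQUEST_NULL.
 * Waiting on the slots in posting order is what keeps sends and receives
 * each consumed in the order they were posted. */
static int wait_in_order(int nreqs, MPI_Request *requests, long jloop, int *have_completion)
{
    int jreq = (int)(jloop % nreqs);   /* ping-pong / cycle between slots */
    *have_completion = 0;
    if (requests[jreq] == MPI_REQUEST_NULL) {
        return MPI_SUCCESS;            /* nothing pending here; caller just continues the loop */
    }
    /* MPI_Wait resets a completed (non-persistent) request to MPI_REQUEST_NULL. */
    int rc = MPI_Wait(&requests[jreq], MPI_STATUS_IGNORE);
    if (rc == MPI_SUCCESS) {
        *have_completion = 1;
    }
    return rc;
}

The fixed order means a completion may sit unobserved until its slot comes around again, but it guarantees the in-order consumption that the commit message calls out.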

File tree

1 file changed: +23 -19 lines
ompi/mca/coll/han/coll_han_alltoallv.c

Lines changed: 23 additions & 19 deletions
@@ -251,10 +251,16 @@ static inline int alltoallv_sendrecv_w_direct_for_debugging(
         if (jloop < nreqs){
             jreq = jloop;
             have_completion = 0;
+            requests[jreq] = MPI_REQUEST_NULL;
         } else {
             have_completion = 1;
-            rc = ompi_request_wait_any( nreqs, requests, &jreq, MPI_STATUS_IGNORE );
+            jreq = jloop%nreqs;
+            if (requests[jreq] == MPI_REQUEST_NULL) {
+                continue;
+            }
+            rc = ompi_request_wait(&requests[jreq], MPI_STATUS_IGNORE);
             if (rc) break;
+            requests[jreq] = MPI_REQUEST_NULL;
         }
         int ii_send_req = jreq >= jfirst_sendreq;
         if (have_completion) {
@@ -264,7 +270,6 @@ static inline int alltoallv_sendrecv_w_direct_for_debugging(
             jrecvs_completed++;
         }
 
-        requests[jreq] = &ompi_request_null.request;
         if (ii_send_req && jsends_posted < ntypes_send) {
             rc = ompi_datatype_create_contiguous( 1, (ompi_datatype_t*)send_types[jsends_posted], &yuck_ompi_dtype_from_opal );
             if (rc) break;
@@ -348,7 +353,6 @@ static int alltoallv_sendrecv_w(
     int jreq;
     int jfirst_sendreq = nbufs/2 + nbufs%2;
     size_t recv_post_remaining_bytes;
-    int rc;
 
     size_t jloop = 0;
     size_t send_pack_bytes_remaining = 0;
@@ -408,6 +412,7 @@ static int alltoallv_sendrecv_w(
     */
     jtype_send = -1;
     jtype_recv = -1;
+    int sequential_continues = 0;
     for (jloop=0; ; jloop++) {
         int ii_more_sends_to_post = jtype_send < ntypes_send || send_pack_bytes_remaining > 0;
         int ii_more_sends_to_complete = nsend_req_pending > 0;
@@ -424,24 +429,23 @@ static int alltoallv_sendrecv_w(
 
         if (jloop >= nreqs) {
             /* Common Case: */
-            /* wait for any send or recv to complete */
-            rc = ompi_request_wait_any(nreqs, requests, &jreq, MPI_STATUS_IGNORE);
-            if (rc != 0) {
-                opal_output_verbose(1, mca_coll_han_component.han_output,
-                    "ompi_request_wait_any returned error code %d in alltoallv_sendrecv_w (loop=%ld)\n",rc,jloop);
-                return rc;
+            /* wait for the send or recv to complete */
+            jreq = jloop%nreqs;
+            if (requests[jreq] == MPI_REQUEST_NULL) {
+                if (++sequential_continues > nbufs) {
+                    opal_output_verbose(1, mca_coll_han_component.han_output,
+                        "ERROR: no active requests to wait on! Loop=%ld: %d %d %d %d\n",
+                        jloop,
+                        ii_more_sends_to_post, ii_more_sends_to_complete,
+                        ii_more_recvs_to_post, ii_more_recvs_to_complete );
+                    return MPI_ERR_INTERN;
+                }
+                continue;
             }
+            sequential_continues = 0;
+            ompi_request_wait( &requests[jreq], MPI_STATUS_IGNORE );
             have_completion = 1;
-            if (jreq == MPI_UNDEFINED) {
-                opal_output_verbose(1, mca_coll_han_component.han_output,
-                    "ERROR: no active requests to wait on! Loop=%ld: %d %d %d %d\n",
-                    jloop,
-                    ii_more_sends_to_post, ii_more_sends_to_complete,
-                    ii_more_recvs_to_post, ii_more_recvs_to_complete );
-                have_completion = 0;
-                jreq = jloop % nbufs;
-                return MPI_ERR_INTERN;
-            }
+            requests[jreq] = MPI_REQUEST_NULL;
         } else {
             /* priming the loop: post sends or recvs while have_completion=0.