Skip to content

Commit 2ae3cfd

Browse files
authored
Merge pull request #5699 from ICLDisco/export/coll_errors
Error cases in base collectives
2 parents b0e2975 + 466217f commit 2ae3cfd

9 files changed

+136
-31
lines changed

ompi/mca/coll/base/coll_base_allreduce.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count,
350350
char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL};
351351
ptrdiff_t true_lb, true_extent, lb, extent;
352352
ptrdiff_t block_offset, max_real_segsize;
353-
ompi_request_t *reqs[2] = {NULL, NULL};
353+
ompi_request_t *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
354354

355355
size = ompi_comm_size(comm);
356356
rank = ompi_comm_rank(comm);
@@ -528,6 +528,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count,
528528
error_hndl:
529529
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
530530
__FILE__, line, rank, ret));
531+
ompi_coll_base_free_reqs(reqs, 2);
531532
(void)line; // silence compiler warning
532533
if (NULL != inbuf[0]) free(inbuf[0]);
533534
if (NULL != inbuf[1]) free(inbuf[1]);
@@ -627,7 +628,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int
627628
size_t typelng;
628629
char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL};
629630
ptrdiff_t block_offset, max_real_segsize;
630-
ompi_request_t *reqs[2] = {NULL, NULL};
631+
ompi_request_t *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
631632
ptrdiff_t lb, extent, gap;
632633

633634
size = ompi_comm_size(comm);
@@ -847,6 +848,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int
847848
error_hndl:
848849
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
849850
__FILE__, line, rank, ret));
851+
ompi_coll_base_free_reqs(reqs, 2);
850852
(void)line; // silence compiler warning
851853
if (NULL != inbuf[0]) free(inbuf[0]);
852854
if (NULL != inbuf[1]) free(inbuf[1]);

ompi/mca/coll/base/coll_base_alltoall.c

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,7 @@ int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount,
393393
if (0 < total_reqs) {
394394
reqs = ompi_coll_base_comm_get_reqs(module->base_data, 2 * total_reqs);
395395
if (NULL == reqs) { error = -1; line = __LINE__; goto error_hndl; }
396+
reqs[0] = reqs[1] = MPI_REQUEST_NULL;
396397
}
397398

398399
prcv = (char *) rbuf;
@@ -468,6 +469,15 @@ int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount,
468469
return MPI_SUCCESS;
469470

470471
error_hndl:
472+
/* find a real error code */
473+
if (MPI_ERR_IN_STATUS == error) {
474+
for( ri = 0; ri < nreqs; ri++ ) {
475+
if (MPI_REQUEST_NULL == reqs[ri]) continue;
476+
if (MPI_ERR_PENDING == reqs[ri]->req_status.MPI_ERROR) continue;
477+
error = reqs[ri]->req_status.MPI_ERROR;
478+
break;
479+
}
480+
}
471481
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
472482
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error,
473483
rank));
@@ -661,7 +671,16 @@ int ompi_coll_base_alltoall_intra_basic_linear(const void *sbuf, int scount,
661671
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
662672

663673
err_hndl:
664-
if( MPI_SUCCESS != err ) {
674+
if (MPI_SUCCESS != err) {
675+
/* find a real error code */
676+
if (MPI_ERR_IN_STATUS == err) {
677+
for( i = 0; i < nreqs; i++ ) {
678+
if (MPI_REQUEST_NULL == req[i]) continue;
679+
if (MPI_ERR_PENDING == req[i]->req_status.MPI_ERROR) continue;
680+
err = req[i]->req_status.MPI_ERROR;
681+
break;
682+
}
683+
}
665684
OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
666685
__FILE__, line, err, rank) );
667686
(void)line; // silence compiler warning

ompi/mca/coll/base/coll_base_alltoallv.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2016 The University of Tennessee and The University
6+
* Copyright (c) 2004-2017 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -276,6 +276,15 @@ ompi_coll_base_alltoallv_intra_basic_linear(const void *sbuf, const int *scounts
276276
err = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE);
277277

278278
err_hndl:
279+
/* find a real error code */
280+
if (MPI_ERR_IN_STATUS == err) {
281+
for( i = 0; i < nreqs; i++ ) {
282+
if (MPI_REQUEST_NULL == reqs[i]) continue;
283+
if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue;
284+
err = reqs[i]->req_status.MPI_ERROR;
285+
break;
286+
}
287+
}
279288
/* Free the requests in all cases as they are persistent */
280289
ompi_coll_base_free_reqs(reqs, nreqs);
281290

ompi/mca/coll/base/coll_base_barrier.c

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2016 The University of Tennessee and The University
6+
* Copyright (c) 2004-2017 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -102,8 +102,10 @@ int ompi_coll_base_barrier_intra_doublering(struct ompi_communicator_t *comm,
102102
{
103103
int rank, size, err = 0, line = 0, left, right;
104104

105-
rank = ompi_comm_rank(comm);
106105
size = ompi_comm_size(comm);
106+
if( 1 == size )
107+
return OMPI_SUCCESS;
108+
rank = ompi_comm_rank(comm);
107109

108110
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_barrier_intra_doublering rank %d", rank));
109111

@@ -172,8 +174,10 @@ int ompi_coll_base_barrier_intra_recursivedoubling(struct ompi_communicator_t *c
172174
{
173175
int rank, size, adjsize, err, line, mask, remote;
174176

175-
rank = ompi_comm_rank(comm);
176177
size = ompi_comm_size(comm);
178+
if( 1 == size )
179+
return OMPI_SUCCESS;
180+
rank = ompi_comm_rank(comm);
177181
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
178182
"ompi_coll_base_barrier_intra_recursivedoubling rank %d",
179183
rank));
@@ -251,8 +255,10 @@ int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm,
251255
{
252256
int rank, size, distance, to, from, err, line = 0;
253257

254-
rank = ompi_comm_rank(comm);
255258
size = ompi_comm_size(comm);
259+
if( 1 == size )
260+
return MPI_SUCCESS;
261+
rank = ompi_comm_rank(comm);
256262
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
257263
"ompi_coll_base_barrier_intra_bruck rank %d", rank));
258264

@@ -285,16 +291,19 @@ int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm,
285291
int ompi_coll_base_barrier_intra_two_procs(struct ompi_communicator_t *comm,
286292
mca_coll_base_module_t *module)
287293
{
288-
int remote, err;
294+
int remote, size, err;
295+
296+
size = ompi_comm_size(comm);
297+
if( 1 == size )
298+
return MPI_SUCCESS;
299+
if( 2 != ompi_comm_size(comm) ) {
300+
return MPI_ERR_UNSUPPORTED_OPERATION;
301+
}
289302

290303
remote = ompi_comm_rank(comm);
291304
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
292305
"ompi_coll_base_barrier_intra_two_procs rank %d", remote));
293306

294-
if (2 != ompi_comm_size(comm)) {
295-
return MPI_ERR_UNSUPPORTED_OPERATION;
296-
}
297-
298307
remote = (remote + 1) & 0x1;
299308

300309
err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
@@ -324,8 +333,10 @@ int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
324333
int i, err, rank, size, line;
325334
ompi_request_t** requests = NULL;
326335

327-
rank = ompi_comm_rank(comm);
328336
size = ompi_comm_size(comm);
337+
if( 1 == size )
338+
return MPI_SUCCESS;
339+
rank = ompi_comm_rank(comm);
329340

330341
/* All non-root send & receive zero-length message. */
331342
if (rank > 0) {
@@ -367,11 +378,21 @@ int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
367378
/* All done */
368379
return MPI_SUCCESS;
369380
err_hndl:
381+
if( NULL != requests ) {
382+
/* find a real error code */
383+
if (MPI_ERR_IN_STATUS == err) {
384+
for( i = 0; i < size; i++ ) {
385+
if (MPI_REQUEST_NULL == requests[i]) continue;
386+
if (MPI_ERR_PENDING == requests[i]->req_status.MPI_ERROR) continue;
387+
err = requests[i]->req_status.MPI_ERROR;
388+
break;
389+
}
390+
}
391+
ompi_coll_base_free_reqs(requests, size);
392+
}
370393
OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
371394
__FILE__, line, err, rank) );
372395
(void)line; // silence compiler warning
373-
if( NULL != requests )
374-
ompi_coll_base_free_reqs(requests, size);
375396
return err;
376397
}
377398
/* copied function (with appropriate renaming) ends here */
@@ -385,8 +406,10 @@ int ompi_coll_base_barrier_intra_tree(struct ompi_communicator_t *comm,
385406
{
386407
int rank, size, depth, err, jump, partner;
387408

388-
rank = ompi_comm_rank(comm);
389409
size = ompi_comm_size(comm);
410+
if( 1 == size )
411+
return MPI_SUCCESS;
412+
rank = ompi_comm_rank(comm);
390413
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
391414
"ompi_coll_base_barrier_intra_tree %d",
392415
rank));

ompi/mca/coll/base/coll_base_bcast.c

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2016 The University of Tennessee and The University
6+
* Copyright (c) 2004-2017 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -214,13 +214,29 @@ ompi_coll_base_bcast_intra_generic( void* buffer,
214214
return (MPI_SUCCESS);
215215

216216
error_hndl:
217-
OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
218-
__FILE__, line, err, rank) );
219-
(void)line; // silence compiler warnings
217+
if (MPI_ERR_IN_STATUS == err) {
218+
for( req_index = 0; req_index < 2; req_index++ ) {
219+
if (MPI_REQUEST_NULL == recv_reqs[req_index]) continue;
220+
if (MPI_ERR_PENDING == recv_reqs[req_index]->req_status.MPI_ERROR) continue;
221+
err = recv_reqs[req_index]->req_status.MPI_ERROR;
222+
break;
223+
}
224+
}
220225
ompi_coll_base_free_reqs( recv_reqs, 2);
221226
if( NULL != send_reqs ) {
227+
if (MPI_ERR_IN_STATUS == err) {
228+
for( req_index = 0; req_index < tree->tree_nextsize; req_index++ ) {
229+
if (MPI_REQUEST_NULL == send_reqs[req_index]) continue;
230+
if (MPI_ERR_PENDING == send_reqs[req_index]->req_status.MPI_ERROR) continue;
231+
err = send_reqs[req_index]->req_status.MPI_ERROR;
232+
break;
233+
}
234+
}
222235
ompi_coll_base_free_reqs(send_reqs, tree->tree_nextsize);
223236
}
237+
OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
238+
__FILE__, line, err, rank) );
239+
(void)line; // silence compiler warnings
224240

225241
return err;
226242
}
@@ -649,12 +665,21 @@ ompi_coll_base_bcast_intra_basic_linear(void *buff, int count,
649665
* care what the error was -- just that there *was* an error. The
650666
* PML will finish all requests, even if one or more of them fail.
651667
* i.e., by the end of this call, all the requests are free-able.
652-
* So free them anyway -- even if there was an error, and return
653-
* the error after we free everything. */
668+
* So free them anyway -- even if there was an error.
669+
* Note we still need to get the actual error, as collective
670+
* operations cannot return MPI_ERR_IN_STATUS.
671+
*/
654672

655673
err = ompi_request_wait_all(i, reqs, MPI_STATUSES_IGNORE);
656674
err_hndl:
657675
if( MPI_SUCCESS != err ) { /* Free the reqs */
676+
/* first find the real error code */
677+
for( preq = reqs; preq < reqs+i; preq++ ) {
678+
if (MPI_REQUEST_NULL == *preq) continue;
679+
if (MPI_ERR_PENDING == (*preq)->req_status.MPI_ERROR) continue;
680+
err = (*preq)->req_status.MPI_ERROR;
681+
break;
682+
}
658683
ompi_coll_base_free_reqs(reqs, i);
659684
}
660685

ompi/mca/coll/base/coll_base_gather.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,15 @@ ompi_coll_base_gather_intra_linear_sync(const void *sbuf, int scount,
326326
return MPI_SUCCESS;
327327
error_hndl:
328328
if (NULL != reqs) {
329+
/* find a real error code */
330+
if (MPI_ERR_IN_STATUS == ret) {
331+
for( i = 0; i < size; i++ ) {
332+
if (MPI_REQUEST_NULL == reqs[i]) continue;
333+
if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue;
334+
ret = reqs[i]->req_status.MPI_ERROR;
335+
break;
336+
}
337+
}
329338
ompi_coll_base_free_reqs(reqs, size);
330339
}
331340
OPAL_OUTPUT (( ompi_coll_base_framework.framework_output,

ompi/mca/coll/base/coll_base_reduce.c

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -338,16 +338,34 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, int origi
338338
return OMPI_SUCCESS;
339339

340340
error_hndl: /* error handler */
341+
/* find a real error code */
342+
if (MPI_ERR_IN_STATUS == ret) {
343+
for( i = 0; i < 2; i++ ) {
344+
if (MPI_REQUEST_NULL == reqs[i]) continue;
345+
if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue;
346+
ret = reqs[i]->req_status.MPI_ERROR;
347+
break;
348+
}
349+
}
350+
ompi_coll_base_free_reqs(reqs, 2);
351+
if( NULL != sreq ) {
352+
if (MPI_ERR_IN_STATUS == ret) {
353+
for( i = 0; i < max_outstanding_reqs; i++ ) {
354+
if (MPI_REQUEST_NULL == sreq[i]) continue;
355+
if (MPI_ERR_PENDING == sreq[i]->req_status.MPI_ERROR) continue;
356+
ret = sreq[i]->req_status.MPI_ERROR;
357+
break;
358+
}
359+
}
360+
ompi_coll_base_free_reqs(sreq, max_outstanding_reqs);
361+
}
362+
if( inbuf_free[0] != NULL ) free(inbuf_free[0]);
363+
if( inbuf_free[1] != NULL ) free(inbuf_free[1]);
364+
if( accumbuf_free != NULL ) free(accumbuf);
341365
OPAL_OUTPUT (( ompi_coll_base_framework.framework_output,
342366
"ERROR_HNDL: node %d file %s line %d error %d\n",
343367
rank, __FILE__, line, ret ));
344368
(void)line; // silence compiler warning
345-
if( inbuf_free[0] != NULL ) free(inbuf_free[0]);
346-
if( inbuf_free[1] != NULL ) free(inbuf_free[1]);
347-
if( accumbuf_free != NULL ) free(accumbuf);
348-
if( NULL != sreq ) {
349-
ompi_coll_base_free_reqs(sreq, max_outstanding_reqs);
350-
}
351369
return ret;
352370
}
353371

ompi/mca/coll/base/coll_base_reduce_scatter.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -464,7 +464,7 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, const in
464464
char *tmpsend = NULL, *tmprecv = NULL, *accumbuf = NULL, *accumbuf_free = NULL;
465465
char *inbuf_free[2] = {NULL, NULL}, *inbuf[2] = {NULL, NULL};
466466
ptrdiff_t extent, max_real_segsize, dsize, gap = 0;
467-
ompi_request_t *reqs[2] = {NULL, NULL};
467+
ompi_request_t *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
468468

469469
size = ompi_comm_size(comm);
470470
rank = ompi_comm_rank(comm);

ompi/mca/coll/base/coll_base_util.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ int ompi_coll_base_sendrecv_actual( const void* sendbuf, size_t scount,
4141
{ /* post receive first, then send, then wait... should be fast (I hope) */
4242
int err, line = 0;
4343
size_t rtypesize, stypesize;
44-
ompi_request_t *req;
44+
ompi_request_t *req = MPI_REQUEST_NULL;
4545
ompi_status_public_t rstatus;
4646

4747
/* post new irecv */

0 commit comments

Comments
 (0)