Commit 4ccfe9f
coll/han/alltoallv: handle errors better
Previously, if one rank hit an error it would exit the collective immediately, which likely causes that pid to abort. Unfortunately, other ranks may still be accessing the failed rank's shared memory, so this caused immediate segfaults and often cascading walls of segfault text. I have moved the existing MPI_Barrier so that all processes must pass it before exiting, regardless of error condition. This means that errors will now either hang or be encountered by all ranks.

Signed-off-by: Luke Robison <[email protected]>
1 parent 317a1c4 commit 4ccfe9f
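
The control-flow change is easiest to see in isolation. Below is a minimal sketch of the pattern, not the han module's actual code: it uses the public MPI API instead of the module's internal low_comm coll_barrier call, and do_exchange()/free_shared_resources() are hypothetical stand-ins for the exchange loop and shared-memory teardown in coll_han_alltoallv.c.

    #include <mpi.h>

    /* Hypothetical stand-ins for the real exchange loop and the
     * shared-memory teardown in coll_han_alltoallv.c. */
    static int do_exchange(MPI_Comm comm) { (void)comm; return MPI_SUCCESS; }
    static void free_shared_resources(void) { }

    static int exchange_then_teardown(MPI_Comm comm)
    {
        int rc = do_exchange(comm);   /* any rank may fail independently */
        if (rc != MPI_SUCCESS) {
            goto cleanup;             /* jump to cleanup, do not return yet */
        }

        /* ... further exchange steps, each with "goto cleanup" on error ... */

    cleanup:
        /* The barrier now sits on the cleanup path, ahead of any frees:
         * every rank, error or not, synchronizes here, so no rank tears
         * down shared memory that a peer may still be reading. */
        MPI_Barrier(comm);
        free_shared_resources();
        return rc;
    }

Before this change, the error path returned ahead of the barrier, so the failing rank began teardown while its peers were still accessing its shared memory; that is what produced the cascading segfaults described above.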

File tree

1 file changed (+78, -35 lines)

ompi/mca/coll/han/coll_han_alltoallv.c

Lines changed: 78 additions & 35 deletions

@@ -184,7 +184,7 @@ static inline int alltoallv_sendrecv_w_direct_for_debugging(
             have_completion = 0;
         } else {
             have_completion = 1;
-            ompi_request_wait_any( nreqs, requests, &jreq, MPI_STATUS_IGNORE );
+            rc = ompi_request_wait_any( nreqs, requests, &jreq, MPI_STATUS_IGNORE );
         }
         int ii_send_req = jreq >= jfirst_sendreq;
         if (have_completion) {
@@ -197,28 +197,41 @@ static inline int alltoallv_sendrecv_w_direct_for_debugging(
         requests[jreq] = &ompi_request_null.request;
         if (ii_send_req && jsends_posted < ntypes_send) {
             rc = ompi_datatype_create_contiguous( 1, (ompi_datatype_t*)send_types[jsends_posted], &yuck_ompi_dtype_from_opal );
-            ompi_datatype_commit(&yuck_ompi_dtype_from_opal);
-            MCA_PML_CALL(isend
+            if (rc) break;
+            rc = ompi_datatype_commit(&yuck_ompi_dtype_from_opal);
+            if (rc) break;
+            rc = MCA_PML_CALL(isend
                 (send_from_addrs[jsends_posted], (int)send_counts[jsends_posted], yuck_ompi_dtype_from_opal, jrank_sendto,
                  MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD,
                  comm, &requests[jreq]));
-            ompi_datatype_destroy( &yuck_ompi_dtype_from_opal );
+            if (rc) break;
+            rc = ompi_datatype_destroy( &yuck_ompi_dtype_from_opal );
+            if (rc) break;
+
             jsends_posted++;
         }
         if (!ii_send_req && jrecvs_posted < ntypes_recv ) {
             rc = ompi_datatype_create_contiguous( 1, (ompi_datatype_t*)recv_types[jrecvs_posted], &yuck_ompi_dtype_from_opal );
-            ompi_datatype_commit(&yuck_ompi_dtype_from_opal);
-            MCA_PML_CALL(irecv
+            if (rc) break;
+            rc = ompi_datatype_commit(&yuck_ompi_dtype_from_opal);
+            if (rc) break;
+            rc = MCA_PML_CALL(irecv
                 (recv_to_addrs[jrecvs_posted], (int)recv_counts[jrecvs_posted], yuck_ompi_dtype_from_opal, jrank_recvfrom,
                  MCA_COLL_BASE_TAG_ALLTOALLV,
                  comm, &requests[jreq]));
-            ompi_datatype_destroy( &yuck_ompi_dtype_from_opal );
+            if (rc) break;
+            rc = ompi_datatype_destroy( &yuck_ompi_dtype_from_opal );
+            if (rc) break;
             jrecvs_posted++;
         }

-
         if (rc) { break; };
     }
+    if (rc) {
+        opal_output_verbose(1, mca_coll_han_component.han_output,
+            "Failed in alltoallv_sendrecv_w_direct_for_debugging: jloop=%d, rc=%d\n",
+            jloop,rc);
+    }
     return rc;
 }

@@ -250,10 +263,16 @@ static int alltoallv_sendrecv_w(
         buf_items[jbuf] = opal_free_list_get(&mca_coll_han_component.pack_buffers);
         if (buf_items[jbuf] == NULL) {
             nbufs = jbuf - 1;
-            printf("Uh-oh, not enough buffers: %d\n",nbufs);
+            opal_output_verbose(20, mca_coll_han_component.han_output,
+                "Uh-oh, not enough buffers: %d\n",nbufs);
             break;
         }
     }
+    if (nbufs < 2) {
+        opal_output_verbose(1, mca_coll_han_component.han_output,
+            "ERROR: Need at least 2 buffers from mca_coll_han_component.pack_buffers!");
+        return MPI_ERR_NO_MEM;
+    }

     size_t nreqs = nbufs;
     int jreq;
@@ -549,22 +568,26 @@ static int decide_to_use_smsc_alg(
     OBJ_CONSTRUCT( &convertor, opal_convertor_t );
     rc = opal_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor,
         &rdtype->super, count_for_convertor, rbuf, 0, &convertor);
+    if (rc) goto cleanup1;
     bufs_on_device = opal_convertor_on_device(&convertor);
     need_bufs = opal_convertor_need_buffers(&convertor);
-    rc |= opal_convertor_cleanup(&convertor);
-    rc |= opal_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor,
+    rc = opal_convertor_cleanup(&convertor);
+    if (rc) goto cleanup1;
+    rc = opal_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor,
         &sdtype->super, count_for_convertor, sbuf, 0, &convertor);
+    if (rc) goto cleanup1;
     bufs_on_device |= opal_convertor_on_device(&convertor);
     opal_convertor_get_packed_size(&convertor, &packed_size_bytes);
     for (int jrank=0; jrank<comm_size; jrank++) {
         avg_send_bytes += packed_size_bytes/count_for_convertor * ompi_count_array_get(scounts,jrank);
     }
     need_bufs |= opal_convertor_need_buffers(&convertor);

-    rc |= opal_convertor_cleanup(&convertor);
+    rc = opal_convertor_cleanup(&convertor);
+cleanup1:
     OBJ_DESTRUCT( &convertor );
-
     if (rc != OMPI_SUCCESS) { return rc;}
+
     avg_send_bytes = avg_send_bytes / comm_size;
     reduce_buf_input[0] = !!(bufs_on_device);
     reduce_buf_input[1] = avg_send_bytes;
@@ -613,12 +636,17 @@ int mca_coll_han_alltoallv_using_smsc(
     struct ompi_communicator_t *comm,
     mca_coll_base_module_t *module)
 {
+    int rc;
+    void **send_from_addrs = NULL;
+    void **recv_to_addrs = NULL;
+    size_t *send_counts = NULL;
+    size_t *recv_counts = NULL;
+    opal_datatype_t **send_types = NULL;
+    opal_datatype_t **recv_types = NULL;
+    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;

     OPAL_OUTPUT_VERBOSE((90, mca_coll_han_component.han_output,
                          "Entering mca_coll_han_alltoall_using_smsc\n"));
-    int rc;
-
-    mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;

     if (!mca_smsc || !mca_smsc_base_has_feature(MCA_SMSC_FEATURE_CAN_MAP)) {
         /* Assume all hosts take this path together :-\ */
@@ -654,13 +682,15 @@
                                               comm, han_module->previous_alltoallv_module);
     }

-    int w_rank = ompi_comm_rank(comm);
     int w_size = ompi_comm_size(comm);

     int use_smsc;
     rc = decide_to_use_smsc_alg(&use_smsc,
         sbuf, scounts, sdispls, sdtype, rbuf, rcounts, rdispls, rdtype, comm);
-    if (rc != 0) { return rc; }
+    if (rc != 0) {
+        opal_output_verbose(1, mca_coll_han_component.han_output,
+            "decide_to_use_smsc_alg failed during execution! rc=%d\n", rc);
+    }
     if (!use_smsc) {
         return han_module->previous_alltoallv(sbuf, scounts, sdispls, sdtype, rbuf, rcounts, rdispls, rdtype,
                                               comm, han_module->previous_alltoallv_module);
@@ -738,8 +768,8 @@
                     low_gather_out, sizeof(low_gather_in), MPI_BYTE, low_comm,
                     low_comm->c_coll->coll_allgather_module);
     if (rc != 0) {
-        OPAL_OUTPUT_VERBOSE((40, mca_coll_han_component.han_output,
-            "Allgather failed with %d\n",rc));
+        opal_output_verbose(1, mca_coll_han_component.han_output,
+            "During mca_coll_han_alltoallv_using_smsc: Allgather failed with rc=%d\n",rc);
         goto cleanup;
     }

@@ -806,12 +836,12 @@
         peers[jrank].recvtype = &peer_recv_types[jrank];
     }

-    void **send_from_addrs = malloc(sizeof(*send_from_addrs)*low_size);
-    void **recv_to_addrs = malloc(sizeof(*recv_to_addrs)*low_size);
-    size_t *send_counts = malloc(sizeof(*send_counts)*low_size);
-    size_t *recv_counts = malloc(sizeof(*recv_counts)*low_size);
-    opal_datatype_t **send_types = malloc(sizeof(*send_types)*low_size);
-    opal_datatype_t **recv_types = malloc(sizeof(*recv_types)*low_size);
+    send_from_addrs = malloc(sizeof(*send_from_addrs)*low_size);
+    recv_to_addrs = malloc(sizeof(*recv_to_addrs)*low_size);
+    send_counts = malloc(sizeof(*send_counts)*low_size);
+    recv_counts = malloc(sizeof(*recv_counts)*low_size);
+    send_types = malloc(sizeof(*send_types)*low_size);
+    recv_types = malloc(sizeof(*recv_types)*low_size);

     /****
      * Main exchange loop
@@ -828,6 +858,11 @@
             ptrdiff_t peer_sextent;

             rc = opal_datatype_type_extent( peers[jlow].sendtype, &peer_sextent);
+            if (rc != 0) {
+                opal_output_verbose(1, mca_coll_han_component.han_output,
+                    "opal_datatype_type_extent returned error code = %d during mca_coll_han_alltoallv_using_smsc!\n",rc);
+                goto cleanup;
+            }
             void *from_addr = (uint8_t*)peers[jlow].sbuf + peers[jlow].counts[jrank_sendto].sdispl*peer_sextent;

             send_from_addrs[jlow] = from_addr;
@@ -847,20 +882,28 @@ send_types[jlow] = peers[jlow].sendtype;
             send_from_addrs, send_counts, send_types, jrank_sendto, ntypes_send,
             recv_to_addrs, recv_counts, recv_types, jrank_recvfrom, ntypes_recv,
             comm);
-        if (rc != 0) goto cleanup;
-    }
+        if (rc != 0) {
+            opal_output_verbose(1, mca_coll_han_component.han_output,
+                "alltoallv_sendrecv_w returned error code = %d!\n",rc);
+            goto cleanup;
+        }

-    free(send_from_addrs);
-    free(recv_to_addrs);
-    free(send_counts);
-    free(recv_counts);
-    free(send_types);
-    free(recv_types);

+    }
     rc=0;
+
+cleanup:
     low_comm->c_coll->coll_barrier(low_comm, low_comm->c_coll->coll_barrier_module);

-cleanup:
+    if (send_from_addrs) {
+        free(send_from_addrs);
+        free(recv_to_addrs);
+        free(send_counts);
+        free(recv_counts);
+        free(send_types);
+        free(recv_types);
+    }
+
     for (int jlow=0; jlow<low_size; jlow++) {
         if (jlow != low_rank) {
             OBJ_DESTRUCT(&peer_send_types[jlow]);
