Skip to content

Commit 97da19f

Browse files
author
Ralph Castain
authored
Merge pull request #5498 from karasevb/pmix_fence_status
pmix: added check for pmix fence status
2 parents ae03014 + 5768336 commit 97da19f

File tree

12 files changed

+168
-43
lines changed

12 files changed

+168
-43
lines changed

ompi/dpm/dpm.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -589,7 +589,11 @@ int ompi_dpm_disconnect(ompi_communicator_t *comm)
589589

590590
/* ensure we tell the host RM to disconnect us - this
591591
* is a blocking operation so just use a fence */
592-
ret = opal_pmix.fence(&coll, false);
592+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(&coll, false))) {
593+
OMPI_ERROR_LOG(ret);
594+
OPAL_LIST_DESTRUCT(&coll);
595+
return ret;
596+
}
593597
OPAL_LIST_DESTRUCT(&coll);
594598

595599
return ret;

ompi/mca/bml/r2/bml_r2_ft.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,10 @@ int mca_bml_r2_ft_event(int state)
155155
* Barrier to make sure all processes have been successfully restarted before
156156
* we try to remove some restart only files.
157157
*/
158-
opal_pmix.fence(NULL, 0);
158+
if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
159+
opal_output(0, "bml:r2: ft_event(Restart): Failed to fence complete\n");
160+
return ret;
161+
}
159162

160163
/*
161164
* Re-open the BTL framework to get the full list of components.
@@ -224,7 +227,10 @@ int mca_bml_r2_ft_event(int state)
224227
* Barrier to make sure all processes have been successfully restarted before
225228
* we try to remove some restart only files.
226229
*/
227-
opal_pmix.fence(NULL, 0);
230+
if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
231+
opal_output(0, "bml:r2: ft_event(Restart): Failed to fence complete\n");
232+
return ret;
233+
}
228234

229235
/*
230236
* Re-open the BTL framework to get the full list of components.

ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3028,7 +3028,10 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
30283028

30293029
if( opal_cr_timing_barrier_enabled ) {
30303030
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR0);
3031-
opal_pmix.fence(NULL, 0);
3031+
if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
3032+
exit_status = ret;
3033+
goto DONE;
3034+
}
30323035
}
30333036
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP0);
30343037

@@ -3096,7 +3099,10 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
30963099

30973100
if( opal_cr_timing_barrier_enabled ) {
30983101
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR1);
3099-
opal_pmix.fence(NULL, 0);
3102+
if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
3103+
exit_status = ret;
3104+
goto DONE;
3105+
}
31003106
}
31013107
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE2);
31023108
}
@@ -6207,14 +6213,16 @@ static void clear_timers(void) {
62076213
static void display_all_timers(int state) {
62086214
bool report_ready = false;
62096215
double barrier_start, barrier_stop;
6210-
int i;
6216+
int i, ret;
62116217

62126218
if( 0 != OMPI_PROC_MY_NAME->vpid ) {
62136219
if( 2 > timing_enabled ) {
62146220
return;
62156221
}
62166222
else if( 2 == timing_enabled ) {
6217-
opal_pmix.fence(NULL, 0);
6223+
if( OPAL_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
6224+
OPAL_ERROR_LOG(ret);
6225+
}
62186226
return;
62196227
}
62206228
}
@@ -6235,7 +6243,9 @@ static void display_all_timers(int state) {
62356243

62366244
if( timing_enabled >= 2) {
62376245
barrier_start = get_time();
6238-
opal_pmix.fence(NULL, 0);
6246+
if( OPAL_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
6247+
OPAL_ERROR_LOG(ret);
6248+
}
62396249
barrier_stop = get_time();
62406250
opal_output(0,
62416251
"crcp:bkmrk: timing(%20s): %20s = %10.2f s\n",

ompi/mca/pml/bfo/pml_bfo.c

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -666,7 +666,10 @@ int mca_pml_bfo_ft_event( int state )
666666
if(OPAL_CRS_CHECKPOINT == state) {
667667
if( opal_cr_timing_barrier_enabled ) {
668668
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
669-
opal_pmix.fence(NULL, 0);
669+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
670+
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
671+
return ret;
672+
}
670673
}
671674

672675
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
@@ -677,7 +680,10 @@ int mca_pml_bfo_ft_event( int state )
677680
if( !first_continue_pass ) {
678681
if( opal_cr_timing_barrier_enabled ) {
679682
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
680-
opal_pmix.fence(NULL, 0);
683+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
684+
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
685+
return ret;
686+
}
681687
}
682688
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
683689
}
@@ -777,7 +783,10 @@ int mca_pml_bfo_ft_event( int state )
777783
if( !first_continue_pass ) {
778784
if( opal_cr_timing_barrier_enabled ) {
779785
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
780-
opal_pmix.fence(NULL, 0);
786+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
787+
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
788+
return ret;
789+
}
781790
}
782791
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
783792
}
@@ -787,7 +796,10 @@ int mca_pml_bfo_ft_event( int state )
787796
* Exchange the modex information once again.
788797
* BTLs will have republished their modex information.
789798
*/
790-
opal_pmix.fence(NULL, 0);
799+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
800+
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
801+
return ret;
802+
}
791803

792804
/*
793805
* Startup the PML stack now that the modex is running again
@@ -799,7 +811,10 @@ int mca_pml_bfo_ft_event( int state )
799811
}
800812

801813
/* Is this barrier necessary ? JJH */
802-
opal_pmix.fence(NULL, 0);
814+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
815+
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
816+
return ret;
817+
}
803818

804819
if( NULL != procs ) {
805820
for(p = 0; p < (int)num_procs; ++p) {
@@ -812,7 +827,10 @@ int mca_pml_bfo_ft_event( int state )
812827
if( !first_continue_pass ) {
813828
if( opal_cr_timing_barrier_enabled ) {
814829
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
815-
opal_pmix.fence(NULL, 0);
830+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
831+
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
832+
return ret;
833+
}
816834
}
817835
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
818836
}
@@ -825,7 +843,10 @@ int mca_pml_bfo_ft_event( int state )
825843
* Exchange the modex information once again.
826844
* BTLs will have republished their modex information.
827845
*/
828-
opal_pmix.fence(NULL, 0);
846+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
847+
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
848+
return ret;
849+
}
829850

830851
/*
831852
* Startup the PML stack now that the modex is running again
@@ -837,7 +858,10 @@ int mca_pml_bfo_ft_event( int state )
837858
}
838859

839860
/* Is this barrier necessary ? JJH */
840-
opal_pmix.fence(NULL, 0);
861+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
862+
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
863+
return ret;
864+
}
841865

842866
if( NULL != procs ) {
843867
for(p = 0; p < (int)num_procs; ++p) {

ompi/mca/pml/ob1/pml_ob1.c

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -807,7 +807,10 @@ int mca_pml_ob1_ft_event( int state )
807807
if(OPAL_CRS_CHECKPOINT == state) {
808808
if( opal_cr_timing_barrier_enabled ) {
809809
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
810-
opal_pmix.fence(NULL, 0);
810+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
811+
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
812+
return ret;
813+
}
811814
}
812815

813816
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
@@ -818,7 +821,10 @@ int mca_pml_ob1_ft_event( int state )
818821
if( !first_continue_pass ) {
819822
if( opal_cr_timing_barrier_enabled ) {
820823
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
821-
opal_pmix.fence(NULL, 0);
824+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
825+
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
826+
return ret;
827+
}
822828
}
823829
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
824830
}
@@ -918,13 +924,19 @@ int mca_pml_ob1_ft_event( int state )
918924
if( !first_continue_pass ) {
919925
if( opal_cr_timing_barrier_enabled ) {
920926
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
921-
opal_pmix.fence(NULL, 0);
927+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
928+
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
929+
return ret;
930+
}
922931
}
923932
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
924933
}
925934

926935
if (opal_cr_continue_like_restart && !first_continue_pass) {
927-
opal_pmix.fence(NULL, 0);
936+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
937+
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
938+
return ret;
939+
}
928940

929941
/*
930942
* Startup the PML stack now that the modex is running again
@@ -936,7 +948,10 @@ int mca_pml_ob1_ft_event( int state )
936948
}
937949

938950
/* Is this barrier necessary ? JJH */
939-
opal_pmix.fence(NULL, 0);
951+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
952+
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
953+
return ret;
954+
}
940955

941956
if( NULL != procs ) {
942957
for(p = 0; p < (int)num_procs; ++p) {
@@ -949,7 +964,10 @@ int mca_pml_ob1_ft_event( int state )
949964
if( !first_continue_pass ) {
950965
if( opal_cr_timing_barrier_enabled ) {
951966
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
952-
opal_pmix.fence(NULL, 0);
967+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
968+
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
969+
return ret;
970+
}
953971
}
954972
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
955973
}
@@ -962,7 +980,10 @@ int mca_pml_ob1_ft_event( int state )
962980
* Exchange the modex information once again.
963981
* BTLs will have republished their modex information.
964982
*/
965-
opal_pmix.fence(NULL, 0);
983+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
984+
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
985+
return ret;
986+
}
966987

967988
/*
968989
* Startup the PML stack now that the modex is running again
@@ -974,7 +995,10 @@ int mca_pml_ob1_ft_event( int state )
974995
}
975996

976997
/* Is this barrier necessary ? JJH */
977-
opal_pmix.fence(NULL, 0);
998+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
999+
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
1000+
return ret;
1001+
}
9781002

9791003
if( NULL != procs ) {
9801004
for(p = 0; p < (int)num_procs; ++p) {

ompi/mca/pml/yalla/pml_yalla.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ int mca_pml_yalla_add_procs(struct ompi_proc_t **procs, size_t nprocs)
265265
int mca_pml_yalla_del_procs(struct ompi_proc_t **procs, size_t nprocs)
266266
{
267267
size_t i;
268+
int ret;
268269

269270
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
270271
PML_YALLA_VERBOSE(3, "%s", "using bulk powerdown");
@@ -276,7 +277,9 @@ int mca_pml_yalla_del_procs(struct ompi_proc_t **procs, size_t nprocs)
276277
PML_YALLA_VERBOSE(2, "disconnected from rank %s", OPAL_NAME_PRINT(procs[i]->super.proc_name));
277278
procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML] = NULL;
278279
}
279-
opal_pmix.fence(NULL, 0);
280+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
281+
return ret;
282+
}
280283
return OMPI_SUCCESS;
281284
}
282285

ompi/runtime/ompi_mpi_finalize.c

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,13 @@ int ompi_mpi_finalize(void)
257257
* communications/actions to complete. See
258258
* https://github.com/open-mpi/ompi/issues/1576 for the
259259
* original bug report. */
260-
opal_pmix.fence_nb(NULL, 0, fence_cbfunc, (void*)&active);
260+
if (OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL, 0, fence_cbfunc,
261+
(void*)&active))) {
262+
OMPI_ERROR_LOG(ret);
263+
/* Reset the active flag to false, to avoid waiting for
264+
* completion when the fence has failed. */
265+
active = false;
266+
}
261267
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
262268
} else {
263269
/* However, we cannot guarantee that the provided PMIx has
@@ -268,7 +274,9 @@ int ompi_mpi_finalize(void)
268274
ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
269275
comm->c_coll->coll_barrier(comm, comm->c_coll->coll_barrier_module);
270276

271-
opal_pmix.fence(NULL, 0);
277+
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
278+
OMPI_ERROR_LOG(ret);
279+
}
272280
}
273281
}
274282

0 commit comments

Comments
 (0)