@@ -574,20 +574,18 @@ static pmix_server_trkr_t* new_tracker(char *id, pmix_proc_t *procs,
574
574
575
575
static void fence_timeout (int sd , short args , void * cbdata )
576
576
{
577
- pmix_server_caddy_t * cd = (pmix_server_caddy_t * )cbdata ;
577
+ pmix_server_trkr_t * trk = (pmix_server_trkr_t * )cbdata ;
578
578
579
579
pmix_output_verbose (2 , pmix_server_globals .fence_output ,
580
580
"ALERT: fence timeout fired" );
581
581
582
582
/* execute the provided callback function with the error */
583
- if (NULL != cd -> trk -> modexcbfunc ) {
584
- cd -> trk -> modexcbfunc (PMIX_ERR_TIMEOUT , NULL , 0 , cd -> trk , NULL , NULL );
583
+ if (NULL != trk -> modexcbfunc ) {
584
+ trk -> modexcbfunc (PMIX_ERR_TIMEOUT , NULL , 0 , trk , NULL , NULL );
585
585
return ; // the cbfunc will have cleaned up the tracker
586
586
}
587
- cd -> event_active = false;
588
- /* remove it from the list */
589
- pmix_list_remove_item (& cd -> trk -> local_cbs , & cd -> super );
590
- PMIX_RELEASE (cd );
587
+ trk -> event_active = false;
588
+ PMIX_RELEASE (trk );
591
589
}
592
590
593
591
static pmix_status_t _collect_data (pmix_server_trkr_t * trk ,
@@ -1039,11 +1037,9 @@ pmix_status_t pmix_server_fence(pmix_server_caddy_t *cd,
1039
1037
* notified when we are done */
1040
1038
pmix_list_append (& trk -> local_cbs , & cd -> super );
1041
1039
/* if a timeout was specified, set it */
1042
- if (0 < tv .tv_sec ) {
1043
- PMIX_RETAIN (trk );
1044
- cd -> trk = trk ;
1045
- PMIX_THREADSHIFT_DELAY (cd , fence_timeout , tv .tv_sec );
1046
- cd -> event_active = true;
1040
+ if (0 < tv .tv_sec && !trk -> event_active ) {
1041
+ PMIX_THREADSHIFT_DELAY (trk , fence_timeout , tv .tv_sec );
1042
+ trk -> event_active = true;
1047
1043
}
1048
1044
1049
1045
/* if all local contributions have been received,
@@ -1054,6 +1050,16 @@ pmix_status_t pmix_server_fence(pmix_server_caddy_t *cd,
1054
1050
pmix_list_get_size (& trk -> local_cbs ) == trk -> nlocal ) {
1055
1051
pmix_output_verbose (2 , pmix_server_globals .fence_output ,
1056
1052
"fence LOCALLY complete" );
1053
+ /* if a timeout was set, then we delete it here as we can
1054
+ * ONLY check for local completion. Otherwise, passing
1055
+ * the tracker object up to the host can result in
1056
+ * competing timeout events, and the host could return
1057
+ * the tracker AFTER we released it due to our internal
1058
+ * timeout firing */
1059
+ if (trk -> event_active ) {
1060
+ pmix_event_del (& trk -> ev );
1061
+ trk -> event_active = false;
1062
+ }
1057
1063
/* if this is a purely local fence (i.e., all participants are local),
1058
1064
* then it is done and we notify accordingly */
1059
1065
if (pmix_server_globals .fence_localonly_opt && trk -> local ) {
0 commit comments