@@ -357,6 +357,7 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int mpi_assert, ompi_win_t *
357357 return ret ;
358358}
359359
360+
360361int ompi_osc_rdma_start_atomic (ompi_group_t * group , int mpi_assert , ompi_win_t * win )
361362{
362363 ompi_osc_rdma_module_t * module = GET_MODULE (win );
@@ -590,6 +591,82 @@ int ompi_osc_rdma_test_atomic (ompi_win_t *win, int *flag)
590591 return OMPI_SUCCESS ;
591592}
592593
594+ /**
595+ * This function implements a different barrier mechanism for Fence,
596+ * when any of the selected btl does not support remote completion.
597+ * This barrier is based on imposing the MCA_BTL_ORDER_RDMA_ATOMCS
598+ * ordering requirement on seleted btls.
599+ */
600+ static
601+ int ompi_osc_rdma_fence_barrier_by_ordered_channel (ompi_win_t * win )
602+ {
603+ ompi_osc_rdma_module_t * module = GET_MODULE (win );
604+ ompi_osc_rdma_state_t * state = module -> state ;
605+ ompi_osc_rdma_sync_t * sync = & module -> all_sync ;
606+ ompi_osc_rdma_peer_t * * peers ;
607+ ompi_group_t * group ;
608+ int num_peers ;
609+ int ret ;
610+
611+ assert (module -> btl_order == MCA_BTL_IN_ORDER_RDMA_ATOMICS );
612+ OPAL_THREAD_LOCK (& module -> lock );
613+
614+ if (ompi_comm_size (module -> comm ) == 1 ) {
615+ OPAL_THREAD_UNLOCK (& (module -> lock ));
616+ return OMPI_SUCCESS ;
617+ }
618+
619+ ret = ompi_comm_group (module -> comm , & group );
620+ if (OMPI_SUCCESS != ret ) {
621+ OPAL_THREAD_UNLOCK (& (module -> lock ));
622+ return ret ;
623+ }
624+
625+ num_peers = sync -> num_peers ;
626+ assert (ompi_group_size (group ) == num_peers );
627+ peers = ompi_osc_rdma_get_peers (module , group );
628+ if (NULL == peers ) {
629+ OPAL_THREAD_UNLOCK (& (module -> lock ));
630+ return OMPI_ERR_OUT_OF_RESOURCE ;
631+ }
632+
633+ module -> state -> num_fenced_peers = 0 ;
634+ OPAL_THREAD_UNLOCK (& (module -> lock ));
635+ ret = module -> comm -> c_coll -> coll_barrier (module -> comm , module -> comm -> c_coll -> coll_barrier_module );
636+ if (ret ) {
637+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_ERROR , "barrier failed!" );
638+ return ret ;
639+ }
640+
641+ /* for each process in the group increment their number of fenced peers */
642+ for (int i = 0 ; i < num_peers ; ++ i ) {
643+ ompi_osc_rdma_peer_t * peer = peers [i ];
644+ intptr_t target = (intptr_t ) peer -> state + offsetof (ompi_osc_rdma_state_t , num_fenced_peers );
645+
646+ /* the usage of peer local state requires selected btls to support remote completion,
647+ * if that is the case, this function will not have been called
648+ */
649+ assert (!ompi_osc_rdma_peer_local_state (peer ));
650+ ret = ompi_osc_rdma_lock_btl_op (module , peer , target , MCA_BTL_ATOMIC_ADD , 1 , true);
651+ if (OMPI_SUCCESS != ret ) {
652+ return ret ;
653+ }
654+ }
655+
656+ ompi_osc_rdma_release_peers (peers , num_peers );
657+ ompi_group_free (& group );
658+
659+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "increased fenced_peer counter of all peers" );
660+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "waiting for all peers to increase my counter" );
661+ while (num_peers != state -> num_fenced_peers ) {
662+ ompi_osc_rdma_progress (module );
663+ opal_atomic_mb ();
664+ }
665+
666+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "received fence message from all peers" );
667+ return OMPI_SUCCESS ;
668+ }
669+
593670int ompi_osc_rdma_fence_atomic (int mpi_assert , ompi_win_t * win )
594671{
595672 ompi_osc_rdma_module_t * module = GET_MODULE (win );
@@ -627,7 +704,18 @@ int ompi_osc_rdma_fence_atomic (int mpi_assert, ompi_win_t *win)
627704 ompi_osc_rdma_sync_rdma_complete (& module -> all_sync );
628705
629706 /* ensure all writes to my memory are complete (both local stores, and RMA operations) */
630- ret = module -> comm -> c_coll -> coll_barrier (module -> comm , module -> comm -> c_coll -> coll_barrier_module );
707+ if (module -> btl_support_remote_completion ) {
708+ /* if all selected btls support remote completion, then all RMA operations have finished
709+ * on remote side. A barrier is enough to complete the fence.
710+ */
711+ ret = module -> comm -> c_coll -> coll_barrier (module -> comm , module -> comm -> c_coll -> coll_barrier_module );
712+ } else {
713+ /*
714+ * if any selected btl does not support remote completion, we will have to send a completion
715+ * message (through the same endpoint of data transfer) to every peer, then wait for a message from every peer.
716+ */
717+ ret = ompi_osc_rdma_fence_barrier_by_ordered_channel (win );
718+ }
631719
632720 if (mpi_assert & MPI_MODE_NOSUCCEED ) {
633721 /* as specified in MPI-3 p 438 3-5 the fence can end an epoch. it isn't explicitly
0 commit comments