4040#include "mtl_ofi_endpoint.h"
4141#include "mtl_ofi_compat.h"
4242
/*
 * Superseded one-argument form of the retry macro (these are the lines the
 * diff removes).  It evaluates FUNC and stores the result in a variable
 * named `ret`, which is not a macro parameter and therefore must already
 * exist at the point of use.  FUNC is evaluated again for as long as the
 * result equals -FI_EAGAIN; nothing is done between attempts.
 */
43- #define MTL_OFI_RETRY_UNTIL_DONE (FUNC ) \
44- do { \
45- do { \
46- ret = FUNC; \
47- if(OPAL_LIKELY(0 == ret)) {break;} \
48- } while(-FI_EAGAIN == ret); \
49- } while(0);
5043
5144BEGIN_C_DECLS
5245
@@ -134,6 +127,24 @@ ompi_mtl_ofi_progress(void)
134127 return count ;
135128}
136129
/**
 * Issue an OFI operation, retrying for as long as it reports a resource
 * overrun.
 *
 * When a call to an OFI operation fails with -FI_EAGAIN, the OFI MTL calls
 * ompi_mtl_ofi_progress() to progress any pending Completion Queue events
 * that may be preventing additional operations from being enqueued, and
 * then issues the operation again.  The result of the progress call is not
 * inspected: the retry happens unconditionally, and there is no upper bound
 * on the number of attempts.
 *
 * @param FUNC   Expression performing the OFI call.  It is re-evaluated on
 *               every attempt, so it must be safe to evaluate repeatedly.
 * @param RETURN Modifiable lvalue that receives the result of each
 *               evaluation of FUNC.  When the macro finishes it holds the
 *               first result that was not -FI_EAGAIN.
 *
 * The expansion is a single statement with no trailing semicolon, so an
 * invocation must be terminated with ';' like an ordinary function call and
 * can be used safely as the body of an unbraced if/else.
 */
#define MTL_OFI_RETRY_UNTIL_DONE(FUNC, RETURN)                  \
    do {                                                        \
        do {                                                    \
            (RETURN) = (FUNC);                                  \
            if (OPAL_LIKELY(0 == (RETURN))) { break; }          \
            if (OPAL_LIKELY(-FI_EAGAIN == (RETURN))) {          \
                ompi_mtl_ofi_progress();                        \
            }                                                   \
        } while (OPAL_LIKELY(-FI_EAGAIN == (RETURN)));          \
    } while (0)
137148
138149/* MTL interface functions */
139150int ompi_mtl_ofi_finalize (struct mca_mtl_base_module_t * mtl );
@@ -281,7 +292,7 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
281292 src_addr ,
282293 match_bits | ompi_mtl_ofi .sync_send_ack ,
283294 0 , /* Exact match, no ignore bits */
284- (void * ) & ack_req -> ctx ));
295+ (void * ) & ack_req -> ctx ), ret );
285296 if (OPAL_UNLIKELY (0 > ret )) {
286297 opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
287298 "%s:%d: fi_trecv failed: %s(%zd)" ,
@@ -302,15 +313,14 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
302313 length ,
303314 comm -> c_my_rank ,
304315 endpoint -> peer_fiaddr ,
305- match_bits ));
316+ match_bits ), ret );
306317 } else {
307318 MTL_OFI_RETRY_UNTIL_DONE (fi_tinject (ompi_mtl_ofi .ep ,
308319 start ,
309320 length ,
310321 endpoint -> peer_fiaddr ,
311- match_bits ));
322+ match_bits ), ret );
312323 }
313-
314324 if (OPAL_UNLIKELY (0 > ret )) {
315325 char * fi_api = ompi_mtl_ofi .fi_cq_data ? "fi_tinjectddata" : "fi_tinject" ;
316326 opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
@@ -334,15 +344,15 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
334344 comm -> c_my_rank ,
335345 endpoint -> peer_fiaddr ,
336346 match_bits ,
337- (void * ) & ofi_req -> ctx ));
347+ (void * ) & ofi_req -> ctx ), ret );
338348 } else {
339349 MTL_OFI_RETRY_UNTIL_DONE (fi_tsend (ompi_mtl_ofi .ep ,
340350 start ,
341351 length ,
342352 NULL ,
343353 endpoint -> peer_fiaddr ,
344354 match_bits ,
345- (void * ) & ofi_req -> ctx ));
355+ (void * ) & ofi_req -> ctx ), ret );
346356 }
347357 if (OPAL_UNLIKELY (0 > ret )) {
348358 char * fi_api = ompi_mtl_ofi .fi_cq_data ? "fi_tsendddata" : "fi_send" ;
@@ -517,7 +527,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
517527 tagged_msg .data = 0 ;
518528
519529 MTL_OFI_RETRY_UNTIL_DONE (fi_tsendmsg (ompi_mtl_ofi .ep ,
520- & tagged_msg , 0 ));
530+ & tagged_msg , 0 ), ret );
521531 if (OPAL_UNLIKELY (0 > ret )) {
522532 opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
523533 "%s:%d: fi_tsendmsg failed: %s(%zd)" ,
@@ -621,7 +631,7 @@ ompi_mtl_ofi_irecv(struct mca_mtl_base_module_t *mtl,
621631 remote_addr ,
622632 match_bits ,
623633 mask_bits ,
624- (void * )& ofi_req -> ctx ));
634+ (void * )& ofi_req -> ctx ), ret );
625635 if (OPAL_UNLIKELY (0 > ret )) {
626636 if (NULL != ofi_req -> buffer ) {
627637 free (ofi_req -> buffer );
@@ -734,7 +744,7 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
734744 msg .context = (void * )& ofi_req -> ctx ;
735745 msg .data = 0 ;
736746
737- MTL_OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags ));
747+ MTL_OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags ), ret );
738748 if (OPAL_UNLIKELY (0 > ret )) {
739749 opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
740750 "%s:%d: fi_trecvmsg failed: %s(%zd)" ,
@@ -833,7 +843,7 @@ ompi_mtl_ofi_iprobe(struct mca_mtl_base_module_t *mtl,
833843 ofi_req .completion_count = 1 ;
834844 ofi_req .match_state = 0 ;
835845
836- MTL_OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags ));
846+ MTL_OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags ), ret );
837847 if (- FI_ENOMSG == ret ) {
838848 /**
839849 * The search request completed but no matching message was found.
@@ -928,7 +938,7 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl,
928938 ofi_req -> match_state = 0 ;
929939 ofi_req -> mask_bits = mask_bits ;
930940
931- MTL_OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags ));
941+ MTL_OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags ), ret );
932942 if (- FI_ENOMSG == ret ) {
933943 /**
934944 * The search request completed but no matching message was found.
0 commit comments