Merge pull request #1177 from bosilca/topic/large_msg

bosilca · web-flow · commit dc538e9675c5 · 2017-09-05T13:30:19.000-04:00
Topic/large msg
diff --git a/ompi/mca/pml/ob1/pml_ob1.h b/ompi/mca/pml/ob1/pml_ob1.h
@@ -55,8 +55,8 @@ struct mca_pml_ob1_t {
     int free_list_num;      /* initial size of free list */
     int free_list_max;      /* maximum size of free list */
     int free_list_inc;      /* number of elements to grow free list */
-    size_t send_pipeline_depth;
-    size_t recv_pipeline_depth;
+    int32_t send_pipeline_depth;
+    int32_t recv_pipeline_depth;
     size_t rdma_retries_limit;
     int max_rdma_per_request;
     int max_send_per_range;
diff --git a/ompi/mca/pml/ob1/pml_ob1_component.c b/ompi/mca/pml/ob1/pml_ob1_component.c
@@ -184,8 +184,8 @@ static int mca_pml_ob1_component_register(void)
     mca_pml_ob1_param_register_int("free_list_max", -1, &mca_pml_ob1.free_list_max);
     mca_pml_ob1_param_register_int("free_list_inc", 64, &mca_pml_ob1.free_list_inc);
     mca_pml_ob1_param_register_int("priority", 20, &mca_pml_ob1.priority);
-    mca_pml_ob1_param_register_sizet("send_pipeline_depth", 3, &mca_pml_ob1.send_pipeline_depth);
-    mca_pml_ob1_param_register_sizet("recv_pipeline_depth", 4, &mca_pml_ob1.recv_pipeline_depth);
+    mca_pml_ob1_param_register_int("send_pipeline_depth", 3, &mca_pml_ob1.send_pipeline_depth);
+    mca_pml_ob1_param_register_int("recv_pipeline_depth", 4, &mca_pml_ob1.recv_pipeline_depth);
 
     /* NTH: we can get into a live-lock situation in the RDMA failure path so disable
        RDMA retries for now. Falling back to send may suck but it is better than
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
@@ -190,15 +190,15 @@ static void mca_pml_ob1_put_completion (mca_pml_ob1_rdma_frag_t *frag, int64_t r
     mca_pml_ob1_recv_request_t* recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
     mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
 
-    OPAL_THREAD_SUB_SIZE_T(&recvreq->req_pipeline_depth, 1);
+    OPAL_THREAD_ADD32(&recvreq->req_pipeline_depth, -1);
 
     MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
 
     if (OPAL_LIKELY(0 < rdma_size)) {
         assert ((uint64_t) rdma_size == frag->rdma_length);
 
         /* check completion status */
-        OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, (size_t) rdma_size);
+        OPAL_THREAD_ADD_SIZE_T(&recvreq->req_bytes_received, rdma_size);
         if (recv_request_pml_complete_check(recvreq) == false &&
             recvreq->req_rdma_offset < recvreq->req_send_offset) {
             /* schedule additional rdma operations */
@@ -951,7 +951,7 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
     }
 
     while(bytes_remaining > 0 &&
-           recvreq->req_pipeline_depth < mca_pml_ob1.recv_pipeline_depth) {
+          recvreq->req_pipeline_depth < mca_pml_ob1.recv_pipeline_depth) {
         mca_pml_ob1_rdma_frag_t *frag = NULL;
         mca_btl_base_module_t *btl;
         int rc, rdma_idx;
@@ -983,14 +983,10 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
         } while(!size);
         btl = bml_btl->btl;
 
-        /* NTH: This conditional used to check if there was a registration in
-         * recvreq->req_rdma[rdma_idx].btl_reg. If once existed it was due to
-         * the btl not needed registration (equivalent to btl->btl_register_mem
-         * != NULL. This new check is equivalent. Note: I feel this protocol
-         * needs work to better improve resource usage when running with a
-         * leave pinned protocol. */
-        if (btl->btl_register_mem && (btl->btl_rdma_pipeline_frag_size != 0) &&
-            (size > btl->btl_rdma_pipeline_frag_size)) {
+        /* NTH: This protocol needs work to make better use of resources when
+         * running with a leave-pinned protocol. */
+        /* GB: We should always abide by the BTL RDMA pipeline fragment limit (if one is set) */
+        if ((btl->btl_rdma_pipeline_frag_size != 0) && (size > btl->btl_rdma_pipeline_frag_size)) {
             size = btl->btl_rdma_pipeline_frag_size;
         }
 
@@ -1028,7 +1024,7 @@ int mca_pml_ob1_recv_request_schedule_once( mca_pml_ob1_recv_request_t* recvreq,
         if (OPAL_LIKELY(OMPI_SUCCESS == rc)) {
             /* update request state */
             recvreq->req_rdma_offset += size;
-            OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth, 1);
+            OPAL_THREAD_ADD32(&recvreq->req_pipeline_depth, 1);
             recvreq->req_rdma[rdma_idx].length -= size;
             bytes_remaining -= size;
         } else {
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h
@@ -41,12 +41,12 @@ BEGIN_C_DECLS
 struct mca_pml_ob1_recv_request_t {
     mca_pml_base_recv_request_t req_recv;
     opal_ptr_t remote_req_send;
-    int32_t req_lock;
-    size_t  req_pipeline_depth;
-    size_t  req_bytes_received;  /**< amount of data transferred into the user buffer */
-    size_t  req_bytes_expected; /**< local size of the data as suggested by the user */
-    size_t  req_rdma_offset;
-    size_t  req_send_offset;
+    int32_t  req_lock;
+    int32_t  req_pipeline_depth;
+    size_t   req_bytes_received;  /**< amount of data transferred into the user buffer */
+    size_t   req_bytes_expected; /**< local size of the data as suggested by the user */
+    size_t   req_rdma_offset;
+    size_t   req_send_offset;
     uint32_t req_rdma_cnt;
     uint32_t req_rdma_idx;
     bool req_pending;
diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c
@@ -313,7 +313,7 @@ mca_pml_ob1_frag_completion( mca_btl_base_module_t* btl,
                                                                    des->des_segment_count,
                                                                    sizeof(mca_pml_ob1_frag_hdr_t));
 
-    OPAL_THREAD_SUB_SIZE_T(&sendreq->req_pipeline_depth, 1);
+    OPAL_THREAD_ADD32(&sendreq->req_pipeline_depth, -1);
     OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
 
     if(send_request_pml_complete_check(sendreq) == false) {
@@ -913,13 +913,13 @@ mca_pml_ob1_send_request_schedule_once(mca_pml_ob1_send_request_t* sendreq)
 
     /* check pipeline_depth here before attempting to get any locks */
     if(true == sendreq->req_throttle_sends &&
-            sendreq->req_pipeline_depth >= mca_pml_ob1.send_pipeline_depth)
+       sendreq->req_pipeline_depth >= mca_pml_ob1.send_pipeline_depth)
         return OMPI_SUCCESS;
 
     range = get_send_range(sendreq);
 
     while(range && (false == sendreq->req_throttle_sends ||
-            sendreq->req_pipeline_depth < mca_pml_ob1.send_pipeline_depth)) {
+          sendreq->req_pipeline_depth < mca_pml_ob1.send_pipeline_depth)) {
         mca_pml_ob1_frag_hdr_t* hdr;
         mca_btl_base_descriptor_t* des;
         int rc, btl_idx;
@@ -1044,7 +1044,7 @@ mca_pml_ob1_send_request_schedule_once(mca_pml_ob1_send_request_t* sendreq)
             range->range_btls[btl_idx].length -= size;
             range->range_send_length -= size;
             range->range_send_offset += size;
-            OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1);
+            OPAL_THREAD_ADD32(&sendreq->req_pipeline_depth, 1);
             if(range->range_send_length == 0) {
                 range = get_next_send_range(sendreq, range);
                 prev_bytes_remaining = 0;
@@ -1060,7 +1060,7 @@ mca_pml_ob1_send_request_schedule_once(mca_pml_ob1_send_request_t* sendreq)
             range->range_btls[btl_idx].length -= size;
             range->range_send_length -= size;
             range->range_send_offset += size;
-            OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1);
+            OPAL_THREAD_ADD32(&sendreq->req_pipeline_depth, 1);
             if(range->range_send_length == 0) {
                 range = get_next_send_range(sendreq, range);
                 prev_bytes_remaining = 0;
diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h
@@ -45,11 +45,11 @@ struct mca_pml_ob1_send_request_t {
     mca_pml_base_send_request_t req_send;
     mca_bml_base_endpoint_t* req_endpoint;
     opal_ptr_t req_recv;
-    int32_t req_state;
-    int32_t req_lock;
-    bool req_throttle_sends;
-    size_t req_pipeline_depth;
-    size_t req_bytes_delivered;
+    int32_t  req_state;
+    int32_t  req_lock;
+    bool     req_throttle_sends;
+    int32_t  req_pipeline_depth;
+    size_t   req_bytes_delivered;
     uint32_t req_rdma_cnt;
     mca_pml_ob1_send_pending_t req_pending;
     opal_mutex_t req_send_range_lock;
diff --git a/opal/include/opal/sys/atomic.h b/opal/include/opal/sys/atomic.h
@@ -447,7 +447,7 @@ int64_t opal_atomic_sub_64(volatile int64_t *addr, int64_t delta);
  */
 #if defined(DOXYGEN) || OPAL_ENABLE_DEBUG
 static inline size_t
-opal_atomic_add_size_t(volatile size_t *addr, int delta)
+opal_atomic_add_size_t(volatile size_t *addr, size_t delta)
 {
 #if SIZEOF_SIZE_T == 4
     return (size_t) opal_atomic_add_32((int32_t*) addr, delta);
@@ -458,7 +458,7 @@ opal_atomic_add_size_t(volatile size_t *addr, int delta)
 #endif
 }
 static inline size_t
-opal_atomic_sub_size_t(volatile size_t *addr, int delta)
+opal_atomic_sub_size_t(volatile size_t *addr, size_t delta)
 {
 #if SIZEOF_SIZE_T == 4
     return (size_t) opal_atomic_sub_32((int32_t*) addr, delta);
diff --git a/opal/mca/btl/tcp/btl_tcp_component.c b/opal/mca/btl/tcp/btl_tcp_component.c
@@ -318,7 +318,12 @@ static int mca_btl_tcp_component_register(void)
     mca_btl_tcp_module.super.btl_rndv_eager_limit = 64*1024;
     mca_btl_tcp_module.super.btl_max_send_size = 128*1024;
     mca_btl_tcp_module.super.btl_rdma_pipeline_send_length = 128*1024;
-    mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = INT_MAX;
+    /* Some OSes have hard coded limits on how many bytes can be manipulated
+     * by each writev operation.  Force a reasonable limit, to prevent overflowing
+     * a signed 32-bit integer (limit comes from BSD and OS X). We remove 1k to
+     * make some room for our internal headers.
+     */
+    mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = ((1UL<<31) - 1024);
     mca_btl_tcp_module.super.btl_min_rdma_pipeline_size = 0;
     mca_btl_tcp_module.super.btl_flags = MCA_BTL_FLAGS_PUT |
                                        MCA_BTL_FLAGS_SEND_INPLACE |
@@ -335,7 +340,11 @@ static int mca_btl_tcp_component_register(void)
 
     mca_btl_base_param_register(&mca_btl_tcp_component.super.btl_version,
                                 &mca_btl_tcp_module.super);
-
+    if (mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size > ((1UL<<31) - 1024) ) {
+        /* Assume a hard limit. A configure-time test would be a better solution, but until
+         * then, falling back to pipelined RDMA for extremely large messages is good enough. */
+        mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = ((1UL<<31) - 1024);
+    }
     mca_btl_tcp_param_register_int ("disable_family", NULL, 0, OPAL_INFO_LVL_2,  &mca_btl_tcp_component.tcp_disable_family);
 
     return mca_btl_tcp_component_verify();
diff --git a/opal/mca/btl/tcp/btl_tcp_frag.c b/opal/mca/btl/tcp/btl_tcp_frag.c
@@ -112,11 +112,11 @@ size_t mca_btl_tcp_frag_dump(mca_btl_tcp_frag_t* frag, char* msg, char* buf, siz
 
 bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd)
 {
-    ssize_t cnt = -1;
+    ssize_t cnt;
     size_t i, num_vecs;
 
     /* non-blocking write, but continue if interrupted */
-    while(cnt < 0) {
+    do {
         cnt = writev(sd, frag->iov_ptr, frag->iov_cnt);
         if(cnt < 0) {
             switch(opal_socket_errno) {
@@ -140,11 +140,11 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd)
                 return false;
             }
         }
-    }
+    } while(cnt < 0);
 
     /* if the write didn't complete - update the iovec state */
     num_vecs = frag->iov_cnt;
-    for(i=0; i<num_vecs; i++) {
+    for( i = 0; i < num_vecs; i++) {
         if(cnt >= (ssize_t)frag->iov_ptr->iov_len) {
             cnt -= frag->iov_ptr->iov_len;
             frag->iov_ptr++;
@@ -166,8 +166,8 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd)
 bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
 {
     mca_btl_base_endpoint_t* btl_endpoint = frag->endpoint;
-    int i, num_vecs, dont_copy_data = 0;
     ssize_t cnt;
+    int32_t i, num_vecs, dont_copy_data = 0;
 
  repeat:
     num_vecs = frag->iov_cnt;
@@ -208,8 +208,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
 #endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
 
     /* non-blocking read, but continue if interrupted */
-    cnt = -1;
-    while( cnt < 0 ) {
+    do {
         cnt = readv(sd, frag->iov_ptr, num_vecs);
         if( 0 < cnt ) goto advance_iov_position;
         if( cnt == 0 ) {
@@ -247,7 +246,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
             mca_btl_tcp_endpoint_close(btl_endpoint);
             return false;
         }
-    }
+    } while( cnt < 0 );
 
  advance_iov_position:
     /* if the read didn't complete - update the iovec state */
diff --git a/opal/mca/btl/tcp/btl_tcp_frag.h b/opal/mca/btl/tcp/btl_tcp_frag.h
@@ -53,8 +53,8 @@ struct mca_btl_tcp_frag_t {
     mca_btl_tcp_hdr_t hdr;
     struct iovec iov[MCA_BTL_TCP_FRAG_IOVEC_NUMBER + 1];
     struct iovec *iov_ptr;
-    size_t iov_cnt;
-    size_t iov_idx;
+    uint32_t iov_cnt;
+    uint32_t iov_idx;
     size_t size;
     uint16_t next_step;
     int rc;

Original file line number	Diff line number	Diff line change
`@@ -447,7 +447,7 @@ int64_t opal_atomic_sub_64(volatile int64_t *addr, int64_t delta);`
`447`	`447`	`*/`
`448`	`448`	`#if defined(DOXYGEN) \|\| OPAL_ENABLE_DEBUG`
`449`	`449`	`static inline size_t`
`450`		`-opal_atomic_add_size_t(volatile size_t *addr, int delta)`
	`450`	`+opal_atomic_add_size_t(volatile size_t *addr, size_t delta)`
`451`	`451`	`{`
`452`	`452`	`#if SIZEOF_SIZE_T == 4`
`453`	`453`	`return (size_t) opal_atomic_add_32((int32_t*) addr, delta);`
`@@ -458,7 +458,7 @@ opal_atomic_add_size_t(volatile size_t *addr, int delta)`
`458`	`458`	`#endif`
`459`	`459`	`}`
`460`	`460`	`static inline size_t`
`461`		`-opal_atomic_sub_size_t(volatile size_t *addr, int delta)`
	`461`	`+opal_atomic_sub_size_t(volatile size_t *addr, size_t delta)`
`462`	`462`	`{`
`463`	`463`	`#if SIZEOF_SIZE_T == 4`
`464`	`464`	`return (size_t) opal_atomic_sub_32((int32_t*) addr, delta);`