From 8cb987eccd7d1e40f41590396c42e1a5ea8f219b Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Fri, 22 Jan 2016 17:17:46 -0700 Subject: [PATCH] osc/rdma: fix hang when performing large unaligned gets This commit adds code to handle large unaligned gets. There are two possible code paths for these transactions: 1) The remote region and local region have the same alignment. In this case the get will be broken down into at most three get transactions: 1 transaction to get the unaligned start of the region (buffered), 1 transaction to get the aligned portion of the region, and 1 transaction to get the end of the region. 2) The remote and local regions do not have the same alignment. This should be an uncommon case and is not optimized. In this case a buffer is allocated and registered locally to hold the aligned data from the remote region. There may be cases where this fails (low memory, can't register memory). Those conditions are unlikely and will be handled later. (cherry picked from open-mpi/ompi@45da3114737d876d5c8bab25f61ad45f242d6eba) Signed-off-by: Nathan Hjelm --- ompi/mca/osc/rdma/osc_rdma_comm.c | 100 +++++++++++++++++++++++---- ompi/mca/osc/rdma/osc_rdma_frag.h | 2 +- ompi/mca/osc/rdma/osc_rdma_lock.h | 6 +- ompi/mca/osc/rdma/osc_rdma_request.c | 5 +- ompi/mca/osc/rdma/osc_rdma_request.h | 10 +-- 5 files changed, 102 insertions(+), 21 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 1a2b9dde77..5d9335613c 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -15,6 +15,11 @@ #include "osc_rdma_dynamic.h" #include "ompi/mca/osc/base/osc_base_obj_convert.h" +#include "opal/align.h" + +static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address, + mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size, + ompi_osc_rdma_request_t *request); static void ompi_osc_get_data_complete (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, mca_btl_base_registration_handle_t *local_handle, @@ -136,7 +141,7 @@ static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *loc ompi_osc_rdma_peer_t *peer, uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle, int remote_count, ompi_datatype_t *remote_datatype, ompi_osc_rdma_request_t *request, const size_t max_rdma_len, - const ompi_osc_rdma_fn_t rdma_fn,const bool alloc_reqs) + const ompi_osc_rdma_fn_t rdma_fn, const bool alloc_reqs) { ompi_osc_rdma_module_t *module = sync->module; struct iovec local_iovec[OMPI_OSC_RDMA_DECODE_MAX], remote_iovec[OMPI_OSC_RDMA_DECODE_MAX]; @@ -575,11 +580,13 @@ static void ompi_osc_rdma_get_complete (struct mca_btl_base_module_t *btl, struc assert (OPAL_SUCCESS == status); - if (NULL != frag) { + if (request->buffer || NULL != frag) { if (OPAL_LIKELY(OMPI_SUCCESS == status)) { memcpy (origin_addr, (void *) source, request->len); } + } + if (NULL != frag) { ompi_osc_rdma_frag_complete (frag); } else { ompi_osc_rdma_deregister (sync->module, local_handle); @@ -621,6 +628,27 @@ int ompi_osc_rdma_peer_aggregate_flush (ompi_osc_rdma_peer_t *peer) } +static int ompi_osc_rdma_get_partial (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address, + mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size, + ompi_osc_rdma_request_t *request) { + ompi_osc_rdma_module_t *module = sync->module; + ompi_osc_rdma_request_t *subreq; + int ret; + + OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, subreq); + subreq->internal = true; + subreq->type = OMPI_OSC_RDMA_TYPE_RDMA; + subreq->parent_request = request; + (void) OPAL_THREAD_ADD32 (&request->outstanding_requests, 1); + + ret = ompi_osc_rdma_get_contig (sync, peer, source_address, source_handle, target_buffer, size, subreq); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OMPI_OSC_RDMA_REQUEST_RETURN(subreq); + (void) OPAL_THREAD_ADD32 (&request->outstanding_requests, -1); + } + + return ret; +} static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *peer, uint64_t source_address, mca_btl_base_registration_handle_t *source_handle, void *target_buffer, size_t size, @@ -639,33 +667,81 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p aligned_source_bound = (source_address + size + btl_alignment_mask) & ~btl_alignment_mask; aligned_len = aligned_source_bound - aligned_source_base; - request->offset = source_address - aligned_source_base; - request->len = size; - request->origin_addr = target_buffer; - request->sync = sync; - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating get of %lu bytes from remote ptr %" PRIx64 " to local ptr %p", size, source_address, target_buffer); if ((module->selected_btl->btl_register_mem && size > module->selected_btl->btl_get_local_registration_threshold) || (((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { + ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - /* check for alignment */ - if (!(((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { - (void) ompi_osc_rdma_register (module, peer->data_endpoint, target_buffer, size, MCA_BTL_REG_FLAG_LOCAL_WRITE, + if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) { + /* region is too large for a buffered read */ + size_t subsize; + + if ((source_address & btl_alignment_mask) && (source_address & btl_alignment_mask) == ((intptr_t) target_buffer & btl_alignment_mask)) { + /* remote region has the same alignment but the base is not aligned. perform a small + * buffered get of the beginning of the remote region */ + aligned_source_base = OPAL_ALIGN(source_address, module->selected_btl->btl_get_alignment, osc_rdma_base_t); + subsize = (size_t) (aligned_source_base - source_address); + + ret = ompi_osc_rdma_get_partial (sync, peer, source_address, source_handle, target_buffer, subsize, request); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + source_address += subsize; + target_buffer = (void *) ((intptr_t) target_buffer + subsize); + size -= subsize; + + aligned_len = aligned_source_bound - aligned_source_base; + } + + if (!(((uint64_t) target_buffer | source_address) & btl_alignment_mask) && + (size & btl_alignment_mask)) { + /* remote region bases are aligned but the bounds are not. perform a + * small buffered get of the end of the remote region */ + aligned_len = size & ~btl_alignment_mask; + subsize = size - aligned_len; + size = aligned_len; + ret = ompi_osc_rdma_get_partial (sync, peer, source_address + aligned_len, source_handle, + (void *) ((intptr_t) target_buffer + aligned_len), subsize, request); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + } + /* (remaining) user request is now correctly aligned */ + } + + if ((((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { + /* local and remote alignments differ */ + request->buffer = ptr = malloc (aligned_len); + } else { + ptr = target_buffer; + } + + if (NULL != ptr) { + (void) ompi_osc_rdma_register (module, peer->data_endpoint, ptr, aligned_len, MCA_BTL_REG_FLAG_LOCAL_WRITE, &local_handle); } if (OPAL_UNLIKELY(NULL == local_handle)) { - return OMPI_ERR_OUT_OF_RESOURCE; + free (request->buffer); + request->buffer = NULL; + return ret; } } else { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get", ptr, (void *) frag); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get of size %lu bytes, source address 0x%lx", + ptr, (void *) frag, aligned_len, (unsigned long) aligned_source_base); local_handle = frag->handle; } } + request->offset = source_address - aligned_source_base; + request->len = size; + request->origin_addr = target_buffer; + request->sync = sync; + ompi_osc_rdma_sync_rdma_inc (sync); do { diff --git a/ompi/mca/osc/rdma/osc_rdma_frag.h b/ompi/mca/osc/rdma/osc_rdma_frag.h index fcd7243fd0..e9636a24d2 100644 --- a/ompi/mca/osc/rdma/osc_rdma_frag.h +++ b/ompi/mca/osc/rdma/osc_rdma_frag.h @@ -60,7 +60,7 @@ static inline int ompi_osc_rdma_frag_alloc (ompi_osc_rdma_module_t *module, size request_len = OPAL_ALIGN(request_len, 8, size_t); if (request_len > (mca_osc_rdma_component.buffer_size >> 1)) { - return OMPI_ERR_OUT_OF_RESOURCE; + return OMPI_ERR_VALUE_OUT_OF_BOUNDS; } OPAL_THREAD_LOCK(&module->lock); diff --git a/ompi/mca/osc/rdma/osc_rdma_lock.h b/ompi/mca/osc/rdma/osc_rdma_lock.h index 8dedcc3f45..67e78a2a68 100644 --- a/ompi/mca/osc/rdma/osc_rdma_lock.h +++ b/ompi/mca/osc/rdma/osc_rdma_lock.h @@ -50,7 +50,7 @@ static inline int ompi_osc_rdma_lock_release_shared (ompi_osc_rdma_module_t *mod { uint64_t lock = (uint64_t) (intptr_t) peer->state + offset; volatile bool atomic_complete = false; - void *temp; + void *temp = NULL; int ret; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing shared lock %" PRIx64 " on peer %d. value 0x%lx", lock, @@ -117,7 +117,7 @@ static inline int ompi_osc_rdma_lock_acquire_shared (ompi_osc_rdma_module_t *mod { uint64_t lock = (uint64_t) peer->state + offset; volatile bool atomic_complete; - ompi_osc_rdma_lock_t *temp; + ompi_osc_rdma_lock_t *temp = NULL; int ret; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "acquiring shared lock %" PRIx64 " on peer %d. value 0x%lx", lock, @@ -292,7 +292,7 @@ static inline int ompi_osc_rdma_lock_release_exclusive (ompi_osc_rdma_module_t * { uint64_t lock = (uint64_t) (intptr_t) peer->state + offset; volatile bool atomic_complete = false; - void *temp; + void *temp = NULL; int ret; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing exclusive lock %" PRIx64 " on peer %d", lock, peer->rank); diff --git a/ompi/mca/osc/rdma/osc_rdma_request.c b/ompi/mca/osc/rdma/osc_rdma_request.c index a5a8bb8084..86b088b1b6 100644 --- a/ompi/mca/osc/rdma/osc_rdma_request.c +++ b/ompi/mca/osc/rdma/osc_rdma_request.c @@ -59,7 +59,10 @@ static void request_construct(ompi_osc_rdma_request_t *request) request->super.req_free = request_free; request->super.req_cancel = request_cancel; request->super.req_complete_cb = request_complete; - request->parent_request = 0; + request->parent_request = NULL; + request->buffer = NULL; + request->internal = false; + request->outstanding_requests = 0; OBJ_CONSTRUCT(&request->convertor, opal_convertor_t); } diff --git a/ompi/mca/osc/rdma/osc_rdma_request.h b/ompi/mca/osc/rdma/osc_rdma_request.h index ad10e4c69a..3cec365a7a 100644 --- a/ompi/mca/osc/rdma/osc_rdma_request.h +++ b/ompi/mca/osc/rdma/osc_rdma_request.h @@ -60,6 +60,7 @@ struct ompi_osc_rdma_request_t { /** synchronization object */ struct ompi_osc_rdma_sync_t *sync; + void *buffer; }; typedef struct ompi_osc_rdma_request_t ompi_osc_rdma_request_t; OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t); @@ -78,18 +79,19 @@ OBJ_CLASS_DECLARATION(ompi_osc_rdma_request_t); req = (ompi_osc_rdma_request_t*) item; \ OMPI_REQUEST_INIT(&req->super, false); \ req->super.req_mpi_object.win = module->win; \ - req->super.req_complete = false; \ req->super.req_state = OMPI_REQUEST_ACTIVE; \ req->module = rmodule; \ - req->internal = false; \ - req->outstanding_requests = 0; \ - req->parent_request = NULL; \ req->peer = (rpeer); \ } while (0) #define OMPI_OSC_RDMA_REQUEST_RETURN(req) \ do { \ OMPI_REQUEST_FINI(&(req)->super); \ + free ((req)->buffer); \ + (req)->buffer = NULL; \ + (req)->parent_request = NULL; \ + (req)->internal = false; \ + (req)->outstanding_requests = 0; \ opal_free_list_return (&mca_osc_rdma_component.requests, \ (opal_free_list_item_t *) (req)); \ } while (0)