From e4a31f0daf61ca6f99258c311472270ce6f9448c Mon Sep 17 00:00:00 2001
From: Nathan Hjelm
Date: Thu, 15 Mar 2018 12:50:10 -0600
Subject: [PATCH 1/2] btl: add a new btl for the UCT layer in OpenUCX

This commit adds a new btl for one-sided and two-sided communication.
This btl uses the uct layer in OpenUCX. It makes use of multiple uct
contexts and per-thread device pinning to provide good performance when
using threads and osc/rdma. This btl has been tested extensively with
osc/rdma and passes all MTT tests on Aries and IB hardware.

For now this new component disables itself but can be enabled by setting
the btl_uct_memory_domains MCA variable to a comma-delimited list of
supported memory domains/transport layers. For example:
--mca btl_uct_memory_domains ib/mlx5_0. The specific transports used can
be selected using --mca btl_uct_transports. The default is to use any
available transport.

Signed-off-by: Nathan Hjelm
---
 config/ompi_check_ucx.m4                  |   1 +
 opal/mca/btl/uct/Makefile.am              |  69 +++
 opal/mca/btl/uct/btl_uct.h                | 327 ++++++++++++
 opal/mca/btl/uct/btl_uct_am.c             | 317 ++++++++++++
 opal/mca/btl/uct/btl_uct_am.h             |  33 ++
 opal/mca/btl/uct/btl_uct_amo.c            | 186 +++++++
 opal/mca/btl/uct/btl_uct_component.c      | 542 ++++++++++++++++++++
 opal/mca/btl/uct/btl_uct_device_context.h | 162 ++++++
 opal/mca/btl/uct/btl_uct_endpoint.c       | 396 +++++++++++++++
 opal/mca/btl/uct/btl_uct_endpoint.h       |  94 ++++
 opal/mca/btl/uct/btl_uct_frag.c           |  55 +++
 opal/mca/btl/uct/btl_uct_frag.h           |  63 +++
 opal/mca/btl/uct/btl_uct_module.c         | 386 +++++++++++++++
 opal/mca/btl/uct/btl_uct_rdma.c           | 287 +++++++++++
 opal/mca/btl/uct/btl_uct_rdma.h           |  62 +++
 opal/mca/btl/uct/btl_uct_tl.c             | 574 ++++++++++++++++++++++
 opal/mca/btl/uct/btl_uct_types.h          | 321 ++++++++++++
 opal/mca/btl/uct/configure.m4             |  47 ++
 opal/mca/btl/uct/owner.txt                |   7 +
 19 files changed, 3929 insertions(+)
 create mode 100644 opal/mca/btl/uct/Makefile.am
 create mode 100644 opal/mca/btl/uct/btl_uct.h
 create mode 100644 opal/mca/btl/uct/btl_uct_am.c
 create mode 100644 opal/mca/btl/uct/btl_uct_am.h
 create mode 100644 opal/mca/btl/uct/btl_uct_amo.c
 create mode 100644 opal/mca/btl/uct/btl_uct_component.c
 create mode 100644 opal/mca/btl/uct/btl_uct_device_context.h
 create mode 100644 opal/mca/btl/uct/btl_uct_endpoint.c
 create mode 100644 opal/mca/btl/uct/btl_uct_endpoint.h
 create mode 100644 opal/mca/btl/uct/btl_uct_frag.c
 create mode 100644 opal/mca/btl/uct/btl_uct_frag.h
 create mode 100644 opal/mca/btl/uct/btl_uct_module.c
 create mode 100644 opal/mca/btl/uct/btl_uct_rdma.c
 create mode 100644 opal/mca/btl/uct/btl_uct_rdma.h
 create mode 100644 opal/mca/btl/uct/btl_uct_tl.c
 create mode 100644 opal/mca/btl/uct/btl_uct_types.h
 create mode 100644 opal/mca/btl/uct/configure.m4
 create mode 100644 opal/mca/btl/uct/owner.txt

diff --git a/config/ompi_check_ucx.m4 b/config/ompi_check_ucx.m4
index 62cb693bb9b..1bce5179f81 100644
--- a/config/ompi_check_ucx.m4
+++ b/config/ompi_check_ucx.m4
@@ -83,6 +83,7 @@ AC_DEFUN([OMPI_CHECK_UCX],[
                            [$ompi_check_ucx_libdir],
                            [ompi_check_ucx_happy="yes"],
                            [ompi_check_ucx_happy="no"])
+
               CPPFLAGS="$ompi_check_ucx_$1_save_CPPFLAGS"
               LDFLAGS="$ompi_check_ucx_$1_save_LDFLAGS"
               LIBS="$ompi_check_ucx_$1_save_LIBS"
diff --git a/opal/mca/btl/uct/Makefile.am b/opal/mca/btl/uct/Makefile.am
new file mode 100644
index 00000000000..e1015f2823e
--- /dev/null
+++ b/opal/mca/btl/uct/Makefile.am
@@ -0,0 +1,69 @@
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation. All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2017 IBM Corporation. All rights reserved. +# Copyright (c) 2017-2018 Los Alamos National Security, LLC. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AM_CPPFLAGS = $(btl_uct_CPPFLAGS) + +amca_paramdir = $(AMCA_PARAM_SETS_DIR) + +sources = \ + btl_uct.h \ + btl_uct_module.c \ + btl_uct_component.c \ + btl_uct_rdma.h \ + btl_uct_rdma.c \ + btl_uct_endpoint.h \ + btl_uct_endpoint.c \ + btl_uct_amo.c \ + btl_uct_am.h \ + btl_uct_am.c \ + btl_uct_frag.h \ + btl_uct_frag.c \ + btl_uct_tl.c \ + btl_uct_types.h \ + btl_uct_device_context.h + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if MCA_BUILD_opal_btl_uct_DSO +lib = +lib_sources = +component = mca_btl_uct.la +component_sources = $(sources) +else +lib = libmca_btl_uct.la +lib_sources = $(sources) +component = +component_sources = +endif + +mcacomponentdir = $(opallibdir) +mcacomponent_LTLIBRARIES = $(component) +mca_btl_uct_la_SOURCES = $(component_sources) +mca_btl_uct_la_LDFLAGS = -module -avoid-version $(btl_uct_LDFLAGS) +mca_btl_uct_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la $(btl_uct_LIBS) + +noinst_LTLIBRARIES = $(lib) +libmca_btl_uct_la_SOURCES = $(lib_sources) +libmca_btl_uct_la_LDFLAGS = -module -avoid-version $(btl_uct_LDFLAGS) +libmca_btl_uct_la_LIBADD = $(btl_uct_LIBS) diff --git a/opal/mca/btl/uct/btl_uct.h b/opal/mca/btl/uct/btl_uct.h new file mode 100644 index 00000000000..0a896dd736e --- /dev/null +++ b/opal/mca/btl/uct/btl_uct.h @@ -0,0 +1,327 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2009 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_BTL_UCT_H +#define MCA_BTL_UCT_H + +#include "opal_config.h" +#include +#include + +/* Open MPI includes */ +#include "opal/mca/event/event.h" +#include "opal/mca/btl/base/base.h" +#include "opal/mca/mpool/mpool.h" +#include "opal/mca/btl/base/btl_base_error.h" +#include "opal/mca/rcache/base/base.h" +#include "opal/class/opal_fifo.h" +#include "opal/class/opal_hash_table.h" +#include "opal/mca/pmix/pmix.h" +#include "opal/threads/tsd.h" +#include +#include + +#include "btl_uct_types.h" + +BEGIN_C_DECLS + +/* detection for old vs new atomic flags */ +#if defined(UCT_IFACE_FLAG_ATOMIC_ADD32) +#define OPAL_HAVE_UCT_EP_ATOMIC64_POST 0 +#else +#define OPAL_HAVE_UCT_EP_ATOMIC64_POST 1 +#endif + +/** + * @brief UCT BTL module + */ +struct mca_btl_uct_module_t { + /** base BTL interface */ + mca_btl_base_module_t super; + + /** whether the module has been fully initialized or not */ + bool initialized; + + /** lock for the hash table */ + opal_mutex_t endpoint_lock; + + /** endpoint hash table */ + opal_hash_table_t id_to_endpoint; + + /** mutex to protect the module */ + opal_mutex_t lock; + + /** async context */ + ucs_async_context_t *ucs_async; + + /** transport for active messaging */ + mca_btl_uct_tl_t *am_tl; + + /** transport for RDMA/AMOs */ + mca_btl_uct_tl_t *rdma_tl; + + /** transport for forming connections (if needed) */ + mca_btl_uct_tl_t *conn_tl; + + /** array containing the am_tl and rdma_tl */ + mca_btl_uct_tl_t *comm_tls[2]; + + /** registration cache */ + mca_rcache_base_module_t *rcache; + + /** name of the memory domain backing this module */ + char *md_name; + + /** am and rdma share endpoints */ + bool shared_endpoints; + + /** memory domain */ + mca_btl_uct_md_t *md; + + /** un-registered frags that will be used with uct_ep_am_short() */ + opal_free_list_t short_frags; + + /** registered frags that will be used with uct_ep_am_zcopy() */ + opal_free_list_t eager_frags; + + /** large registered frags for packing non-contiguous data */ + opal_free_list_t max_frags; + + /** RDMA completions */ + opal_free_list_t rdma_completions; + + /** frags that were waiting on connections that are now ready to send */ + opal_list_t pending_frags; +}; +typedef struct mca_btl_uct_module_t mca_btl_uct_module_t; + +extern mca_btl_uct_module_t mca_btl_uct_module_template; + +/** + * @brief UCT BTL component + */ +struct mca_btl_uct_component_t { + /** base BTL component */ + mca_btl_base_component_3_0_0_t super; + + /** number of TL modules */ + int module_count; + + /** All BTL UCT modules (1 per memory domain) */ + mca_btl_uct_module_t *modules[MCA_BTL_UCT_MAX_MODULES]; + + /** allowed UCT memory domains */ + char *memory_domains; + + /** allowed transports */ + char *allowed_transports; + + /** number of worker contexts to create */ + int num_contexts_per_module; + +#if OPAL_C_HAVE__THREAD_LOCAL + /** bind threads to contexts */ + bool bind_threads_to_contexts; +#endif + + /** disable UCX memory hooks */ + bool disable_ucx_memory_hooks; +}; +typedef struct mca_btl_uct_component_t mca_btl_uct_component_t; + +OPAL_MODULE_DECLSPEC extern mca_btl_uct_component_t mca_btl_uct_component; + +struct mca_btl_base_registration_handle_t { + /** The packed memory handle. The size of this field is defined by UCT. 
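+     *  The btl reports the actual size via btl_registration_handle_size, which
+     *  is set from md_attr.rkey_packed_size when the module is created.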
*/ + uint8_t packed_handle[1]; +}; + +struct mca_btl_uct_reg_t { + mca_rcache_base_registration_t base; + + /** UCT memory handle */ + uct_mem_h uct_memh; + + /** remote handle */ + mca_btl_base_registration_handle_t handle; +}; +typedef struct mca_btl_uct_reg_t mca_btl_uct_reg_t; + +OBJ_CLASS_DECLARATION(mca_btl_uct_reg_t); + +#define MCA_BTL_UCT_REG_REMOTE_TO_LOCAL(reg) ((mca_btl_uct_reg_t *)((intptr_t) (reg) - offsetof (mca_btl_uct_reg_t, handle))) + +/** + * Initiate an asynchronous put. + * Completion Semantics: if this function returns a 1 then the operation + * is complete. a return of OPAL_SUCCESS indicates + * the put operation has been queued with the + * network. the local_handle can not be deregistered + * until all outstanding operations on that handle + * have been completed. + * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param local_address (IN) Local address to put from (registered) + * @param remote_address (IN) Remote address to put to (registered remotely) + * @param local_handle (IN) Registration handle for region containing + * (local_address, local_address + size) + * @param remote_handle (IN) Remote registration handle for region containing + * (remote_address, remote_address + size) + * @param size (IN) Number of bytes to put + * @param flags (IN) Flags for this put operation + * @param order (IN) Ordering + * @param cbfunc (IN) Function to call on completion (if queued) + * @param cbcontext (IN) Context for the callback + * @param cbdata (IN) Data for callback + * + * @retval OPAL_SUCCESS The descriptor was successfully queued for a put + * @retval OPAL_ERROR The descriptor was NOT successfully queued for a put + * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put + * operation. Try again later + * @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or + * alignment restrictions. + */ +int mca_btl_uct_put (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); + +/** + * Initiate an asynchronous get. + * Completion Semantics: if this function returns a 1 then the operation + * is complete. a return of OPAL_SUCCESS indicates + * the get operation has been queued with the + * network. the local_handle can not be deregistered + * until all outstanding operations on that handle + * have been completed. 
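+ *
+ * For a get the transfer direction is remote to local: data is read from
+ * (remote_address, remote_address + size) on the target and written into
+ * local_address on the initiator.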
+ * + * @param btl (IN) BTL module + * @param endpoint (IN) BTL addressing information + * @param local_address (IN) Local address to put from (registered) + * @param remote_address (IN) Remote address to put to (registered remotely) + * @param local_handle (IN) Registration handle for region containing + * (local_address, local_address + size) + * @param remote_handle (IN) Remote registration handle for region containing + * (remote_address, remote_address + size) + * @param size (IN) Number of bytes to put + * @param flags (IN) Flags for this put operation + * @param order (IN) Ordering + * @param cbfunc (IN) Function to call on completion (if queued) + * @param cbcontext (IN) Context for the callback + * @param cbdata (IN) Data for callback + * + * @retval OPAL_SUCCESS The descriptor was successfully queued for a put + * @retval OPAL_ERROR The descriptor was NOT successfully queued for a put + * @retval OPAL_ERR_OUT_OF_RESOURCE Insufficient resources to queue the put + * operation. Try again later + * @retval OPAL_ERR_NOT_AVAILABLE Put can not be performed due to size or + * alignment restrictions. + */ +int mca_btl_uct_get (struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); + + /** + * Fault Tolerance Event Notification Function + * @param state Checkpoint Stae + * @return OPAL_SUCCESS or failure status + */ +int mca_btl_uct_ft_event(int state); + +int mca_btl_uct_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle, + mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); + +int mca_btl_uct_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, + uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata); + +int mca_btl_uct_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); + + +int mca_btl_uct_flush (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint); +int mca_btl_uct_flush_thread (mca_btl_base_module_t *btl); + +int mca_btl_uct_finalize (mca_btl_base_module_t *btl); + +int mca_btl_uct_reg_mem (void *reg_data, void *base, size_t size, mca_rcache_base_registration_t *reg); +int mca_btl_uct_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg); + +ucs_status_t mca_btl_uct_am_handler (void *arg, void *data, size_t length, unsigned flags); + +struct mca_btl_base_endpoint_t *mca_btl_uct_get_ep (struct mca_btl_base_module_t *module, opal_proc_t *proc); + +int mca_btl_uct_query_tls (mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_descs, unsigned tl_count); + +/** + 
* @brief Checks if a tl is suitable for using for RDMA + * + * @param[in] tl btl/uct tl pointer + */ +static inline bool mca_btl_uct_tl_supports_rdma (mca_btl_uct_tl_t *tl) +{ + return (tl->uct_iface_attr.cap.flags & (UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_GET_ZCOPY)) == + (UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_GET_ZCOPY); +} + +/** + * @brief Checks if a tl is suitable for using for active messaging + */ +static inline bool mca_btl_uct_tl_support_am (mca_btl_uct_tl_t *tl) +{ + return (tl->uct_iface_attr.cap.flags & (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_AM_ZCOPY)); +} + +/** + * @brief Checks if a tl can be used for passing data to connect endpoints + * + * @param[in] tl btl/uct tl pointer + */ +static inline bool mca_btl_uct_tl_supports_conn (mca_btl_uct_tl_t *tl) +{ + return (tl->uct_iface_attr.cap.flags & (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CONNECT_TO_IFACE)) == + (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CONNECT_TO_IFACE); +} + +/** + * @brief Check if tl endpoints need to be connected via a connection tl + * + * @param[in] tl btl/uct tl pointer + */ +static inline bool mca_btl_uct_tl_requires_connection_tl (mca_btl_uct_tl_t *tl) +{ + return !(tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE); +} + +END_C_DECLS +#endif diff --git a/opal/mca/btl/uct/btl_uct_am.c b/opal/mca/btl/uct/btl_uct_am.c new file mode 100644 index 00000000000..6927f31c8cd --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_am.c @@ -0,0 +1,317 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2018 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_uct_am.h" +#include "btl_uct_rdma.h" +#include "btl_uct_device_context.h" + +/** + * Allocate a segment. + * + * @param btl (IN) BTL module + * @param size (IN) Request segment size. 
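+ * @param endpoint (IN) BTL addressing information
+ * @param order (IN)    Ordering requested, if any
+ * @param flags (IN)    Descriptor flags (MCA_BTL_DES_*)
+ *
+ * @return a descriptor backed by a short, eager, or max fragment depending on
+ *         the requested size, or NULL if no fragment could be allocated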
+ */ +mca_btl_base_descriptor_t *mca_btl_uct_alloc (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, + uint8_t order, size_t size, uint32_t flags) +{ + mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; + mca_btl_uct_base_frag_t *frag = NULL; + + if ((size + 8) <= (size_t) uct_btl->am_tl->uct_iface_attr.cap.am.max_short) { + frag = mca_btl_uct_frag_alloc_short (uct_btl, endpoint); + } else if (size <= uct_btl->super.btl_eager_limit) { + frag = mca_btl_uct_frag_alloc_eager (uct_btl, endpoint); + } else { + frag = mca_btl_uct_frag_alloc_max (uct_btl, endpoint); + } + + if (OPAL_LIKELY(frag != NULL)) { + frag->segments[0].seg_len = size; + + frag->base.des_segment_count = 1; + frag->base.des_flags = flags; + frag->base.order = order; + frag->uct_iov.length = size; + } + + return (mca_btl_base_descriptor_t *) frag; +} + +static inline void _mca_btl_uct_send_pack (void *data, void *header, size_t header_size, opal_convertor_t *convertor, + size_t payload_size) +{ + uint32_t iov_count = 1; + struct iovec iov; + size_t length; + + if (header_size > 0) { + assert (NULL != header); + memcpy (data, header, header_size); + } + + /* pack the data into the supplied buffer */ + iov.iov_base = (IOVBASE_TYPE *) ((intptr_t) data + header_size); + iov.iov_len = length = payload_size; + + (void) opal_convertor_pack (convertor, &iov, &iov_count, &length); + + assert (length == payload_size); +} + +struct mca_btl_base_descriptor_t *mca_btl_uct_prepare_src (mca_btl_base_module_t *btl, + mca_btl_base_endpoint_t *endpoint, + opal_convertor_t *convertor, + uint8_t order, size_t reserve, + size_t *size, uint32_t flags) +{ + mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; + const size_t total_size = reserve + *size; + mca_btl_uct_base_frag_t *frag; + void *data_ptr; + + /* in place send fragment */ + if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor) || total_size > uct_btl->super.btl_eager_limit)) { + frag = (mca_btl_uct_base_frag_t *) mca_btl_uct_alloc (btl, endpoint, order, total_size, flags); + if (OPAL_UNLIKELY(NULL == frag)) { + return NULL; + } + + _mca_btl_uct_send_pack ((void *) ((intptr_t) frag->uct_iov.buffer + reserve), NULL, 0, + convertor, *size); + } else { + opal_convertor_get_current_pointer (convertor, &data_ptr); + assert (NULL != data_ptr); + + frag = mca_btl_uct_frag_alloc_short (uct_btl, endpoint); + if (OPAL_UNLIKELY(NULL == frag)) { + return NULL; + } + + frag->base.order = order; + frag->base.des_flags = flags; + if (total_size > (size_t) uct_btl->am_tl->uct_iface_attr.cap.am.max_short) { + frag->segments[1].seg_len = *size; + frag->segments[1].seg_addr.pval = data_ptr; + frag->base.des_segment_count = 2; + } else { + memcpy ((void *)((intptr_t) frag->segments[1].seg_addr.pval + reserve), data_ptr, *size); + } + } + + return &frag->base; +} + +/** + * Return a segment allocated by this BTL. + * + * @param btl (IN) BTL module + * @param segment (IN) Allocated segment. 
+ */ +int mca_btl_uct_free (mca_btl_base_module_t *btl, mca_btl_base_descriptor_t *des) +{ + mca_btl_uct_frag_return ((mca_btl_uct_base_frag_t *) des); + return OPAL_SUCCESS; +} + +static size_t mca_btl_uct_send_frag_pack (void *data, void *arg) +{ + mca_btl_uct_base_frag_t *frag = (mca_btl_uct_base_frag_t *) arg; + size_t length = 8; + + memcpy (data, &frag->header, sizeof (frag->header)); + data = (void *)((intptr_t) data + 8); + + /* this function should only ever get called with fragments with two segments */ + for (size_t i = 0 ; i < 2 ; ++i) { + const size_t seg_len = frag->segments[i].seg_len; + memcpy (data, frag->segments[i].seg_addr.pval, seg_len); + data = (void *)((intptr_t) data + seg_len); + length += seg_len; + } + + return length; +} + +int mca_btl_uct_send_frag (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_base_frag_t *frag, + int32_t flags, mca_btl_uct_device_context_t *context, uct_ep_h ep_handle) +{ + ucs_status_t ucs_status; + + mca_btl_uct_context_lock (context); + + do { + if (NULL != frag->base.super.registration) { + frag->comp.dev_context = context; + + ucs_status = uct_ep_am_zcopy (ep_handle, MCA_BTL_UCT_FRAG, &frag->header, sizeof (frag->header), + &frag->uct_iov, 1, 0, &frag->comp.uct_comp); + } else { + /* short message */ + /* restore original flags */ + frag->base.des_flags = flags; + + if (1 == frag->base.des_segment_count) { + ucs_status = uct_ep_am_short (ep_handle, MCA_BTL_UCT_FRAG, frag->header.value, frag->uct_iov.buffer, + frag->uct_iov.length); + } else { + ucs_status = uct_ep_am_bcopy (ep_handle, MCA_BTL_UCT_FRAG, mca_btl_uct_send_frag_pack, frag, 0); + } + } + + if (UCS_ERR_NO_RESOURCE != ucs_status) { + /* go ahead and progress the worker while we have the lock */ + (void) uct_worker_progress (context->uct_worker); + break; + } + + /* wait for something to complete before trying again */ + while (!uct_worker_progress (context->uct_worker)); + } while (1); + + mca_btl_uct_context_unlock (context); + + if (UCS_OK == ucs_status) { + /* restore original flags */ + frag->base.des_flags = flags; + /* send is complete */ + mca_btl_uct_frag_complete (frag, OPAL_SUCCESS); + return 1; + } + + if (OPAL_UNLIKELY(UCS_INPROGRESS != ucs_status)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + return 0; +} + +int mca_btl_uct_send (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, mca_btl_base_descriptor_t *descriptor, + mca_btl_base_tag_t tag) +{ + mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; + mca_btl_uct_device_context_t *context = mca_btl_uct_module_get_am_context (uct_btl); + mca_btl_uct_base_frag_t *frag = (mca_btl_uct_base_frag_t *) descriptor; + int flags = frag->base.des_flags; + uct_ep_h ep_handle; + int rc; + + BTL_VERBOSE(("btl/uct sending descriptor %p from %d -> %d. 
length = %" PRIu64, (void *)descriptor, + OPAL_PROC_MY_NAME.vpid, endpoint->ep_proc->proc_name.vpid, frag->uct_iov.length)); + + + frag->header.data.tag = tag; + + /* add the callback flag before posting to avoid potential races with other threads */ + frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; + + rc = mca_btl_uct_endpoint_check_am (uct_btl, endpoint, context, &ep_handle); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + OPAL_THREAD_LOCK(&endpoint->ep_lock); + /* check one more time in case another thread is completing the connection now */ + if (OPAL_SUCCESS != mca_btl_uct_endpoint_test_am (uct_btl, endpoint, context, &ep_handle)) { + frag->context_id = context->context_id; + frag->ready = false; + OPAL_THREAD_LOCK(&uct_btl->lock); + opal_list_append (&uct_btl->pending_frags, (opal_list_item_t *) frag); + OPAL_THREAD_UNLOCK(&endpoint->ep_lock); + OPAL_THREAD_UNLOCK(&uct_btl->lock); + + return OPAL_SUCCESS; + } + OPAL_THREAD_UNLOCK(&endpoint->ep_lock); + } + + return mca_btl_uct_send_frag (uct_btl, endpoint, frag, flags, context, ep_handle); +} + +struct mca_btl_uct_sendi_pack_args_t { + uint64_t am_header; + void *header; + size_t header_size; + opal_convertor_t *convertor; + size_t payload_size; +}; + +typedef struct mca_btl_uct_sendi_pack_args_t mca_btl_uct_sendi_pack_args_t; + +static size_t mca_btl_uct_sendi_pack (void *data, void *arg) +{ + mca_btl_uct_sendi_pack_args_t *args = (mca_btl_uct_sendi_pack_args_t *) arg; + mca_btl_uct_am_header_t *am_header = (mca_btl_uct_am_header_t *) data; + + am_header->value = args->am_header; + _mca_btl_uct_send_pack ((void *)((intptr_t)data + 8), args->header, args->header_size, args->convertor, + args->payload_size); + return args->header_size + args->payload_size + 8; +} + +static inline size_t mca_btl_uct_max_sendi (mca_btl_uct_module_t *uct_btl) +{ + return (uct_btl->am_tl->uct_iface_attr.cap.am.max_short > uct_btl->am_tl->uct_iface_attr.cap.am.max_bcopy) ? 
+ uct_btl->am_tl->uct_iface_attr.cap.am.max_short : uct_btl->am_tl->uct_iface_attr.cap.am.max_bcopy; +} + +int mca_btl_uct_sendi (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, opal_convertor_t *convertor, + void *header, size_t header_size, size_t payload_size, uint8_t order, uint32_t flags, + mca_btl_base_tag_t tag, mca_btl_base_descriptor_t **descriptor) +{ + mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; + mca_btl_uct_device_context_t *context = mca_btl_uct_module_get_am_context (uct_btl); + const size_t total_size = header_size + payload_size; + /* message with header */ + const size_t msg_size = total_size + 8; + mca_btl_uct_am_header_t am_header; + ucs_status_t ucs_status; + uct_ep_h ep_handle; + int rc; + + rc = mca_btl_uct_endpoint_check_am (uct_btl, endpoint, context, &ep_handle); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || msg_size > mca_btl_uct_max_sendi (uct_btl))) { + if (descriptor) { + *descriptor = mca_btl_uct_alloc (btl, endpoint, order, total_size, flags); + } + + return OPAL_ERR_OUT_OF_RESOURCE; + } + + am_header.data.tag = tag; + + mca_btl_uct_context_lock (context); + if (0 == payload_size) { + ucs_status = uct_ep_am_short (ep_handle, MCA_BTL_UCT_FRAG, am_header.value, header, header_size); + } else if (msg_size < (size_t) uct_btl->am_tl->uct_iface_attr.cap.am.max_short) { + int8_t *data = alloca (total_size); + _mca_btl_uct_send_pack (data, header, header_size, convertor, payload_size); + ucs_status = uct_ep_am_short (ep_handle, MCA_BTL_UCT_FRAG, am_header.value, data, total_size); + } else { + ssize_t size; + + size = uct_ep_am_bcopy (ep_handle, MCA_BTL_UCT_FRAG, mca_btl_uct_sendi_pack, + &(mca_btl_uct_sendi_pack_args_t) {.am_header = am_header.value, + .header = header, .header_size = header_size, + .convertor = convertor, .payload_size = payload_size}, 0); + if (OPAL_LIKELY(size == (ssize_t) msg_size)) { + ucs_status = UCS_OK; + } + } + + mca_btl_uct_context_unlock (context); + + if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { + if (descriptor) { + *descriptor = mca_btl_uct_alloc (btl, endpoint, order, total_size, flags); + } + + return OPAL_ERR_OUT_OF_RESOURCE; + } + + return OPAL_SUCCESS; +} diff --git a/opal/mca/btl/uct/btl_uct_am.h b/opal/mca/btl/uct/btl_uct_am.h new file mode 100644 index 00000000000..07d7223eab5 --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_am.h @@ -0,0 +1,33 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2018 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#if !defined(MCA_BTL_UCT_AM_H) +#define MCA_BTL_UCT_AM_H + +#include "btl_uct_frag.h" + +int mca_btl_uct_sendi (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, opal_convertor_t *convertor, + void *header, size_t header_size, size_t payload_size, uint8_t order, uint32_t flags, + mca_btl_base_tag_t tag, mca_btl_base_descriptor_t **descriptor); + +int mca_btl_uct_send (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, mca_btl_base_descriptor_t *descriptor, + mca_btl_base_tag_t tag); + +int mca_btl_uct_send_frag (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_base_frag_t *frag, + int32_t flags, mca_btl_uct_device_context_t *context, uct_ep_h ep_handle); + +mca_btl_base_descriptor_t *mca_btl_uct_alloc (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, + uint8_t order, size_t size, uint32_t flags); + +int mca_btl_uct_free (mca_btl_base_module_t *btl, mca_btl_base_descriptor_t *des); + + +#endif /* !defined(MCA_BTL_UCT_AM_H) */ diff --git a/opal/mca/btl/uct/btl_uct_amo.c b/opal/mca/btl/uct/btl_uct_amo.c new file mode 100644 index 00000000000..d443777089f --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_amo.c @@ -0,0 +1,186 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_uct_device_context.h" + +#if OPAL_HAVE_UCT_EP_ATOMIC64_POST +/* we add 1 to the ops to differentiate between unsupported and supported ops since + * UCT_ATOMIC_OP_ADD == 0. otherwise we would have to fill in this table completely. */ +static int mca_btl_uct_btl_to_uct_atomic[MCA_BTL_ATOMIC_LAST] = { + [MCA_BTL_ATOMIC_ADD] = UCT_ATOMIC_OP_ADD + 1, + [MCA_BTL_ATOMIC_AND] = UCT_ATOMIC_OP_AND + 1, + [MCA_BTL_ATOMIC_OR] = UCT_ATOMIC_OP_OR + 1, + [MCA_BTL_ATOMIC_XOR] = UCT_ATOMIC_OP_XOR + 1, + [MCA_BTL_ATOMIC_SWAP] = UCT_ATOMIC_OP_SWAP + 1, +}; +#endif + +int mca_btl_uct_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, + uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata) +{ + mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; + mca_btl_uct_device_context_t *context = mca_btl_uct_module_get_rdma_context (uct_btl); + mca_btl_uct_uct_completion_t *comp = NULL; + ucs_status_t ucs_status; + uct_rkey_bundle_t rkey; + uct_ep_h ep_handle; + int rc; + +#if OPAL_HAVE_UCT_EP_ATOMIC64_POST + int uct_op = mca_btl_uct_btl_to_uct_atomic[op]; + + if (OPAL_UNLIKELY(0 == uct_op--)) { + return OPAL_ERR_BAD_PARAM; + } +#else + if (OPAL_UNLIKELY(MCA_BTL_ATOMIC_ADD != op && MCA_BTL_ATOMIC_SWAP != op)) { + return OPAL_ERR_BAD_PARAM; + } +#endif + + if (cbfunc) { + comp = mca_btl_uct_uct_completion_alloc (uct_btl, endpoint, local_address, local_handle, context, + cbfunc, cbcontext, cbdata); + if (OPAL_UNLIKELY(NULL == comp)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + } + + rc = mca_btl_uct_get_rkey (uct_btl, context, endpoint, remote_handle, &rkey, &ep_handle); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + mca_btl_uct_uct_completion_release (comp); + return rc; + } + + mca_btl_uct_context_lock (context); + +#if 
OPAL_HAVE_UCT_EP_ATOMIC64_POST + if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { + ucs_status = uct_ep_atomic32_fetch (ep_handle, uct_op, operand, (uint32_t *) local_address, remote_address, + rkey.rkey, &comp->uct_comp); + } else { + ucs_status = uct_ep_atomic64_fetch (ep_handle, uct_op, operand, (uint64_t *) local_address, remote_address, + rkey.rkey, &comp->uct_comp); + } +#else + if (MCA_BTL_ATOMIC_ADD == op) { + if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { + ucs_status = uct_ep_atomic_fadd32 (ep_handle, (uint32_t) operand, remote_address, + rkey.rkey, (uint32_t *) local_address, &comp->uct_comp); + } else { + ucs_status = uct_ep_atomic_fadd64 (ep_handle, operand, remote_address, rkey.rkey, + (uint64_t *) local_address, &comp->uct_comp); + } + } else { + if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { + ucs_status = uct_ep_atomic_swap32 (ep_handle, (uint32_t) operand, remote_address, + rkey.rkey, (uint32_t *) local_address, &comp->uct_comp); + } else { + ucs_status = uct_ep_atomic_swap64 (ep_handle, operand, remote_address, rkey.rkey, + (uint64_t *) local_address, &comp->uct_comp); + } + } +#endif + + /* go ahead and progress the worker while we have the lock */ + (void) uct_worker_progress (context->uct_worker); + + mca_btl_uct_context_unlock (context); + + mca_btl_uct_device_handle_completions (context); + + if (UCS_INPROGRESS == ucs_status) { + rc = OPAL_SUCCESS; + } else if (UCS_OK == ucs_status) { + rc = 1; + } else { + rc = OPAL_ERR_OUT_OF_RESOURCE; + } + + uct_rkey_release (&rkey); + + return rc; +} + +int mca_btl_uct_aop (struct mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, + uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle, + mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + /* this is static so it survives after this function returns. we don't care about the result */ + static uint64_t result; + + /* just use the fetching ops for now. there probably is a performance benefit to using + * the non-fetching on some platforms but this is easier to implement quickly and it + * guarantees remote completion. 
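+ * concurrent non-fetching atomics may all target this same static location;
+ * that is harmless because the fetched value is never examined.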
*/ + return mca_btl_uct_afop (btl, endpoint, &result, remote_address, NULL, remote_handle, op, + operand, flags, order, cbfunc, cbcontext, cbdata); +} + +int mca_btl_uct_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; + mca_btl_uct_device_context_t *context = mca_btl_uct_module_get_rdma_context (uct_btl); + mca_btl_uct_uct_completion_t *comp = NULL; + ucs_status_t ucs_status; + uct_rkey_bundle_t rkey; + uct_ep_h ep_handle; + int rc; + + if (cbfunc) { + comp = mca_btl_uct_uct_completion_alloc (uct_btl, endpoint, local_address, local_handle, context, + cbfunc, cbcontext, cbdata); + if (OPAL_UNLIKELY(NULL == comp)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + } + + rc = mca_btl_uct_get_rkey (uct_btl, context, endpoint, remote_handle, &rkey, &ep_handle); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + mca_btl_uct_uct_completion_release (comp); + return rc; + } + + mca_btl_uct_context_lock (context); + + if (flags & MCA_BTL_ATOMIC_FLAG_32BIT) { + ucs_status = uct_ep_atomic_cswap32 (ep_handle, (uint32_t) compare, (uint32_t) value, remote_address, + rkey.rkey, (uint32_t *) local_address, &comp->uct_comp); + } else { + ucs_status = uct_ep_atomic_cswap64 (ep_handle, compare, value, remote_address, rkey.rkey, + (uint64_t *) local_address, &comp->uct_comp); + } + + /* go ahead and progress the worker while we have the lock */ + (void) uct_worker_progress (context->uct_worker); + + mca_btl_uct_context_unlock (context); + + mca_btl_uct_device_handle_completions (context); + + if (UCS_INPROGRESS == ucs_status) { + rc = OPAL_SUCCESS; + } else if (UCS_OK == ucs_status) { + rc = 1; + } else { + rc = OPAL_ERR_OUT_OF_RESOURCE; + } + + uct_rkey_release (&rkey); + + return rc; +} diff --git a/opal/mca/btl/uct/btl_uct_component.c b/opal/mca/btl/uct/btl_uct_component.c new file mode 100644 index 00000000000..67769c285df --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_component.c @@ -0,0 +1,542 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "opal_config.h" + +#include "opal/mca/btl/btl.h" +#include "opal/mca/btl/base/base.h" +#include "opal/mca/hwloc/base/base.h" + +#include + +#include "btl_uct_device_context.h" +#include "btl_uct_am.h" + +#if !OPAL_C_HAVE__THREAD_LOCAL +opal_tsd_key_t mca_btl_uct_tsd_device_key; +#endif + +static int mca_btl_uct_component_register(void) +{ + mca_btl_uct_module_t *module = &mca_btl_uct_module_template; + + mca_btl_uct_component.memory_domains = "none"; + (void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version, + "memory_domains", "Comma-delimited list of memory domains of the form " + "to use for communication. Memory domains MUST provide transports that " + "support put, get, and amos. Special values: all (all available), none." + " (default: none)", MCA_BASE_VAR_TYPE_STRING, NULL, 0, + MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, + &mca_btl_uct_component.memory_domains); + + mca_btl_uct_component.allowed_transports = "any"; + (void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version, + "transports", "Comma-delimited list of transports of the form to use." + " The list of transports available can be queried using ucx_info. Special" + "values: any (any available) (default: any)", MCA_BASE_VAR_TYPE_STRING, + NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, + &mca_btl_uct_component.allowed_transports); + + mca_btl_uct_component.num_contexts_per_module = 0; + (void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version, + "num_contexts_per_module", "Number of UCT worker contexts " + "to create for each BTL module. Larger numbers will improve " + "multi-threaded performance but may increase memory usage. " + "A good rule of thumb is one context per application thread " + "that will be calling into MPI. (default: 0 -- autoselect " + "based on the number of cores)", MCA_BASE_VAR_TYPE_INT, + NULL, 0 ,MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_ALL, &mca_btl_uct_component.num_contexts_per_module); + + mca_btl_uct_component.disable_ucx_memory_hooks = true; + (void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version, + "disable_ucx_memory_hooks", "Disable the munmap memory hook " + "inside UCX. These hooks are not necessary when using the " + "uct btl and tend to cause performance problems when using " + "multiple threads (default: true)", MCA_BASE_VAR_TYPE_BOOL, + NULL, 0 ,MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_ALL, &mca_btl_uct_component.disable_ucx_memory_hooks); + + +#if OPAL_C_HAVE__THREAD_LOCAL + mca_btl_uct_component.bind_threads_to_contexts = true; + (void) mca_base_component_var_register(&mca_btl_uct_component.super.btl_version, + "bind_threads_to_contexts", "Bind threads to device contexts. " + "In general this should improve the multi-threaded performance " + "when threads are used. 
(default: true)", MCA_BASE_VAR_TYPE_BOOL, + NULL, 0 ,MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_ALL, &mca_btl_uct_component.bind_threads_to_contexts); +#endif + + /* for now we want this component to lose to btl/ugni and btl/vader */ + module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH; + + return mca_btl_base_param_register (&mca_btl_uct_component.super.btl_version, + &module->super); +} + +static int mca_btl_uct_component_open(void) +{ + if (0 == mca_btl_uct_component.num_contexts_per_module) { + /* use the core count and the number of local processes to determine + * how many UCT workers to create */ + int core_count = 36; + + (void) opal_hwloc_base_get_topology (); + core_count = hwloc_get_nbobjs_by_type (opal_hwloc_topology, HWLOC_OBJ_CORE); + + if (core_count <= opal_process_info.num_local_peers || !opal_using_threads()) { + /* there is probably no benefit to using multiple device contexts when not + * using threads or oversubscribing the node with mpi processes. */ + mca_btl_uct_component.num_contexts_per_module = 1; + } else { + mca_btl_uct_component.num_contexts_per_module = core_count / (opal_process_info.num_local_peers + 1); + } + } + +#if !OPAL_C_HAVE__THREAD_LOCAL + opal_tsd_key_create (&mca_btl_uct_tsd_device_key, NULL); +#endif + + return OPAL_SUCCESS; +} + + +/* + * component cleanup - sanity checking of queue lengths + */ +static int mca_btl_uct_component_close(void) +{ +#if !OPAL_C_HAVE__THREAD_LOCAL + opal_tsd_key_delete (mca_btl_uct_tsd_device_key); +#endif + + return OPAL_SUCCESS; +} + +static size_t mca_btl_uct_tl_modex_size (mca_btl_uct_tl_t *tl) +{ + const size_t size = strlen (tl->uct_tl_name) + 1; + + if (tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { + /* pad out to a multiple of 4 bytes */ + return (4 + 3 + size + tl->uct_iface_attr.device_addr_len + tl->uct_iface_attr.iface_addr_len) & ~3; + } + + return (4 + 3 + size + tl->uct_iface_attr.device_addr_len) & ~3; +} + +static size_t mca_btl_uct_module_modex_size (mca_btl_uct_module_t *module) +{ + size_t modex_size = 4 + strlen (module->md_name) + 1; + + if (module->rdma_tl) { + modex_size += mca_btl_uct_tl_modex_size (module->rdma_tl); + } + + if (module->am_tl && module->am_tl != module->rdma_tl) { + modex_size += mca_btl_uct_tl_modex_size (module->am_tl); + } + + if (module->conn_tl && module->conn_tl != module->rdma_tl && module->conn_tl != module->am_tl) { + modex_size += mca_btl_uct_tl_modex_size (module->conn_tl); + } + + return modex_size; +} + +static size_t mca_btl_uct_tl_modex_pack (mca_btl_uct_tl_t *tl, uint8_t *modex_data) +{ + mca_btl_uct_device_context_t *dev_context = tl->uct_dev_contexts[0]; + size_t modex_size = mca_btl_uct_tl_modex_size (tl); + + *((uint32_t *) modex_data) = (uint32_t) modex_size; + modex_data += 4; + + strcpy ((char *) modex_data, tl->uct_tl_name); + modex_data += strlen (tl->uct_tl_name) + 1; + + /* NTH: only the first context is available. i assume the device addresses of the + * contexts will be the same but they will have different iface addresses. i also + * am assuming that it doesn't really matter if all remote contexts connect to + * the same endpoint since we are only doing RDMA. if any of these assumptions are + * wrong then we can't delay creating the other contexts and must include their + * information in the modex. 
*/ + if (tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) { + uct_iface_get_address (dev_context->uct_iface, (uct_iface_addr_t *) modex_data); + modex_data += tl->uct_iface_attr.iface_addr_len; + } + + uct_iface_get_device_address (dev_context->uct_iface, (uct_device_addr_t *) modex_data); + modex_data += tl->uct_iface_attr.device_addr_len; + + return modex_size; +} + +static int mca_btl_uct_modex_send (void) +{ + size_t modex_size = sizeof (mca_btl_uct_modex_t); + mca_btl_uct_modex_t *modex; + uint8_t *modex_data; + int rc; + + for (unsigned i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + modex_size += mca_btl_uct_module_modex_size (mca_btl_uct_component.modules[i]); + } + + modex = alloca (modex_size); + modex_data = modex->data; + + modex->module_count = mca_btl_uct_component.module_count; + + for (unsigned i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; + size_t name_len = strlen (module->md_name); + + /* pack the size */ + *((uint32_t *) modex_data) = (uint32_t) mca_btl_uct_module_modex_size (module); + + modex_data += 4; + + strcpy ((char *) modex_data, module->md_name); + modex_data += name_len + 1; + + if (module->rdma_tl) { + modex_data += mca_btl_uct_tl_modex_pack (module->rdma_tl, modex_data); + } + + if (module->am_tl && module->am_tl != module->rdma_tl) { + modex_data += mca_btl_uct_tl_modex_pack (module->am_tl, modex_data); + } + + if (module->conn_tl && module->conn_tl != module->rdma_tl && module->conn_tl != module->am_tl) { + modex_data += mca_btl_uct_tl_modex_pack (module->conn_tl, modex_data); + } + } + + OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL, &mca_btl_uct_component.super.btl_version, modex, modex_size); + return rc; +} + +static mca_btl_uct_module_t *mca_btl_uct_alloc_module (const char *md_name, mca_btl_uct_md_t *md, + size_t registration_size) +{ + mca_btl_uct_module_t *module; + ucs_status_t ucs_status; + + module = malloc (sizeof (*module)); + if (NULL == module) { + return NULL; + } + + /* copy the module template */ + *module = mca_btl_uct_module_template; + + OBJ_CONSTRUCT(&module->id_to_endpoint, opal_hash_table_t); + OBJ_CONSTRUCT(&module->endpoint_lock, opal_mutex_t); + OBJ_CONSTRUCT(&module->short_frags, opal_free_list_t); + OBJ_CONSTRUCT(&module->eager_frags, opal_free_list_t); + OBJ_CONSTRUCT(&module->max_frags, opal_free_list_t); + OBJ_CONSTRUCT(&module->rdma_completions, opal_free_list_t); + OBJ_CONSTRUCT(&module->pending_frags, opal_list_t); + OBJ_CONSTRUCT(&module->lock, opal_mutex_t); + + module->md = md; + module->md_name = strdup (md_name); + module->super.btl_registration_handle_size = registration_size; + + ucs_status = ucs_async_context_create (UCS_ASYNC_MODE_THREAD, &module->ucs_async); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("Could not create a UCT async context")); + mca_btl_uct_finalize (&module->super); + return NULL; + } + + return module; +} + +ucs_status_t mca_btl_uct_am_handler (void *arg, void *data, size_t length, unsigned flags) +{ + mca_btl_uct_device_context_t *tl_context = (mca_btl_uct_device_context_t *) arg; + mca_btl_uct_module_t *uct_btl = tl_context->uct_btl; + mca_btl_uct_am_header_t *header = (mca_btl_uct_am_header_t *) data; + mca_btl_active_message_callback_t *reg; + mca_btl_base_segment_t seg = {.seg_addr = {.pval = (void *) ((intptr_t) data + sizeof (*header))}, + .seg_len = length - sizeof (*header)}; + mca_btl_uct_base_frag_t frag = {.base = {.des_segments = &seg, .des_segment_count = 1}}; + + reg = 
mca_btl_base_active_message_trigger + header->data.tag; + mca_btl_uct_context_unlock (tl_context); + reg->cbfunc (&uct_btl->super, header->data.tag, &frag.base, reg->cbdata); + mca_btl_uct_context_lock (tl_context); + + return UCS_OK; +} + +static int mca_btl_uct_component_process_uct_md (uct_md_resource_desc_t *md_desc, char **allowed_ifaces) +{ + mca_rcache_base_resources_t rcache_resources; + uct_tl_resource_desc_t *tl_desc; + mca_btl_uct_module_t *module; + uct_md_config_t *uct_config; + uct_md_attr_t md_attr; + mca_btl_uct_md_t *md; + bool found = false; + unsigned num_tls; + char *tmp; + + if (MCA_BTL_UCT_MAX_MODULES == mca_btl_uct_component.module_count) { + BTL_VERBOSE(("created the maximum number of allowable modules")); + return OPAL_ERR_NOT_AVAILABLE; + } + + BTL_VERBOSE(("processing memory domain %s", md_desc->md_name)); + + for (int j = 0 ; allowed_ifaces[j] ; ++j) { + if (0 == strncmp (allowed_ifaces[j], md_desc->md_name, strlen (md_desc->md_name)) || + 0 == strcmp (allowed_ifaces[j], "all")) { + found = true; + break; + } + } + + if (!found) { + /* nothing to do */ + return OPAL_SUCCESS; + } + + md = OBJ_NEW(mca_btl_uct_md_t); + + uct_md_config_read (md_desc->md_name, NULL, NULL, &uct_config); + uct_md_open (md_desc->md_name, uct_config, &md->uct_md); + uct_config_release (uct_config); + + uct_md_query (md->uct_md, &md_attr); + uct_md_query_tl_resources (md->uct_md, &tl_desc, &num_tls); + + module = mca_btl_uct_alloc_module (md_desc->md_name, md, md_attr.rkey_packed_size); + if (NULL == module) { + uct_release_tl_resource_list (tl_desc); + return OPAL_ERR_OUT_OF_RESOURCE; + } + + (void) mca_btl_uct_query_tls (module, md, tl_desc, num_tls); + + uct_release_tl_resource_list (tl_desc); + + /* release the initial reference to the md object. if any modules were created the UCT md will remain + * open until those modules are finalized. */ + OBJ_RELEASE(md); + + if (NULL == module->am_tl && NULL == module->rdma_tl) { + BTL_VERBOSE(("uct memory domain %s does not have any appropriate tls", md_desc->md_name)); + mca_btl_uct_finalize (&module->super); + return OPAL_ERR_NOT_AVAILABLE; + } + + mca_btl_uct_component.modules[mca_btl_uct_component.module_count++] = module; + + /* NTH: a registration cache shouldn't be necessary when using UCT but there are measurable + * performance benefits to using rcache/grdma instead of assuming UCT will do the right + * thing. 
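+ * the cache is created per module and named "uct.<md name>" so each memory
+ * domain gets its own grdma rcache instance.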
*/ + (void) asprintf (&tmp, "uct.%s", module->md_name); + + rcache_resources.cache_name = tmp; + rcache_resources.reg_data = (void *) module; + rcache_resources.sizeof_reg = sizeof (mca_btl_uct_reg_t) + module->super.btl_registration_handle_size; + rcache_resources.register_mem = mca_btl_uct_reg_mem; + rcache_resources.deregister_mem = mca_btl_uct_dereg_mem; + + module->rcache = mca_rcache_base_module_create ("grdma", module, &rcache_resources); + free (tmp); + if (NULL == module->rcache) { + /* something when horribly wrong */ + BTL_VERBOSE(("could not allocate a registration cache for this btl module")); + mca_btl_uct_finalize (&module->super); + return OPAL_ERROR; + } + + return OPAL_SUCCESS; +} + +/* + * UCT component initialization: + * (1) read interface list from kernel and compare against component parameters + * then create a BTL instance for selected interfaces + * (2) setup UCT listen socket for incoming connection attempts + * (3) register BTL parameters with the MCA + */ + +static mca_btl_base_module_t **mca_btl_uct_component_init (int *num_btl_modules, bool enable_progress_threads, + bool enable_mpi_threads) +{ + /* for this BTL to be useful the interface needs to support RDMA and certain atomic operations */ + struct mca_btl_base_module_t **base_modules; + uct_md_resource_desc_t *resources; + unsigned resource_count; + char **allowed_ifaces; + int rc; + + BTL_VERBOSE(("initializing uct btl")); + + if (NULL == mca_btl_uct_component.memory_domains || 0 == strlen (mca_btl_uct_component.memory_domains) || + 0 == strcmp (mca_btl_uct_component.memory_domains, "none")) { + BTL_VERBOSE(("no uct memory domains specified")); + return NULL; + } + + allowed_ifaces = opal_argv_split (mca_btl_uct_component.memory_domains, ','); + if (NULL == allowed_ifaces) { + return NULL; + } + + uct_query_md_resources (&resources, &resource_count); + + mca_btl_uct_component.module_count = 0; + + /* generate all suitable btl modules */ + for (unsigned i = 0 ; i < resource_count ; ++i) { + rc = mca_btl_uct_component_process_uct_md (resources + i, allowed_ifaces); + if (OPAL_SUCCESS != rc) { + break; + } + } + + opal_argv_free (allowed_ifaces); + uct_release_md_resource_list (resources); + + mca_btl_uct_modex_send (); + + /* pass module array back to caller */ + base_modules = calloc (mca_btl_uct_component.module_count, sizeof (*base_modules)); + if (NULL == base_modules) { + return NULL; + } + + memcpy (base_modules, mca_btl_uct_component.modules, mca_btl_uct_component.module_count * + sizeof (mca_btl_uct_component.modules[0])); + + *num_btl_modules = mca_btl_uct_component.module_count; + + BTL_VERBOSE(("uct btl initialization complete. 
found %d suitable memory domains", + mca_btl_uct_component.module_count)); + + return base_modules; +} + +int mca_btl_uct_tl_progress (mca_btl_uct_tl_t *tl, int starting_index) +{ + unsigned int ret = 0; + + if (NULL == tl) { + return 0; + } + + for (int j = 0 ; j < tl->max_device_contexts ; ++j) { + if (tl->uct_dev_contexts[j]) { + ret += mca_btl_uct_context_progress (tl->uct_dev_contexts[j]); + } + } + + return ret; +} + +static int mca_btl_uct_component_progress_pending (mca_btl_uct_module_t *uct_btl) +{ + mca_btl_uct_base_frag_t *frag, *next; + size_t count; + + if (0 == (count = opal_list_get_size (&uct_btl->pending_frags))) { + return 0; + } + + OPAL_THREAD_LOCK(&uct_btl->lock); + OPAL_LIST_FOREACH_SAFE(frag, next, &uct_btl->pending_frags, mca_btl_uct_base_frag_t) { + if (!frag->ready) { + continue; + } + + opal_list_remove_item (&uct_btl->pending_frags, (opal_list_item_t *) frag); + + if (OPAL_SUCCESS > mca_btl_uct_send (&uct_btl->super, frag->endpoint, &frag->base, + frag->header.data.tag)) { + opal_list_prepend (&uct_btl->pending_frags, (opal_list_item_t *) frag); + } + } + OPAL_THREAD_UNLOCK(&uct_btl->lock); + + return OPAL_SUCCESS; +} + +/** + * @brief UCT BTL progress function + * + * This function explictly progresses all workers. + */ +static int mca_btl_uct_component_progress (void) +{ + int starting_index = mca_btl_uct_get_context_index (); + unsigned ret = 0; + + for (unsigned i = 0 ; i < mca_btl_uct_component.module_count ; ++i) { + mca_btl_uct_module_t *module = mca_btl_uct_component.modules[i]; + + /* unlike ucp, uct actually tells us something useful! its almost like it was "inspired" + * by the btl progress functions.... */ + ret += mca_btl_uct_tl_progress (module->rdma_tl, starting_index); + + if (module->am_tl != module->rdma_tl) { + ret += mca_btl_uct_tl_progress (module->am_tl, starting_index); + } + + if (module->conn_tl) { + if (module->conn_tl != module->am_tl && module->conn_tl != module->rdma_tl) { + ret += mca_btl_uct_tl_progress (module->conn_tl, 0); + } + } + + if (0 != opal_list_get_size (&module->pending_frags)) { + mca_btl_uct_component_progress_pending (module); + } + } + + return (int) ret; +} + +/** UCT btl component */ +mca_btl_uct_component_t mca_btl_uct_component = { + .super = { + .btl_version = { + MCA_BTL_DEFAULT_VERSION("uct"), + .mca_open_component = mca_btl_uct_component_open, + .mca_close_component = mca_btl_uct_component_close, + .mca_register_component_params = mca_btl_uct_component_register, + }, + .btl_data = { + /* The component is not checkpoint ready */ + .param_field = MCA_BASE_METADATA_PARAM_NONE + }, + + .btl_init = mca_btl_uct_component_init, + .btl_progress = mca_btl_uct_component_progress, + } +}; diff --git a/opal/mca/btl/uct/btl_uct_device_context.h b/opal/mca/btl/uct/btl_uct_device_context.h new file mode 100644 index 00000000000..ccb4f3be71f --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_device_context.h @@ -0,0 +1,162 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#if !defined(BTL_UCT_DEVICE_CONTEXT_H) +#define BTL_UCT_DEVICE_CONTEXT_H + +#include "btl_uct.h" +#include "btl_uct_rdma.h" +#include "btl_uct_frag.h" + +/** + * @brief Create a new device context for the given transport + * + * @param[in] module btl uct module + * @param[in] tl btl uct tl pointer + * @param[in] context_id identifier for this context (0..MCA_BTL_UCT_MAX_WORKERS-1) + */ +mca_btl_uct_device_context_t *mca_btl_uct_context_create (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl, int context_id); + +/** + * @brief Destroy a device context and release all resources + * + * @param[in] context btl uct device context + * + * This call frees a device context and all assoicated resources. It is not + * valid to use the device context after this returns. + */ +void mca_btl_uct_context_destroy (mca_btl_uct_device_context_t *context); + +static inline bool mca_btl_uct_context_trylock (mca_btl_uct_device_context_t *context) +{ + return OPAL_THREAD_TRYLOCK(&context->mutex); +} + +static inline void mca_btl_uct_context_lock (mca_btl_uct_device_context_t *context) +{ + OPAL_THREAD_LOCK (&context->mutex); +} + +static inline void mca_btl_uct_context_unlock (mca_btl_uct_device_context_t *context) +{ + OPAL_THREAD_UNLOCK (&context->mutex); +} + +#define MCA_BTL_UCT_CONTEXT_SERIALIZE(context,code) \ + do { \ + mca_btl_uct_context_lock (context); \ + code; \ + mca_btl_uct_context_unlock(context); \ + } while (0); + +static inline int mca_btl_uct_get_context_index (void) +{ + static volatile uint32_t next_uct_index = 0; + int context_id; + +#if OPAL_C_HAVE__THREAD_LOCAL + if (mca_btl_uct_component.bind_threads_to_contexts) { + static _Thread_local int uct_index = -1; + + context_id = uct_index; + if (OPAL_UNLIKELY(-1 == context_id)) { + context_id = uct_index = opal_atomic_fetch_add_32 ((volatile int32_t *) &next_uct_index, 1) % + mca_btl_uct_component.num_contexts_per_module; + } + } else { +#endif + /* avoid using atomics in this. i doubt it improves performance to ensure atomicity on the next + * index in this case. 
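+         * a racy read-modify-write here just spreads callers across the
+         * available contexts; strict round-robin is not required for
+         * correctness.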
*/ + context_id = next_uct_index++ % mca_btl_uct_component.num_contexts_per_module; +#if OPAL_C_HAVE__THREAD_LOCAL + } +#endif + + return context_id; +} + +static inline mca_btl_uct_device_context_t * +mca_btl_uct_module_get_tl_context_specific (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl, int context_id) +{ + mca_btl_uct_device_context_t *context = tl->uct_dev_contexts[context_id]; + + if (OPAL_UNLIKELY(NULL == context)) { + mca_btl_uct_device_context_t *new_context; + + new_context = mca_btl_uct_context_create (module, tl, context_id); + if (!opal_atomic_compare_exchange_strong_ptr (&tl->uct_dev_contexts[context_id], &context, new_context)) { + mca_btl_uct_context_destroy (new_context); + } else { + context = new_context; + } + } + + return context; +} + +static inline mca_btl_uct_device_context_t *mca_btl_uct_module_get_rdma_context (mca_btl_uct_module_t *module) +{ + return mca_btl_uct_module_get_tl_context_specific (module, module->rdma_tl, mca_btl_uct_get_context_index ()); +} + +static inline mca_btl_uct_device_context_t *mca_btl_uct_module_get_rdma_context_specific (mca_btl_uct_module_t *module, int context_id) +{ + return mca_btl_uct_module_get_tl_context_specific (module, module->rdma_tl, context_id); +} + +static inline mca_btl_uct_device_context_t *mca_btl_uct_module_get_am_context (mca_btl_uct_module_t *module) +{ + return mca_btl_uct_module_get_tl_context_specific (module, module->am_tl, mca_btl_uct_get_context_index ()); +} + +static inline void mca_btl_uct_device_handle_completions (mca_btl_uct_device_context_t *dev_context) +{ + mca_btl_uct_uct_completion_t *comp; + + while (NULL != (comp = (mca_btl_uct_uct_completion_t *) opal_fifo_pop (&dev_context->completion_fifo))) { + int rc = UCS_OK == comp->status ? OPAL_SUCCESS : OPAL_ERROR; + + if (comp->frag) { + /* reset the count */ + comp->uct_comp.count = 1; + mca_btl_uct_frag_complete (comp->frag, rc); + + continue; + } + + /* we may be calling the callback before remote completion. this is in violation of the + * btl interface specification but should not hurt in non-ob1 use cases. if this ever + * becomes a problem we can look at possible solutions. */ + comp->cbfunc (comp->btl, comp->endpoint, comp->local_address, comp->local_handle, + comp->cbcontext, comp->cbdata, rc); + mca_btl_uct_uct_completion_release (comp); + } +} + +static inline int mca_btl_uct_context_progress (mca_btl_uct_device_context_t *context) +{ + int ret = 0; + + if (!context->uct_worker) { + return 0; + } + + if (!mca_btl_uct_context_trylock (context)) { + ret = uct_worker_progress (context->uct_worker); + mca_btl_uct_context_unlock (context); + + mca_btl_uct_device_handle_completions (context); + } + + return ret; +} + +#endif /* BTL_UCT_DEVICE_CONTEXT_H */ diff --git a/opal/mca/btl/uct/btl_uct_endpoint.c b/opal/mca/btl/uct/btl_uct_endpoint.c new file mode 100644 index 00000000000..576e01f13da --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_endpoint.c @@ -0,0 +1,396 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2018 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_uct.h" +#include "btl_uct_endpoint.h" +#include "btl_uct_device_context.h" +#include "btl_uct_am.h" +#include "opal/util/proc.h" + +static void mca_btl_uct_endpoint_construct (mca_btl_uct_endpoint_t *endpoint) +{ + memset (endpoint->uct_eps, 0, sizeof (endpoint->uct_eps)); + endpoint->conn_ep = NULL; + OBJ_CONSTRUCT(&endpoint->ep_lock, opal_recursive_mutex_t); +} + +static void mca_btl_uct_endpoint_destruct (mca_btl_uct_endpoint_t *endpoint) +{ + for (int tl_index = 0 ; tl_index < 2 ; ++tl_index) { + for (int i = 0 ; i < MCA_BTL_UCT_MAX_WORKERS ; ++i) { + if (NULL != endpoint->uct_eps[tl_index][i].uct_ep) { + uct_ep_destroy (endpoint->uct_eps[tl_index][i].uct_ep); + } + } + } + + memset (endpoint->uct_eps, 0, sizeof (endpoint->uct_eps)); + + OBJ_DESTRUCT(&endpoint->ep_lock); +} + +OBJ_CLASS_INSTANCE(mca_btl_uct_endpoint_t, opal_object_t, + mca_btl_uct_endpoint_construct, + mca_btl_uct_endpoint_destruct); + +mca_btl_base_endpoint_t *mca_btl_uct_endpoint_create (opal_proc_t *proc) +{ + mca_btl_uct_endpoint_t *endpoint = OBJ_NEW(mca_btl_uct_endpoint_t); + + if (OPAL_UNLIKELY(NULL == endpoint)) { + return NULL; + } + + endpoint->ep_proc = proc; + + return (mca_btl_base_endpoint_t *) endpoint; +} + +static unsigned char *mca_btl_uct_process_modex_tl (unsigned char *modex_data) +{ + BTL_VERBOSE(("processing modex for tl %s. size: %u", modex_data, *((uint32_t *) modex_data))); + + /* skip size and name */ + return modex_data + 4 + strlen ((char *) modex_data + 4) + 1; +} + +static void mca_btl_uct_process_modex (mca_btl_uct_module_t *uct_btl, unsigned char *modex_data, + unsigned char **rdma_tl_data, unsigned char **am_tl_data, + unsigned char **conn_tl_data) +{ + BTL_VERBOSE(("processing remote modex data")); + + if (uct_btl->rdma_tl) { + BTL_VERBOSE(("modex contains RDMA data")); + if (rdma_tl_data) { + *rdma_tl_data = mca_btl_uct_process_modex_tl (modex_data); + } + modex_data += *((uint32_t *) modex_data); + } else if (rdma_tl_data) { + *rdma_tl_data = NULL; + } + + if (uct_btl->am_tl && uct_btl->am_tl != uct_btl->rdma_tl) { + BTL_VERBOSE(("modex contains active message data")); + if (am_tl_data) { + *am_tl_data = mca_btl_uct_process_modex_tl (modex_data); + } + modex_data += *((uint32_t *) modex_data); + } else if (am_tl_data) { + *am_tl_data = NULL; + } + + if (uct_btl->conn_tl && uct_btl->conn_tl != uct_btl->rdma_tl && uct_btl->conn_tl != uct_btl->am_tl) { + BTL_VERBOSE(("modex contains connection data")); + if (conn_tl_data) { + *conn_tl_data = mca_btl_uct_process_modex_tl (modex_data); + } + modex_data += *((uint32_t *) modex_data); + } else if (conn_tl_data) { + *conn_tl_data = NULL; + } +} + +static int mca_btl_uct_endpoint_connect_iface (mca_btl_uct_module_t *uct_btl, mca_btl_uct_tl_t *tl, + mca_btl_uct_device_context_t *tl_context, + mca_btl_uct_tl_endpoint_t *tl_endpoint, uint8_t *tl_data) +{ + uct_device_addr_t *device_addr = NULL; + uct_iface_addr_t *iface_addr; + ucs_status_t ucs_status; + + /* easy case. 
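+     * the tl lets us connect straight to the peer's published interface address, so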
just connect to the interface */ + iface_addr = (uct_iface_addr_t *) tl_data; + device_addr = (uct_device_addr_t *) ((uintptr_t) iface_addr + tl->uct_iface_attr.iface_addr_len); + + BTL_VERBOSE(("connecting endpoint to interface")); + + mca_btl_uct_context_lock (tl_context); + ucs_status = uct_ep_create_connected (tl_context->uct_iface, device_addr, iface_addr, &tl_endpoint->uct_ep); + mca_btl_uct_context_unlock (tl_context); + + tl_endpoint->flags = MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY; + + return (UCS_OK == ucs_status) ? OPAL_SUCCESS : OPAL_ERROR; +} + +static void mca_btl_uct_connection_ep_construct (mca_btl_uct_connection_ep_t *ep) +{ + ep->uct_ep = NULL; +} + +static void mca_btl_uct_connection_ep_destruct (mca_btl_uct_connection_ep_t *ep) +{ + if (ep->uct_ep) { + uct_ep_destroy (ep->uct_ep); + ep->uct_ep = NULL; + } +} + +OBJ_CLASS_INSTANCE(mca_btl_uct_connection_ep_t, opal_object_t, mca_btl_uct_connection_ep_construct, + mca_btl_uct_connection_ep_destruct); + +static int mca_btl_uct_endpoint_send_conn_req (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, + mca_btl_uct_device_context_t *conn_tl_context, + int64_t type, void *request, size_t request_length) +{ + mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep; + ucs_status_t ucs_status; + + BTL_VERBOSE(("sending connection request to peer. type: %" PRId64 ", length: %" PRIsize_t, + type, request_length)); + + OBJ_RETAIN(endpoint->conn_ep); + + /* need to drop the lock to avoid hold-and-wait */ + opal_mutex_unlock (&endpoint->ep_lock); + + do { + MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { + ucs_status = uct_ep_am_short (conn_ep->uct_ep, MCA_BTL_UCT_CONNECT_RDMA, type, request, request_length); + }); + if (OPAL_LIKELY(UCS_OK == ucs_status)) { + break; + } + + if (OPAL_UNLIKELY(UCS_ERR_NO_RESOURCE != ucs_status)) { + return OPAL_ERROR; + } + + /* some TLs (UD for example) need to be progressed to get resources */ + mca_btl_uct_context_progress (conn_tl_context); + } while (1); + + /* for now we just wait for the connection request to complete before continuing */ + MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { + do { + uct_worker_progress (conn_tl_context->uct_worker); + ucs_status = uct_ep_flush (conn_ep->uct_ep, 0, NULL); + } while (UCS_INPROGRESS == ucs_status); + }); + + opal_mutex_lock (&endpoint->ep_lock); + + OBJ_RELEASE(endpoint->conn_ep); + + return OPAL_SUCCESS; +} + +static int mca_btl_uct_endpoint_connect_endpoint (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, + mca_btl_uct_tl_t *tl, mca_btl_uct_device_context_t *tl_context, + mca_btl_uct_tl_endpoint_t *tl_endpoint, uint8_t *tl_data, + uint8_t *conn_tl_data, void *ep_addr) +{ + size_t request_length = sizeof (mca_btl_uct_conn_req_t) + tl->uct_iface_attr.ep_addr_len; + mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep; + mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl; + mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0]; + mca_btl_uct_conn_req_t *request = alloca (request_length); + uct_device_addr_t *device_addr = NULL; + uct_iface_addr_t *iface_addr; + ucs_status_t ucs_status; + int rc; + + assert (NULL != conn_tl); + + BTL_VERBOSE(("connecting endpoint to remote endpoint")); + + if (NULL == conn_ep) { + BTL_VERBOSE(("creating a temporary endpoint for handling connections to %p", + opal_process_name_print (endpoint->ep_proc->proc_name))); + + iface_addr = (uct_iface_addr_t *) conn_tl_data; + device_addr = (uct_device_addr_t *) ((uintptr_t) conn_tl_data + 
conn_tl->uct_iface_attr.iface_addr_len); + + endpoint->conn_ep = conn_ep = OBJ_NEW(mca_btl_uct_connection_ep_t); + if (OPAL_UNLIKELY(NULL == conn_ep)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + /* create a temporary endpoint for setting up the rdma endpoint */ + MCA_BTL_UCT_CONTEXT_SERIALIZE(conn_tl_context, { + ucs_status = uct_ep_create_connected (conn_tl_context->uct_iface, device_addr, iface_addr, + &conn_ep->uct_ep); + }); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("could not create an endpoint for forming connection to remote peer. code = %d", + ucs_status)); + return OPAL_ERROR; + } + } else { + OBJ_RETAIN(conn_ep); + } + + /* fill in common request parameters */ + request->proc_name = OPAL_PROC_MY_NAME; + request->context_id = tl_context->context_id; + request->tl_index = tl->tl_index; + + if (NULL == tl_endpoint->uct_ep) { + BTL_VERBOSE(("allocating endpoint for peer %s and sending connection data", + opal_process_name_print (endpoint->ep_proc->proc_name))); + + MCA_BTL_UCT_CONTEXT_SERIALIZE(tl_context, { + ucs_status = uct_ep_create (tl_context->uct_iface, &tl_endpoint->uct_ep); + }); + if (UCS_OK != ucs_status) { + OBJ_RELEASE(endpoint->conn_ep); + return OPAL_ERROR; + } + + /* fill in connection request */ + ucs_status = uct_ep_get_address (tl_endpoint->uct_ep, (uct_ep_addr_t *) request->ep_addr); + if (UCS_OK != ucs_status) { + /* this is a fatal a fatal error */ + OBJ_RELEASE(endpoint->conn_ep); + uct_ep_destroy (tl_endpoint->uct_ep); + tl_endpoint->uct_ep = NULL; + return OPAL_ERROR; + } + + rc = mca_btl_uct_endpoint_send_conn_req (uct_btl, endpoint, conn_tl_context, 0, request, + request_length); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + OBJ_RELEASE(endpoint->conn_ep); + uct_ep_destroy (tl_endpoint->uct_ep); + tl_endpoint->uct_ep = NULL; + return OPAL_ERROR; + } + } + + if (ep_addr) { + BTL_VERBOSE(("using remote endpoint address to connect endpoint. ep_addr = %p", ep_addr)); + + device_addr = (uct_device_addr_t *) tl_data; + + /* NTH: there is no need to lock the device context in this case */ + ucs_status = uct_ep_connect_to_ep (tl_endpoint->uct_ep, device_addr, ep_addr); + if (UCS_OK != ucs_status) { + return OPAL_ERROR; + } + + /* let the remote side know that the connection has been established and + * wait for the message to be sent */ + rc = mca_btl_uct_endpoint_send_conn_req (uct_btl, endpoint, conn_tl_context, 1, request, + sizeof (mca_btl_uct_conn_req_t)); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + OBJ_RELEASE(endpoint->conn_ep); + uct_ep_destroy (tl_endpoint->uct_ep); + tl_endpoint->uct_ep = NULL; + return OPAL_ERROR; + } + } + + return (tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY) ? OPAL_SUCCESS : OPAL_ERR_OUT_OF_RESOURCE; +} + +int mca_btl_uct_endpoint_connect (mca_btl_uct_module_t *uct_btl, mca_btl_uct_endpoint_t *endpoint, int context_id, + void *ep_addr, int tl_index) +{ + mca_btl_uct_tl_endpoint_t *tl_endpoint = endpoint->uct_eps[tl_index] + context_id; + mca_btl_uct_device_context_t *tl_context = mca_btl_uct_module_get_rdma_context_specific (uct_btl, context_id); + mca_btl_uct_tl_t *tl = (tl_index == uct_btl->rdma_tl->tl_index) ? 
uct_btl->rdma_tl : uct_btl->am_tl; + uint8_t *rdma_tl_data = NULL, *conn_tl_data = NULL, *am_tl_data = NULL, *tl_data; + mca_btl_uct_connection_ep_t *conn_ep = NULL; + mca_btl_uct_modex_t *modex; + uint8_t *modex_data; + size_t msg_size; + int rc; + + /* only two types of endpoints at this time */ + assert (tl_index < 2); + + if (OPAL_UNLIKELY(NULL == tl)) { + return OPAL_ERR_UNREACH; + } + + BTL_VERBOSE(("checking endpoint %p with context id %d. cached uct ep: %p, ready: %d", endpoint, context_id, + tl_endpoint->uct_ep, !!(MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags))); + + opal_mutex_lock (&endpoint->ep_lock); + if (MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & tl_endpoint->flags) { + opal_mutex_unlock (&endpoint->ep_lock); + /* nothing more to do. someone else completed the connection */ + return OPAL_SUCCESS; + } + + /* dumpicate connection request. nothing to do until the endpoint data is received */ + if (NULL != tl_endpoint->uct_ep && NULL == ep_addr) { + opal_mutex_unlock (&endpoint->ep_lock); + return OPAL_ERR_OUT_OF_RESOURCE; + } + + do { + /* read the modex. this is done both to start the connection and to process endpoint data */ + OPAL_MODEX_RECV(rc, &mca_btl_uct_component.super.btl_version, + &endpoint->ep_proc->proc_name, (void **)&modex, &msg_size); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + BTL_ERROR(("error receiving modex")); + break; + } + + BTL_VERBOSE(("received modex of size %lu for proc %s. module count %d", (unsigned long) msg_size, + OPAL_NAME_PRINT(endpoint->ep_proc->proc_name), modex->module_count)); + modex_data = modex->data; + + /* look for matching transport in the modex */ + for (int i = 0 ; i < modex->module_count ; ++i) { + uint32_t modex_size = *((uint32_t *) modex_data); + + BTL_VERBOSE(("found modex for md %s, searching for %s", modex_data + 4, uct_btl->md_name)); + + modex_data += 4; + + if (0 != strcmp ((char *) modex_data, uct_btl->md_name)) { + /* modex belongs to a different module, skip it and continue */ + modex_data += modex_size - 4; + continue; + } + + modex_data += strlen ((char *) modex_data) + 1; + + mca_btl_uct_process_modex (uct_btl, modex_data, &rdma_tl_data, &am_tl_data, &conn_tl_data); + break; + } + + tl_data = (tl == uct_btl->rdma_tl) ? rdma_tl_data : am_tl_data; + + if (NULL == tl_data) { + opal_mutex_unlock (&endpoint->ep_lock); + return OPAL_ERR_UNREACH; + } + + /* connect the endpoint */ + if (!mca_btl_uct_tl_requires_connection_tl (tl)) { + rc = mca_btl_uct_endpoint_connect_iface (uct_btl, tl, tl_context, tl_endpoint, tl_data); + } else { + rc = mca_btl_uct_endpoint_connect_endpoint (uct_btl, endpoint, tl, tl_context, tl_endpoint, + tl_data, conn_tl_data, ep_addr); + } + + } while (0); + + /* to avoid a possible hold-and wait deadlock. destroy the endpoint after dropping the endpoint lock. */ + if (endpoint->conn_ep && 1 == endpoint->conn_ep->super.obj_reference_count) { + conn_ep = endpoint->conn_ep; + endpoint->conn_ep = NULL; + } + + opal_mutex_unlock (&endpoint->ep_lock); + + if (conn_ep) { + OBJ_RELEASE(conn_ep); + } + + BTL_VERBOSE(("endpoint%s ready for use", (OPAL_ERR_OUT_OF_RESOURCE != rc) ? 
"" : " not yet")); + + return rc; +} diff --git a/opal/mca/btl/uct/btl_uct_endpoint.h b/opal/mca/btl/uct/btl_uct_endpoint.h new file mode 100644 index 00000000000..d77288d0731 --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_endpoint.h @@ -0,0 +1,94 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2017-2018 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_BTL_UCT_ENDPOINT_H +#define MCA_BTL_UCT_ENDPOINT_H + +#include "opal/class/opal_list.h" +#include "opal/mca/event/event.h" +#include "btl_uct.h" + +BEGIN_C_DECLS + +mca_btl_base_endpoint_t *mca_btl_uct_endpoint_create (opal_proc_t *proc); +int mca_btl_uct_endpoint_connect (mca_btl_uct_module_t *module, mca_btl_uct_endpoint_t *endpoint, int ep_index, void *ep_addr, int tl_index); + +static int mca_btl_uct_endpoint_test_am (mca_btl_uct_module_t *module, mca_btl_uct_endpoint_t *endpoint, + mca_btl_uct_device_context_t *context, uct_ep_h *ep_handle) +{ + int tl_index = module->am_tl->tl_index; + int ep_index = context->context_id; + + if (OPAL_LIKELY(MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & endpoint->uct_eps[tl_index][ep_index].flags)) { + *ep_handle = endpoint->uct_eps[tl_index][ep_index].uct_ep; + return OPAL_SUCCESS; + } + + return OPAL_ERR_NOT_AVAILABLE; +} + +/** + * @brief Check if the endpoint is connected and start the connection if not + * + * @param[in] module UCT BTL module + * @param[in] endpoint UCT BTL endpoint + * @param[in] context UCT BTL device context + * @param[out] ep_handle UCT endpoint handle + * @param[in] tl_index UCT TL index (0 or 1) + * + * @returns OPAL_SUCCESS if the endpoint is connected and ready to us + * @returns OPAL_ERR_RESOURCE_BUSY if the connection is underway + * @returns OPAL_ERROR otherwise + */ +static inline int mca_btl_uct_endpoint_check (mca_btl_uct_module_t *module, mca_btl_uct_endpoint_t *endpoint, + mca_btl_uct_device_context_t *context, uct_ep_h *ep_handle, + const int tl_index) +{ + int ep_index = context->context_id; + int rc; + + if (OPAL_LIKELY(MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & endpoint->uct_eps[tl_index][ep_index].flags)) { + *ep_handle = endpoint->uct_eps[tl_index][ep_index].uct_ep; + return OPAL_SUCCESS; + } + + rc = mca_btl_uct_endpoint_connect (module, endpoint, ep_index, NULL, tl_index); + *ep_handle = endpoint->uct_eps[tl_index][ep_index].uct_ep; + BTL_VERBOSE(("mca_btl_uct_endpoint_connect returned %d", rc)); + return rc; +} + +static inline int mca_btl_uct_endpoint_check_rdma (mca_btl_uct_module_t *module, mca_btl_uct_endpoint_t *endpoint, + mca_btl_uct_device_context_t *context, uct_ep_h *ep_handle) +{ + assert (NULL != module->rdma_tl); + return mca_btl_uct_endpoint_check (module, endpoint, context, ep_handle, module->rdma_tl->tl_index); +} + +static inline int mca_btl_uct_endpoint_check_am (mca_btl_uct_module_t *module, mca_btl_uct_endpoint_t *endpoint, + mca_btl_uct_device_context_t *context, uct_ep_h *ep_handle) +{ + assert (NULL != 
module->am_tl); + return mca_btl_uct_endpoint_check (module, endpoint, context, ep_handle, module->am_tl->tl_index); +} + +END_C_DECLS +#endif diff --git a/opal/mca/btl/uct/btl_uct_frag.c b/opal/mca/btl/uct/btl_uct_frag.c new file mode 100644 index 00000000000..3e5622cac45 --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_frag.c @@ -0,0 +1,55 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2018 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_uct_frag.h" + +static void mca_btl_uct_frag_completion (uct_completion_t *uct_comp, ucs_status_t status) +{ + mca_btl_uct_uct_completion_t *comp = (mca_btl_uct_uct_completion_t *) ((uintptr_t) uct_comp - offsetof (mca_btl_uct_uct_completion_t, uct_comp)); + + BTL_VERBOSE(("frag operation complete. frag = %p. status = %d", (void *) comp->frag, status)); + + comp->status = status; + opal_fifo_push (&comp->dev_context->completion_fifo, &comp->super.super); +} + +static void mca_btl_uct_base_frag_constructor (mca_btl_uct_base_frag_t *frag) +{ + mca_btl_uct_reg_t *reg = (mca_btl_uct_reg_t *) frag->base.super.registration; + + /* zero everything out */ + memset ((char *) frag + sizeof (frag->base), 0, sizeof (*frag) - sizeof (frag->base)); + + OBJ_CONSTRUCT(&frag->comp, mca_btl_uct_uct_completion_t); + + frag->base.des_segments = frag->segments; + frag->base.des_segment_count = 1; + + frag->comp.uct_comp.func = mca_btl_uct_frag_completion; + frag->comp.uct_comp.count = 1; + frag->comp.frag = frag; + + frag->segments[0].seg_addr.pval = frag->base.super.ptr; + frag->uct_iov.buffer = frag->base.super.ptr; + frag->uct_iov.stride = 0; + frag->uct_iov.count = 1; + if (reg) { + frag->uct_iov.memh = reg->uct_memh; + } +} + +static void mca_btl_uct_base_frag_destructor (mca_btl_uct_base_frag_t *frag) +{ + OBJ_DESTRUCT(&frag->comp); +} + +OBJ_CLASS_INSTANCE(mca_btl_uct_base_frag_t, mca_btl_base_descriptor_t, + mca_btl_uct_base_frag_constructor, mca_btl_uct_base_frag_destructor); diff --git a/opal/mca/btl/uct/btl_uct_frag.h b/opal/mca/btl/uct/btl_uct_frag.h new file mode 100644 index 00000000000..8aa8789d0e3 --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_frag.h @@ -0,0 +1,63 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2018 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#if !defined(MCA_BTL_UCT_FRAG_H) +#define MCA_BTL_UCT_FRAG_H + +#include "btl_uct.h" + +static inline mca_btl_uct_base_frag_t *mca_btl_uct_frag_alloc (mca_btl_uct_module_t *uct_btl, opal_free_list_t *fl, + mca_btl_base_endpoint_t *endpoint) +{ + mca_btl_uct_base_frag_t *frag = (mca_btl_uct_base_frag_t *) opal_free_list_get (fl); + if (OPAL_LIKELY(NULL != frag)) { + frag->free_list = fl; + frag->endpoint = endpoint; + frag->btl = uct_btl; + } + + return frag; +} + +static inline void mca_btl_uct_frag_return (mca_btl_uct_base_frag_t *frag) +{ + opal_free_list_return (frag->free_list, &frag->base.super); +} + +static inline void mca_btl_uct_frag_complete (mca_btl_uct_base_frag_t *frag, int rc) { + mca_btl_uct_module_t *uct_btl = frag->btl; + + /* call callback if specified */ + if (frag->base.des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK) { + frag->base.des_cbfunc(&uct_btl->super, frag->endpoint, &frag->base, rc); + } + + if (OPAL_LIKELY(frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) { + mca_btl_uct_frag_return (frag); + } +} + +static inline mca_btl_uct_base_frag_t *mca_btl_uct_frag_alloc_short (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint) +{ + return mca_btl_uct_frag_alloc (uct_btl, &uct_btl->short_frags, endpoint); +} + +static inline mca_btl_uct_base_frag_t *mca_btl_uct_frag_alloc_eager (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint) +{ + return mca_btl_uct_frag_alloc (uct_btl, &uct_btl->eager_frags, endpoint); +} + +static inline mca_btl_uct_base_frag_t *mca_btl_uct_frag_alloc_max (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint) +{ + return mca_btl_uct_frag_alloc (uct_btl, &uct_btl->max_frags, endpoint); +} + +#endif /* !defined(MCA_BTL_UCT_FRAG_H) */ diff --git a/opal/mca/btl/uct/btl_uct_module.c b/opal/mca/btl/uct/btl_uct_module.c new file mode 100644 index 00000000000..245c3eddd12 --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_module.c @@ -0,0 +1,386 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2013 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" +#include +#include "opal/class/opal_bitmap.h" +#include "opal/mca/btl/btl.h" +#include "opal/datatype/opal_convertor.h" +#include "opal/mca/mpool/base/base.h" +#include "opal/mca/mpool/mpool.h" + +#include "btl_uct.h" +#include "btl_uct_endpoint.h" +#include "btl_uct_am.h" + +#include "opal/memoryhooks/memory.h" +#include "opal/mca/memory/base/base.h" +#include + +static void mca_btl_uct_mem_release_cb(void *buf, size_t length, void *cbdata, bool from_alloc) +{ + ucm_vm_munmap(buf, length); +} + +struct mca_btl_base_endpoint_t *mca_btl_uct_get_ep (struct mca_btl_base_module_t *module, opal_proc_t *proc) +{ + mca_btl_uct_module_t *uct_module = (mca_btl_uct_module_t *) module; + mca_btl_base_endpoint_t *ep; + int rc; + + opal_mutex_lock (&uct_module->endpoint_lock); + + do { + rc = opal_hash_table_get_value_uint64 (&uct_module->id_to_endpoint, (intptr_t) proc, (void **) &ep); + if (OPAL_SUCCESS == rc) { + BTL_VERBOSE(("returning existing endpoint for proc %s", OPAL_NAME_PRINT(proc->proc_name))); + break; + } + + /* Create and Init endpoints */ + ep = mca_btl_uct_endpoint_create (proc); + if (OPAL_UNLIKELY(NULL == ep)) { + BTL_ERROR(("btl/uct error initializing endpoint")); + break; + } + + BTL_VERBOSE(("endpoint initialized. new endpoint: %p", ep)); + + /* add this endpoint to the connection lookup table */ + (void) opal_hash_table_set_value_uint64 (&uct_module->id_to_endpoint, (intptr_t) proc, ep); + } while (0); + + opal_mutex_unlock (&uct_module->endpoint_lock); + + return ep; +} + +static int mca_btl_uct_add_procs (mca_btl_base_module_t *btl, + size_t nprocs, opal_proc_t **opal_procs, + mca_btl_base_endpoint_t **peers, + opal_bitmap_t *reachable) +{ + mca_btl_uct_module_t *uct_module = (mca_btl_uct_module_t *) btl; + int rc; + + if (false == uct_module->initialized) { + mca_btl_uct_tl_t *am_tl = uct_module->am_tl; + mca_btl_uct_tl_t *rdma_tl = uct_module->rdma_tl; + + /* NTH: might want to vary this size based off the universe size (if + * one exists). the table is only used for connection lookup and + * endpoint removal. */ + rc = opal_hash_table_init (&uct_module->id_to_endpoint, 512); + if (OPAL_SUCCESS != rc) { + BTL_ERROR(("error initializing the endpoint hash. 
rc = %d", rc)); + return rc; + } + + if (am_tl) { + rc = opal_free_list_init (&uct_module->short_frags, sizeof (mca_btl_uct_base_frag_t), + opal_cache_line_size, OBJ_CLASS(mca_btl_uct_base_frag_t), + am_tl->uct_iface_attr.cap.am.max_short, opal_cache_line_size, + 0, 1024, 64, NULL, 0, NULL, NULL, NULL); + + rc = opal_free_list_init (&uct_module->eager_frags, sizeof (mca_btl_uct_base_frag_t), + opal_cache_line_size, OBJ_CLASS(mca_btl_uct_base_frag_t), + btl->btl_eager_limit, opal_cache_line_size, + 0, 1024, 64, NULL, 0, uct_module->rcache, NULL, NULL); + + rc = opal_free_list_init (&uct_module->max_frags, sizeof (mca_btl_uct_base_frag_t), + opal_cache_line_size, OBJ_CLASS(mca_btl_uct_base_frag_t), + btl->btl_max_send_size, opal_cache_line_size, 0, 128, 8, + NULL, 0, uct_module->rcache, NULL, NULL); + } + + if (rdma_tl) { + rc = opal_free_list_init (&uct_module->rdma_completions, sizeof (mca_btl_uct_uct_completion_t), + opal_cache_line_size, OBJ_CLASS(mca_btl_uct_uct_completion_t), + 0, opal_cache_line_size, 0, 4096, 128, NULL, 0, NULL, NULL, + NULL); + } + + if (mca_btl_uct_component.disable_ucx_memory_hooks) { + ucm_set_external_event(UCM_EVENT_VM_UNMAPPED); + opal_mem_hooks_register_release(mca_btl_uct_mem_release_cb, NULL); + } + + uct_module->initialized = true; + } + + for (size_t i = 0 ; i < nprocs ; ++i) { + /* all endpoints are reachable for uct */ + peers[i] = mca_btl_uct_get_ep (btl, opal_procs[i]); + if (OPAL_UNLIKELY(NULL == peers[i])) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + opal_bitmap_set_bit(reachable, i); + } + + return OPAL_SUCCESS; +} + +static int mca_btl_uct_del_procs (mca_btl_base_module_t *btl, size_t nprocs, + opal_proc_t **procs, mca_btl_base_endpoint_t **peers) +{ + mca_btl_uct_module_t *uct_module = (mca_btl_uct_module_t *) btl; + mca_btl_base_endpoint_t *ep; + int rc; + + for (size_t i = 0 ; i < nprocs ; ++i) { + if (NULL == procs[i]) { + continue; + } + + rc = opal_hash_table_get_value_uint64 (&uct_module->id_to_endpoint, (intptr_t) procs[i], (void **) &ep); + if (OPAL_SUCCESS != rc) { + continue; + } + + (void) opal_hash_table_remove_value_uint64 (&uct_module->id_to_endpoint, (intptr_t) procs[i]); + OBJ_RELEASE(ep); + } + + return OPAL_SUCCESS; +} + + +/** + * @brief Register a memory region for put/get/atomic operations. + * + * @param btl (IN) BTL module + * @param endpoint(IN) BTL addressing information (or NULL for all endpoints) + * @param base (IN) Pointer to start of region + * @param size (IN) Size of region + * @param flags (IN) Flags indicating what operation will be performed. Valid + * values are MCA_BTL_DES_FLAGS_PUT, MCA_BTL_DES_FLAGS_GET, + * and MCA_BTL_DES_FLAGS_ATOMIC + * + * @returns a memory registration handle valid for both local and remote operations + * @returns NULL if the region could not be registered + * + * This function registers the specified region with the hardware for use with + * the btl_put, btl_get, btl_atomic_cas, btl_atomic_op, and btl_atomic_fop + * functions. Care should be taken to not hold an excessive number of registrations + * as they may use limited system/NIC resources. 
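+ *
+ * Registration goes through the module's rcache; when any remote access flag is
+ * requested the handle also carries a packed rkey that peers unpack in order to
+ * target the region with put/get/atomic operations.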
+ */ +static struct mca_btl_base_registration_handle_t * +mca_btl_uct_register_mem (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *base, + size_t size, uint32_t flags) +{ + mca_btl_uct_module_t *uct_module = (mca_btl_uct_module_t *) btl; + mca_btl_uct_reg_t *reg; + int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY; + int rc; + + rc = uct_module->rcache->rcache_register (uct_module->rcache, base, size, 0, access_flags, + (mca_rcache_base_registration_t **) ®); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + return NULL; + } + + return ®->handle; +} + +/** + * @brief Deregister a memory region + * + * @param btl (IN) BTL module region was registered with + * @param handle (IN) BTL registration handle to deregister + * + * This function deregisters the memory region associated with the specified handle. Care + * should be taken to not perform any RDMA or atomic operation on this memory region + * after it is deregistered. It is erroneous to specify a memory handle associated with + * a remote node. + */ +static int mca_btl_uct_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle) +{ + mca_btl_uct_module_t *uct_module = (mca_btl_uct_module_t *) btl; + mca_btl_uct_reg_t *reg = + (mca_btl_uct_reg_t *)((intptr_t) handle - offsetof (mca_btl_uct_reg_t, handle)); + + (void) uct_module->rcache->rcache_deregister (uct_module->rcache, ®->base); + + return OPAL_SUCCESS; +} + +int mca_btl_uct_reg_mem (void *reg_data, void *base, size_t size, mca_rcache_base_registration_t *reg) +{ + mca_btl_uct_module_t *uct_module = (mca_btl_uct_module_t *) reg_data; + mca_btl_uct_reg_t *uct_reg = (mca_btl_uct_reg_t *) reg; + ucs_status_t ucs_status; + int uct_flags = 0; + + BTL_VERBOSE(("attempting to register range {%p,%p} with uct", base, (char *) base + size)); + + if (MCA_BTL_REG_FLAG_REMOTE_READ & reg->access_flags) { + uct_flags |= UCT_MD_MEM_ACCESS_REMOTE_GET; + } + if (MCA_BTL_REG_FLAG_REMOTE_WRITE & reg->access_flags) { + uct_flags |= UCT_MD_MEM_ACCESS_REMOTE_PUT; + } + if (MCA_BTL_REG_FLAG_REMOTE_ATOMIC & reg->access_flags) { + uct_flags |= UCT_MD_MEM_ACCESS_REMOTE_ATOMIC; + } + + /* UCT barfs if there are no access flags */ + if (0 == uct_flags) { + uct_flags = UCT_MD_MEM_ACCESS_ALL; + } + + ucs_status = uct_md_mem_reg (uct_module->md->uct_md, base, size, uct_flags, &uct_reg->uct_memh); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("Error registering memory with UCT. code: %d", ucs_status)); + return OPAL_ERR_OUT_OF_RESOURCE; + } + + if (reg->access_flags & (MCA_BTL_REG_FLAG_REMOTE_READ | MCA_BTL_REG_FLAG_REMOTE_WRITE | MCA_BTL_REG_FLAG_REMOTE_ATOMIC)) { + /* requested registration may be used by a remote process so go ahead and pack + * the registration handle */ + ucs_status = uct_md_mkey_pack (uct_module->md->uct_md, uct_reg->uct_memh, uct_reg->handle.packed_handle); + if (OPAL_UNLIKELY(UCS_OK != ucs_status)) { + BTL_VERBOSE(("Could not pack remote key. code: %d", ucs_status)); + uct_md_mem_dereg (uct_module->md->uct_md, uct_reg->uct_memh); + return OPAL_ERR_OUT_OF_RESOURCE; + } + } + + return OPAL_SUCCESS; +} + +int mca_btl_uct_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg) +{ + mca_btl_uct_module_t *uct_module = (mca_btl_uct_module_t *) reg_data; + mca_btl_uct_reg_t *uct_reg = (mca_btl_uct_reg_t *) reg; + + uct_md_mem_dereg (uct_module->md->uct_md, uct_reg->uct_memh); + + return OPAL_SUCCESS; +} + + +/* + * Cleanup/release module resources. 
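+ * This tears down any remaining endpoints, the free lists, the rcache, the tl
+ * references, and the async context before freeing the module itself.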
+ */ + +int mca_btl_uct_finalize (mca_btl_base_module_t* btl) +{ + mca_btl_uct_module_t *uct_module = (mca_btl_uct_module_t *) btl; + mca_btl_uct_endpoint_t *endpoint; + uint64_t key; + + /* clean up any leftover endpoints */ + OPAL_HASH_TABLE_FOREACH(key, uint64, endpoint, &uct_module->id_to_endpoint) { + OBJ_RELEASE(endpoint); + } + OBJ_DESTRUCT(&uct_module->id_to_endpoint); + OBJ_DESTRUCT(&uct_module->short_frags); + OBJ_DESTRUCT(&uct_module->eager_frags); + OBJ_DESTRUCT(&uct_module->max_frags); + OBJ_DESTRUCT(&uct_module->rdma_completions); + OBJ_DESTRUCT(&uct_module->pending_frags); + OBJ_DESTRUCT(&uct_module->lock); + + if (uct_module->rcache) { + mca_rcache_base_module_destroy (uct_module->rcache); + } + + if (NULL != uct_module->am_tl) { + OBJ_RELEASE(uct_module->am_tl); + } + + if (NULL != uct_module->conn_tl) { + OBJ_RELEASE(uct_module->conn_tl); + } + + if (NULL != uct_module->rdma_tl) { + OBJ_RELEASE(uct_module->rdma_tl); + } + + ucs_async_context_destroy (uct_module->ucs_async); + + OBJ_DESTRUCT(&uct_module->endpoint_lock); + + free (uct_module->md_name); + free (uct_module); + + return OPAL_SUCCESS; +} + +mca_btl_uct_module_t mca_btl_uct_module_template = { + .super = { + /* initialize functions. this btl only support RDMA and atomics + * for now so it does not provide prepare_src, alloc, free, or send */ + .btl_component = &mca_btl_uct_component.super, + .btl_add_procs = mca_btl_uct_add_procs, + .btl_del_procs = mca_btl_uct_del_procs, + .btl_finalize = mca_btl_uct_finalize, + .btl_put = mca_btl_uct_put, + .btl_get = mca_btl_uct_get, + .btl_register_mem = mca_btl_uct_register_mem, + .btl_deregister_mem = mca_btl_uct_deregister_mem, + .btl_atomic_op = mca_btl_uct_aop, + .btl_atomic_fop = mca_btl_uct_afop, + .btl_atomic_cswap = mca_btl_uct_acswap, + .btl_flush = mca_btl_uct_flush, + + .btl_sendi = mca_btl_uct_sendi, + .btl_send = mca_btl_uct_send, + .btl_alloc = mca_btl_uct_alloc, + .btl_free = mca_btl_uct_free, + + /* set the default flags for this btl. uct provides us with rdma and both + * fetching and non-fetching atomics (though limited to add and cswap) */ + .btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS, + .btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_CSWAP | + MCA_BTL_ATOMIC_SUPPORTS_SWAP | MCA_BTL_ATOMIC_SUPPORTS_32BIT, + + /* set the default limits on put and get */ + .btl_put_limit = 1 << 23, + .btl_put_alignment = 0, + .btl_get_limit = 1 << 23, + .btl_get_alignment = 0, + + .btl_rndv_eager_limit = 8192, + .btl_rdma_pipeline_frag_size = 4 * 1024 * 1024, + .btl_rdma_pipeline_send_length = 8192, + .btl_eager_limit = 8192, + .btl_max_send_size = 65536, + } +}; + +OBJ_CLASS_INSTANCE(mca_btl_uct_reg_t, opal_free_list_item_t, NULL, NULL); + +void mca_btl_uct_md_construct (mca_btl_uct_md_t *md) +{ + md->uct_md = NULL; +} + +void mca_btl_uct_md_destruct (mca_btl_uct_md_t *md) +{ + if (md->uct_md) { + uct_md_close (md->uct_md); + md->uct_md = NULL; + } +} + +OBJ_CLASS_INSTANCE(mca_btl_uct_md_t, opal_object_t, mca_btl_uct_md_construct, mca_btl_uct_md_destruct); diff --git a/opal/mca/btl/uct/btl_uct_rdma.c b/opal/mca/btl/uct/btl_uct_rdma.c new file mode 100644 index 00000000000..6bbbe9f57a4 --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_rdma.c @@ -0,0 +1,287 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_uct_device_context.h" + +void mca_btl_uct_uct_completion (uct_completion_t *uct_comp, ucs_status_t status) +{ + mca_btl_uct_uct_completion_t *comp = (mca_btl_uct_uct_completion_t *) ((uintptr_t) uct_comp - offsetof (mca_btl_uct_uct_completion_t, uct_comp)); + + BTL_VERBOSE(("network operation complete. status = %d", status)); + + comp->status = status; + opal_fifo_push (&comp->dev_context->completion_fifo, &comp->super.super); +} + + +static void mca_btl_uct_uct_completion_construct (mca_btl_uct_uct_completion_t *comp) +{ + comp->frag = NULL; + comp->uct_comp.func = mca_btl_uct_uct_completion; +} + +OBJ_CLASS_INSTANCE(mca_btl_uct_uct_completion_t, opal_free_list_item_t, mca_btl_uct_uct_completion_construct, NULL); + +mca_btl_uct_uct_completion_t * +mca_btl_uct_uct_completion_alloc (mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, + void *local_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_uct_device_context_t *dev_context, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata) +{ + mca_btl_uct_uct_completion_t *comp = (mca_btl_uct_uct_completion_t *) opal_free_list_get (&uct_btl->rdma_completions); + if (OPAL_LIKELY(NULL != comp)) { + comp->uct_comp.count = 1; + comp->btl = &uct_btl->super; + comp->endpoint = endpoint; + comp->local_address = local_address; + comp->local_handle = local_handle; + comp->cbfunc = cbfunc; + comp->cbcontext = cbcontext; + comp->cbdata = cbdata; + comp->dev_context = dev_context; + } + + return comp; +} + +void mca_btl_uct_uct_completion_release (mca_btl_uct_uct_completion_t *comp) +{ + if (comp) { + mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) comp->btl; + opal_free_list_return (&uct_btl->rdma_completions, &comp->super); + } +} + +static void mca_btl_uct_get_unpack (void *arg, const void *data, size_t length) +{ + memcpy (arg, data, length); +} + +int mca_btl_uct_get (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; + mca_btl_uct_device_context_t *context = mca_btl_uct_module_get_rdma_context (uct_btl); + mca_btl_uct_uct_completion_t *comp = NULL; + ucs_status_t ucs_status; + uct_rkey_bundle_t rkey; + uct_ep_h ep_handle; + int rc; + + BTL_VERBOSE(("performing get operation. 
local address: %p, length: %lu", local_address, (unsigned long) size)); + + if (cbfunc) { + comp = mca_btl_uct_uct_completion_alloc (uct_btl, endpoint, local_address, local_handle, context, + cbfunc, cbcontext, cbdata); + if (OPAL_UNLIKELY(NULL == comp)) { + BTL_VERBOSE(("culd not allocate completion structure")); + return OPAL_ERR_OUT_OF_RESOURCE; + } + } + + rc = mca_btl_uct_get_rkey (uct_btl, context, endpoint, remote_handle, &rkey, &ep_handle); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + BTL_VERBOSE(("mca_btl_uct_get_rkey returned %d", rc)); + mca_btl_uct_uct_completion_release (comp); + return rc; + } + + mca_btl_uct_context_lock (context); + + if (size <= uct_btl->rdma_tl->uct_iface_attr.cap.get.max_bcopy) { + ucs_status = uct_ep_get_bcopy (ep_handle, mca_btl_uct_get_unpack, local_address, size, remote_address, + rkey.rkey, &comp->uct_comp); + } else { + uct_iov_t iov = {.buffer = local_address, .length = size, .stride = 0, .count = 1, + .memh = MCA_BTL_UCT_REG_REMOTE_TO_LOCAL(local_handle)->uct_memh}; + + ucs_status = uct_ep_get_zcopy (ep_handle, &iov, 1, remote_address, rkey.rkey, &comp->uct_comp); + } + + /* go ahead and progress the worker while we have the lock */ + (void) uct_worker_progress (context->uct_worker); + + mca_btl_uct_context_unlock (context); + + mca_btl_uct_device_handle_completions (context); + + if (UCS_OK == ucs_status && cbfunc) { + /* if UCS_OK is returned the callback will never fire so we have to make the callback + * ourselves */ + cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); + mca_btl_uct_uct_completion_release (comp); + } else if (UCS_INPROGRESS == ucs_status) { + ucs_status = UCS_OK; + } + + BTL_VERBOSE(("get issued. status = %d", ucs_status)); + + uct_rkey_release (&rkey); + + return OPAL_LIKELY(UCS_OK == ucs_status) ? OPAL_SUCCESS : OPAL_ERR_RESOURCE_BUSY; +} + +struct mca_btl_uct_put_pack_args_t { + void *local_address; + size_t size; +}; + +typedef struct mca_btl_uct_put_pack_args_t mca_btl_uct_put_pack_args_t; + +static size_t mca_btl_uct_put_pack (void *dest, void *arg) +{ + mca_btl_uct_put_pack_args_t *args = (mca_btl_uct_put_pack_args_t *) arg; + + memcpy (dest, args->local_address, args->size); + return args->size; +} + +int mca_btl_uct_put (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, + int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; + mca_btl_uct_device_context_t *context = mca_btl_uct_module_get_rdma_context (uct_btl); + mca_btl_uct_uct_completion_t *comp = NULL; + ucs_status_t ucs_status; + uct_rkey_bundle_t rkey; + uct_ep_h ep_handle; + int rc; + + BTL_VERBOSE(("performing put operation. 
local address: %p, length: %lu", local_address, (unsigned long) size)); + + if (size > uct_btl->super.btl_put_local_registration_threshold && cbfunc) { + comp = mca_btl_uct_uct_completion_alloc (uct_btl, endpoint, local_address, local_handle, context, + cbfunc, cbcontext, cbdata); + if (OPAL_UNLIKELY(NULL == comp)) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + } + + rc = mca_btl_uct_get_rkey (uct_btl, context, endpoint, remote_handle, &rkey, &ep_handle); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + mca_btl_uct_uct_completion_release (comp); + return rc; + } + + mca_btl_uct_context_lock (context); + + do { + if (size <= uct_btl->rdma_tl->uct_iface_attr.cap.put.max_short) { + ucs_status = uct_ep_put_short (ep_handle, local_address, size, remote_address, rkey.rkey); + } else if (size <= uct_btl->super.btl_put_local_registration_threshold) { + ssize_t tmp = uct_ep_put_bcopy (ep_handle, mca_btl_uct_put_pack, + &(mca_btl_uct_put_pack_args_t) {.local_address = local_address, .size = size}, + remote_address, rkey.rkey); + ucs_status = (tmp == (ssize_t) size) ? UCS_OK : UCS_ERR_NO_RESOURCE; + } else { + uct_iov_t iov = {.buffer = local_address, .length = size, .stride = 0, .count = 1, + .memh = MCA_BTL_UCT_REG_REMOTE_TO_LOCAL(local_handle)->uct_memh}; + + ucs_status = uct_ep_put_zcopy (ep_handle, &iov, 1, remote_address, rkey.rkey, &comp->uct_comp); + } + + /* go ahead and progress the worker while we have the lock */ + if (UCS_ERR_NO_RESOURCE != ucs_status) { + (void) uct_worker_progress (context->uct_worker); + break; + } + + /* wait for something to complete */ + while (!uct_worker_progress (context->uct_worker)); + } while (1); + + mca_btl_uct_context_unlock (context); + + mca_btl_uct_device_handle_completions (context); + + if (UCS_OK == ucs_status && cbfunc) { + /* if UCS_OK is returned the callback will never fire so we have to make the callback + * ourselves. this callback is possibly being made before the data is visible to the + * remote process. */ + cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); + mca_btl_uct_uct_completion_release (comp); + } else if (UCS_INPROGRESS == ucs_status) { + ucs_status = UCS_OK; + } + + uct_rkey_release (&rkey); + + return OPAL_LIKELY(UCS_OK == ucs_status) ? OPAL_SUCCESS : OPAL_ERR_RESOURCE_BUSY; +} + +int mca_btl_uct_flush (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint) +{ + mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; + const int tl_index = uct_btl->rdma_tl->tl_index; + const int context_count = mca_btl_uct_component.num_contexts_per_module; + ucs_status_t ucs_status; + + BTL_VERBOSE(("mca_btl_uct_flush starting")); + + for (int i = 0 ; i < context_count ; ++i) { + mca_btl_uct_device_context_t *context = uct_btl->rdma_tl->uct_dev_contexts[i]; + + if (NULL == context) { + continue; + } + + mca_btl_uct_context_lock (context); + /* this loop is here because at least some of the TLs do no support a + * completion callback. its a real PIA but has to be done for now. 
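+         * uct_ep_flush/uct_iface_flush return UCS_INPROGRESS while operations are still
+         * outstanding, so we spin on progress until the flush stops reporting it.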
*/ + do { + uct_worker_progress (context->uct_worker); + + if (NULL != endpoint && endpoint->uct_eps[tl_index][context->context_id].uct_ep) { + ucs_status = uct_ep_flush (endpoint->uct_eps[tl_index][context->context_id].uct_ep, 0, NULL); + } else { + ucs_status = uct_iface_flush (context->uct_iface, 0, NULL); + } + } while (UCS_INPROGRESS == ucs_status); + + mca_btl_uct_context_unlock (context); + mca_btl_uct_device_handle_completions (context); + } + + return OPAL_SUCCESS; +} + +int mca_btl_uct_flush_thread (mca_btl_base_module_t *btl) +{ + mca_btl_uct_module_t *uct_btl = (mca_btl_uct_module_t *) btl; + const int context_id = mca_btl_uct_get_context_index (); + mca_btl_uct_device_context_t *context = uct_btl->rdma_tl->uct_dev_contexts[context_id]; + ucs_status_t ucs_status; + + BTL_VERBOSE(("mca_btl_uct_flush_thread starting")); + + if (NULL == context) { + return OPAL_SUCCESS; + } + + mca_btl_uct_context_lock (context); + + /* this loop is here because at least some of the TLs do no support a + * completion callback. its a real PIA but has to be done for now. */ + do { + uct_worker_progress (context->uct_worker); + ucs_status = uct_iface_flush (context->uct_iface, 0, NULL); + } while (UCS_INPROGRESS == ucs_status); + + mca_btl_uct_context_unlock (context); + + mca_btl_uct_device_handle_completions (context); + + return OPAL_SUCCESS; +} diff --git a/opal/mca/btl/uct/btl_uct_rdma.h b/opal/mca/btl/uct/btl_uct_rdma.h new file mode 100644 index 00000000000..e9b0d6b19dc --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_rdma.h @@ -0,0 +1,62 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#if !defined(BTL_UCT_RDMA_H) +#define BTL_UCT_RDMA_H + +#include "btl_uct.h" +#include "btl_uct_endpoint.h" +#include "btl_uct_frag.h" + +/** + * @brief allocate a callback structure + */ +mca_btl_uct_uct_completion_t *mca_btl_uct_uct_completion_alloc (mca_btl_uct_module_t *btl, mca_btl_base_endpoint_t *endpoint, + void *local_address, mca_btl_base_registration_handle_t *local_handle, + mca_btl_uct_device_context_t *dev_context, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata); +/** + * @brief release a callback structure + */ +void mca_btl_uct_uct_completion_release (mca_btl_uct_uct_completion_t *comp); + +void mca_btl_uct_uct_completion (uct_completion_t *uct_comp, ucs_status_t status); + +/** + * @brief unpack the registration key and ensure the endpoint is connected + * + * @param[in] module uct btl module + * @param[in] context device context to use + * @param[in] endpoint btl endpoint + * @param[in] remote_handle buffer containing remote handle data + * @param[inout] rkey uct registration key bundle + * @param[out] ep_handle uct endpoint handle + */ +static inline int mca_btl_uct_get_rkey (mca_btl_uct_module_t *module, + mca_btl_uct_device_context_t *context, + mca_btl_base_endpoint_t *endpoint, + mca_btl_base_registration_handle_t *remote_handle, + uct_rkey_bundle_t *rkey, + uct_ep_h *ep_handle) +{ + ucs_status_t ucs_status; + int rc; + + rc = mca_btl_uct_endpoint_check_rdma (module, endpoint, context, ep_handle); + if (OPAL_SUCCESS != rc) { + return rc; + } + + ucs_status = uct_rkey_unpack ((void *) remote_handle, rkey); + return (UCS_OK == ucs_status) ? 
OPAL_SUCCESS : OPAL_ERROR; +} + +#endif /* !defined(BTL_UCT_RDMA_H) */ diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c new file mode 100644 index 00000000000..c4014744fdf --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_tl.c @@ -0,0 +1,574 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2018 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "btl_uct_device_context.h" +#include "btl_uct_am.h" +#include "opal/util/bit_ops.h" + +/** + * @brief Convert UCT capabilities to BTL flags + */ +static uint64_t mca_btl_uct_cap_to_btl_flag[][2] = { + {UCT_IFACE_FLAG_AM_ZCOPY, MCA_BTL_FLAGS_SEND}, + {UCT_IFACE_FLAG_PUT_ZCOPY, MCA_BTL_FLAGS_PUT}, + {UCT_IFACE_FLAG_GET_ZCOPY, MCA_BTL_FLAGS_GET}, + {0,0}, +}; + +/** + * @brief Convert UCT capability flags to BTL flags + * + * @param[in] cap_flags UCT capability flags + * + * @returns equivalent BTL flags + */ +static int32_t mca_btl_uct_module_flags (uint64_t cap_flags) +{ + uint32_t flags = 0; + + for (int i = 0 ; mca_btl_uct_cap_to_btl_flag[i][0] > 0 ; ++i) { + if (cap_flags & mca_btl_uct_cap_to_btl_flag[i][0]) { + flags |= (uint32_t) mca_btl_uct_cap_to_btl_flag[i][1]; + } + } + return flags; +} + +#if OPAL_HAVE_UCT_EP_ATOMIC64_POST +/** + * @brief Convert UCT capabilities to BTL atomic flags + */ +static uint64_t mca_btl_uct_cap_to_btl_atomic_flag[][2] = { + {UCS_BIT(UCT_ATOMIC_OP_ADD), MCA_BTL_ATOMIC_SUPPORTS_ADD}, + {UCS_BIT(UCT_ATOMIC_OP_AND), MCA_BTL_ATOMIC_SUPPORTS_AND}, + {UCS_BIT(UCT_ATOMIC_OP_OR), MCA_BTL_ATOMIC_SUPPORTS_OR}, + {UCS_BIT(UCT_ATOMIC_OP_XOR), MCA_BTL_ATOMIC_SUPPORTS_XOR}, + {UCS_BIT(UCT_ATOMIC_OP_SWAP), MCA_BTL_ATOMIC_SUPPORTS_SWAP}, + {UCS_BIT(UCT_ATOMIC_OP_CSWAP), MCA_BTL_ATOMIC_SUPPORTS_CSWAP}, + {0, }, +}; + +static void mca_btl_uct_module_set_atomic_flags (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) +{ + uint64_t cap_flags = tl->uct_iface_attr.cap.flags; + + /* NTH: only use the fetching atomics for now */ + uint64_t atomic_flags32 = tl->uct_iface_attr.cap.atomic32.fop_flags; + uint64_t atomic_flags64 = tl->uct_iface_attr.cap.atomic64.fop_flags; + + /* NTH: don't really have a way to seperate 32-bit and 64-bit right now */ + uint64_t all_flags = atomic_flags32 & atomic_flags64; + + module->super.btl_atomic_flags = 0; + + if (cap_flags & UCT_IFACE_FLAG_ATOMIC_CPU) { + module->super.btl_atomic_flags |= MCA_BTL_ATOMIC_SUPPORTS_GLOB; + } + + for (int i = 0 ; mca_btl_uct_cap_to_btl_atomic_flag[i][0] ; ++i) { + if (all_flags & mca_btl_uct_cap_to_btl_atomic_flag[i][0]) { + module->super.btl_atomic_flags |= mca_btl_uct_cap_to_btl_atomic_flag[i][1]; + } + } + + if (0 != module->super.btl_atomic_flags) { + /* some atomics are supported */ + module->super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS; + } +} + +#else +/** + * @brief Convert UCT capabilities to BTL atomic flags + */ +static uint64_t mca_btl_uct_cap_to_btl_atomic_flag[][2] = { + {UCT_IFACE_FLAG_ATOMIC_ADD64, MCA_BTL_ATOMIC_SUPPORTS_ADD}, + {UCT_IFACE_FLAG_ATOMIC_ADD32, MCA_BTL_ATOMIC_SUPPORTS_32BIT}, + {UCT_IFACE_FLAG_ATOMIC_CSWAP64, MCA_BTL_ATOMIC_SUPPORTS_CSWAP}, + {UCT_IFACE_FLAG_ATOMIC_SWAP64, MCA_BTL_ATOMIC_SUPPORTS_SWAP}, + {UCT_IFACE_FLAG_ATOMIC_CPU, MCA_BTL_ATOMIC_SUPPORTS_GLOB}, + {0, }, +}; + +/** + * @brief Convert UCT capability flags to BTL atomic flags + * + * @param[in] cap_flags UCT capability flags + * + * @returns equivalent BTL atomic flags + */ +static void 
mca_btl_uct_module_set_atomic_flags (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) +{ + uint64_t cap_flags = tl->uct_iface_attr.cap.flags; + uint32_t flags = 0; + + module->super.btl_atomic_flags = 0; + + for (int i = 0 ; mca_btl_uct_cap_to_btl_atomic_flag[i][0] > 0 ; ++i) { + if (cap_flags & mca_btl_uct_cap_to_btl_atomic_flag[i][0]) { + module->super.btl_atomic_flags |= (uint32_t) mca_btl_uct_cap_to_btl_atomic_flag[i][1]; + } + } + + if (0 != module->super.btl_atomic_flags) { + /* some atomics are supported */ + module->super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS; + } +} + +#endif + +static void mca_btl_uct_tl_constructor (mca_btl_uct_tl_t *tl) +{ + memset ((void *)((uintptr_t) tl + sizeof (tl->super)), 0, sizeof (*tl) - sizeof (tl->super)); + OBJ_CONSTRUCT(&tl->tl_lock, opal_mutex_t); +} + +static void mca_btl_uct_tl_destructor (mca_btl_uct_tl_t *tl) +{ + assert (((opal_object_t *) tl)->obj_reference_count == 0); + + for (int context_id = 0 ; context_id < MCA_BTL_UCT_MAX_WORKERS ; ++context_id) { + if (NULL != tl->uct_dev_contexts[context_id]) { + mca_btl_uct_context_destroy (tl->uct_dev_contexts[context_id]); + } + } + + if (tl->uct_md) { + OBJ_RELEASE(tl->uct_md); + } + + free (tl->uct_dev_contexts); + free (tl->uct_tl_name); + free (tl->uct_dev_name); + + if (NULL != tl->uct_tl_config) { + uct_config_release (tl->uct_tl_config); + } + + OBJ_DESTRUCT(&tl->tl_lock); +} + +OBJ_CLASS_INSTANCE(mca_btl_uct_tl_t, opal_list_item_t, mca_btl_uct_tl_constructor, mca_btl_uct_tl_destructor); + +static ucs_status_t mca_btl_uct_conn_req_cb (void *arg, void *data, size_t length, unsigned flags) +{ + mca_btl_uct_module_t *module = (mca_btl_uct_module_t *) arg; + mca_btl_uct_conn_req_t *req = (mca_btl_uct_conn_req_t *) ((uintptr_t) data + 8); + struct opal_proc_t *remote_proc = opal_proc_for_name (req->proc_name); + mca_btl_base_endpoint_t *endpoint = mca_btl_uct_get_ep (&module->super, remote_proc); + mca_btl_uct_tl_endpoint_t *tl_endpoint = endpoint->uct_eps[req->tl_index] + req->context_id; + int64_t type = *((int64_t *) data); + int32_t ep_flags; + int rc; + + BTL_VERBOSE(("got connection request for endpoint %p. length = %lu", endpoint, length)); + + if (NULL == endpoint) { + BTL_ERROR(("could not create endpoint for connection request")); + return UCS_ERR_UNREACHABLE; + } + + assert (type < 2); + + if (0 == type) { + /* create any necessary resources */ + rc = mca_btl_uct_endpoint_connect (module, endpoint, req->context_id, req->ep_addr, req->tl_index); + if (OPAL_SUCCESS != rc && OPAL_ERR_OUT_OF_RESOURCE != rc) { + BTL_ERROR(("could not setup rdma endpoint")); + return UCS_ERR_UNREACHABLE; + } + + ep_flags = opal_atomic_or_fetch_32 (&tl_endpoint->flags, MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC); + } else { + ep_flags = opal_atomic_or_fetch_32 (&tl_endpoint->flags, MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REM_READY); + } + + /* the connection is ready once we have received the connection data and also a connection ready + * message. this might be overkill but there is little documentation at the UCT level on when + * an endpoint can be used. 
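+     * (a type 0 request carries the peer's uct endpoint address, a type 1 request is the
+     * "remote side has connected" acknowledgement; the endpoint is only marked ready once
+     * both have been seen.)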
*/ + if ((ep_flags & (MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REM_READY | MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC)) == + (MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REM_READY | MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC)) { + mca_btl_uct_device_context_t *tl_context = mca_btl_uct_module_get_tl_context_specific (module, module->comm_tls[req->tl_index], req->context_id); + mca_btl_uct_base_frag_t *frag; + + /* to avoid a race with send adding pending frags grab the lock here */ + OPAL_THREAD_LOCK(&endpoint->ep_lock); + (void) opal_atomic_or_fetch_32 (&tl_endpoint->flags, MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY); + OPAL_THREAD_UNLOCK(&endpoint->ep_lock); + + opal_atomic_wmb (); + + OPAL_THREAD_SCOPED_LOCK(&module->lock, { + OPAL_LIST_FOREACH(frag, &module->pending_frags, mca_btl_uct_base_frag_t) { + if (frag->context_id == req->context_id && endpoint == frag->endpoint) { + frag->ready = true; + } + } + }); + } + + return UCS_OK; +} + +static int mca_btl_uct_setup_connection_tl (mca_btl_uct_module_t *module) +{ + ucs_status_t ucs_status; + + if (NULL == module->conn_tl) { + return OPAL_ERR_NOT_SUPPORTED; + } + + ucs_status = uct_iface_set_am_handler (module->conn_tl->uct_dev_contexts[0]->uct_iface, MCA_BTL_UCT_CONNECT_RDMA, + mca_btl_uct_conn_req_cb, module, UCT_CB_FLAG_ASYNC); + if (UCS_OK != ucs_status) { + BTL_ERROR(("could not set active message handler for uct tl")); + } + + return UCS_OK == ucs_status ? OPAL_SUCCESS : OPAL_ERROR; +} + +mca_btl_uct_device_context_t *mca_btl_uct_context_create (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl, int context_id) +{ + uct_iface_params_t iface_params = {.rndv_cb = NULL, .eager_cb = NULL, .stats_root = NULL, + .rx_headroom = 0, .open_mode = UCT_IFACE_OPEN_MODE_DEVICE, + .mode = {.device = {.tl_name = tl->uct_tl_name, + .dev_name = tl->uct_dev_name}}}; + mca_btl_uct_device_context_t *context; + ucs_status_t ucs_status; + + context = calloc (1, sizeof (*context)); + if (OPAL_UNLIKELY(NULL == context)) { + return NULL; + } + + context->context_id = context_id; + context->uct_btl = module; + OBJ_CONSTRUCT(&context->completion_fifo, opal_fifo_t); + OBJ_CONSTRUCT(&context->mutex, opal_recursive_mutex_t); + + do { + /* apparently (in contradiction to the spec) UCT is *not* thread safe. because we have to + * use our own locks just go ahead and use UCS_THREAD_MODE_SINGLE. if they ever fix their + * api then change this back to UCS_THREAD_MODE_MULTI and remove the locks around the + * various UCT calls. */ + ucs_status = uct_worker_create (module->ucs_async, UCS_THREAD_MODE_SINGLE, &context->uct_worker); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("could not create a UCT worker")); + mca_btl_uct_context_destroy (context); + context = NULL; + break; + } + + ucs_status = uct_iface_open (tl->uct_md->uct_md, context->uct_worker, &iface_params, + tl->uct_tl_config, &context->uct_iface); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("could not open UCT interface. 
error code: %d", ucs_status)); + mca_btl_uct_context_destroy (context); + context = NULL; + break; + } + + BTL_VERBOSE(("enabling progress for tl %p context id %d", tl, context_id)); + + uct_iface_progress_enable (context->uct_iface, UCT_PROGRESS_THREAD_SAFE | UCT_PROGRESS_SEND | + UCT_PROGRESS_RECV); + + if (context_id > 0 && tl == module->am_tl) { + BTL_VERBOSE(("installing AM handler for tl %p context id %d", tl, context_id)); + uct_iface_set_am_handler (context->uct_iface, MCA_BTL_UCT_FRAG, mca_btl_uct_am_handler, + context, UCT_CB_FLAG_SYNC); + } + } while (0); + + return context; +} + +void mca_btl_uct_context_destroy (mca_btl_uct_device_context_t *context) +{ + if (context->uct_iface) { + uct_iface_close (context->uct_iface); + context->uct_iface = NULL; + } + + if (context->uct_worker) { + uct_worker_destroy (context->uct_worker); + context->uct_worker = NULL; + } + + OBJ_DESTRUCT(&context->completion_fifo); + free (context); +} + +static int tl_compare (opal_list_item_t **a, opal_list_item_t **b) +{ + mca_btl_uct_tl_t *tl_a = (mca_btl_uct_tl_t *) *a; + mca_btl_uct_tl_t *tl_b = (mca_btl_uct_tl_t *) *b; + + return tl_a->priority - tl_b->priority; +} + +static mca_btl_uct_tl_t *mca_btl_uct_create_tl (mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_desc, int priority) +{ + mca_btl_uct_tl_t *tl = OBJ_NEW(mca_btl_uct_tl_t); + ucs_status_t ucs_status; + + if (OPAL_UNLIKELY(NULL == tl)) { + return NULL; + } + + /* initialize btl tl structure */ + tl->uct_md = md; + OBJ_RETAIN(md); + + tl->uct_tl_name = strdup (tl_desc->tl_name); + tl->uct_dev_name = strdup (tl_desc->dev_name); + tl->priority = priority; + + tl->uct_dev_contexts = calloc (MCA_BTL_UCT_MAX_WORKERS, sizeof (tl->uct_dev_contexts[0])); + if (NULL == tl->uct_dev_contexts) { + OBJ_RELEASE(tl); + return NULL; + } + + (void) uct_md_iface_config_read (md->uct_md, tl_desc->tl_name, NULL, NULL, &tl->uct_tl_config); + + /* always create a 0 context (needed to query) */ + tl->uct_dev_contexts[0] = mca_btl_uct_context_create (module, tl, 0); + if (NULL == tl->uct_dev_contexts[0]) { + BTL_VERBOSE(("could not create a uct device context")); + OBJ_RELEASE(tl); + return NULL; + } + + /* only need to query one of the interfaces to get the attributes */ + ucs_status = uct_iface_query (tl->uct_dev_contexts[0]->uct_iface, &tl->uct_iface_attr); + if (UCS_OK != ucs_status) { + BTL_VERBOSE(("Error querying UCT interface")); + OBJ_RELEASE(tl); + return NULL; + } + + BTL_VERBOSE(("Interface CAPS for tl %s::%s: 0x%lx", module->md_name, tl_desc->tl_name, (unsigned long) tl->uct_iface_attr.cap.flags)); + + return tl; +} + +static void mca_btl_uct_set_tl_rdma (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) +{ + BTL_VERBOSE(("tl %s is suitable for RDMA", tl->uct_tl_name)); + + mca_btl_uct_module_set_atomic_flags (module, tl); + + module->super.btl_get_limit = tl->uct_iface_attr.cap.get.max_zcopy; + if (tl->uct_iface_attr.cap.get.max_bcopy) { + module->super.btl_get_alignment = 0; + module->super.btl_get_local_registration_threshold = tl->uct_iface_attr.cap.get.max_bcopy; + } else { + /* this is overkill in terms of alignment but we have no way to enforce a minimum get size */ + module->super.btl_get_alignment = opal_next_poweroftwo_inclusive (tl->uct_iface_attr.cap.get.min_zcopy); + } + + module->super.btl_put_limit = tl->uct_iface_attr.cap.put.max_zcopy; + module->super.btl_put_alignment = 0; + + /* no registration needed when using short put */ + if (tl->uct_iface_attr.cap.put.max_bcopy > 
tl->uct_iface_attr.cap.put.max_short) { + module->super.btl_put_local_registration_threshold = tl->uct_iface_attr.cap.put.max_bcopy; + } else { + module->super.btl_put_local_registration_threshold = tl->uct_iface_attr.cap.put.max_short; + } + + module->rdma_tl = tl; + OBJ_RETAIN(tl); + + tl->tl_index = (module->am_tl && tl != module->am_tl) ? 1 : 0; + module->comm_tls[tl->tl_index] = tl; + if (tl->max_device_contexts <= 1) { + tl->max_device_contexts = mca_btl_uct_component.num_contexts_per_module; + } +} + +static void mca_btl_uct_set_tl_am (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) +{ + BTL_VERBOSE(("tl %s is suitable for active-messaging", tl->uct_tl_name)); + + if (module->rdma_tl == tl) { + module->shared_endpoints = true; + } + module->am_tl = tl; + OBJ_RETAIN(tl); + + uct_iface_set_am_handler (tl->uct_dev_contexts[0]->uct_iface, MCA_BTL_UCT_FRAG, + mca_btl_uct_am_handler, tl->uct_dev_contexts[0], UCT_CB_FLAG_ASYNC); + + tl->tl_index = (module->rdma_tl && tl != module->rdma_tl) ? 1 : 0; + module->comm_tls[tl->tl_index] = tl; + if (tl->max_device_contexts <= 1) { + tl->max_device_contexts = mca_btl_uct_component.num_contexts_per_module; + } +} + +static int mca_btl_uct_set_tl_conn (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) +{ + int rc; + + BTL_VERBOSE(("tl %s is suitable for making connections", tl->uct_tl_name)); + + module->conn_tl = tl; + rc = mca_btl_uct_setup_connection_tl (module); + if (OPAL_SUCCESS != rc) { + return rc; + } + + OBJ_RETAIN(tl); + + if (!tl->max_device_contexts) { + /* if a tl is only being used to create connections do not bother with multiple + * contexts */ + tl->max_device_contexts = 1; + } + + return OPAL_SUCCESS; +} + +static int mca_btl_uct_evaluate_tl (mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl) +{ + int rc; + + BTL_VERBOSE(("evaluating tl %s", tl->uct_tl_name)); + if (NULL == module->rdma_tl && mca_btl_uct_tl_supports_rdma (tl)) { + mca_btl_uct_set_tl_rdma (module, tl); + } + + if (NULL == module->am_tl && mca_btl_uct_tl_support_am (tl)) { + mca_btl_uct_set_tl_am (module, tl); + } + + if (NULL == module->conn_tl && mca_btl_uct_tl_supports_conn (tl)) { + rc = mca_btl_uct_set_tl_conn (module, tl); + if (OPAL_SUCCESS != rc) { + return rc; + } + } + + if (tl == module->rdma_tl || tl == module->am_tl) { + BTL_VERBOSE(("tl has flags 0x%" PRIx64, tl->uct_iface_attr.cap.flags)); + module->super.btl_flags |= mca_btl_uct_module_flags (tl->uct_iface_attr.cap.flags); + + /* the bandwidth and latency numbers relate to both rdma and active messages. need to + * come up with a better estimate. 
*/ + + /* UCT bandwidth is in bytes/sec, BTL is in MB/sec */ + module->super.btl_bandwidth = (uint32_t) (tl->uct_iface_attr.bandwidth / 1048576.0); + /* TODO -- figure out how to translate UCT latency to us */ + module->super.btl_latency = 1; + } + + return OPAL_SUCCESS; +} + +int mca_btl_uct_query_tls (mca_btl_uct_module_t *module, mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_descs, unsigned tl_count) +{ + bool include = true, any = false; + mca_btl_uct_tl_t *tl; + opal_list_t tl_list; + char **tl_filter; + + OBJ_CONSTRUCT(&tl_list, opal_list_t); + + tl_filter = opal_argv_split (mca_btl_uct_component.allowed_transports, ','); + + if ('^' == tl_filter[0][0]) { + /* user has negated the include list */ + char *tmp = strdup (tl_filter[0] + 1); + + free (tl_filter[0]); + tl_filter[0] = tmp; + include = false; + } else if (0 == strcmp (tl_filter[0], "any")) { + any = true; + } + + for (unsigned i = 0 ; i < tl_count ; ++i) { + bool try_tl = any; + int priority = 0; + + for (unsigned j = 0 ; tl_filter[j] && !try_tl ; ++j) { + try_tl = (0 == strcmp (tl_filter[j], tl_descs[i].tl_name)) == include; + priority = j; + } + + if (!try_tl) { + continue; + } + + tl = mca_btl_uct_create_tl (module, md, tl_descs + i, priority); + + if (tl) { + opal_list_append (&tl_list, &tl->super); + } + } + + if (0 == opal_list_get_size (&tl_list)) { + BTL_VERBOSE(("no suitable tls match filter: %s", mca_btl_uct_component.allowed_transports)); + OBJ_DESTRUCT(&tl_list); + return OPAL_ERR_NOT_AVAILABLE; + } + + opal_list_sort (&tl_list, tl_compare); + + OPAL_LIST_FOREACH(tl, &tl_list, mca_btl_uct_tl_t) { + mca_btl_uct_evaluate_tl (module, tl); + if (NULL != module->am_tl && NULL != module->rdma_tl && + (NULL != module->conn_tl || !(mca_btl_uct_tl_requires_connection_tl (module->am_tl) || + mca_btl_uct_tl_requires_connection_tl (module->rdma_tl)))) { + /* all done */ + break; + } + } + + if (NULL == module->rdma_tl) { + /* no rdma tls */ + BTL_VERBOSE(("no rdma tl matched supplied filter. disabling RDMA support")); + + module->super.btl_put = NULL; + module->super.btl_get = NULL; + module->super.btl_atomic_fop = NULL; + module->super.btl_atomic_op = NULL; + } + + if (NULL == module->am_tl) { + /* no active message tls == no send/recv */ + BTL_VERBOSE(("no active message tl matched supplied filter. disabling send/recv support")); + + module->super.btl_send = NULL; + module->super.btl_sendi = NULL; + module->super.btl_alloc = NULL; + module->super.btl_free = NULL; + } + + OPAL_LIST_DESTRUCT(&tl_list); + + if (!(NULL != module->am_tl && mca_btl_uct_tl_requires_connection_tl (module->am_tl)) && + !(NULL != module->rdma_tl && mca_btl_uct_tl_requires_connection_tl (module->rdma_tl)) && + module->conn_tl) { + /* no connection tl needed for selected transports */ + OBJ_RELEASE(module->conn_tl); + module->conn_tl = NULL; + } else if (NULL == module->conn_tl) { + BTL_VERBOSE(("a connection tl is required but no tls match the filter %s", + mca_btl_uct_component.allowed_transports)); + return OPAL_ERROR; + } + + return OPAL_SUCCESS; +} diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h new file mode 100644 index 00000000000..5b8b4d58ac8 --- /dev/null +++ b/opal/mca/btl/uct/btl_uct_types.h @@ -0,0 +1,321 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2018 Los Alamos National Security, LLC. All rights + * reserved. 
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#if !defined(BTL_UCT_TYPES_H)
+#define BTL_UCT_TYPES_H
+
+#include "opal/mca/btl/btl.h"
+
+/* forward declarations */
+struct mca_btl_uct_module_t;
+struct mca_btl_base_endpoint_t;
+struct mca_btl_uct_base_frag_t;
+
+/* TL endpoint flags */
+/** connection data was received */
+#define MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REC 0x1
+/** remote endpoint is ready */
+#define MCA_BTL_UCT_ENDPOINT_FLAG_CONN_REM_READY 0x2
+/** connection was established */
+#define MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY 0x4
+
+/* AM tags */
+/** BTL fragment */
+#define MCA_BTL_UCT_FRAG 0x0d
+/** connection request */
+#define MCA_BTL_UCT_CONNECT_RDMA 0x0e
+
+/** maximum number of modules supported by the btl component */
+#define MCA_BTL_UCT_MAX_MODULES 16
+/** maximum number of UCT workers */
+#define MCA_BTL_UCT_MAX_WORKERS 64
+
+/**
+ * @brief MODEx data
+ */
+struct mca_btl_uct_modex_t {
+    /** number of modules whose data is stored in this modex */
+    int32_t module_count;
+
+    /** variable length modex data */
+    uint8_t data[];
+};
+
+typedef struct mca_btl_uct_modex_t mca_btl_uct_modex_t;
+
+/**
+ * @brief BTL UCT memory domain structure
+ *
+ * Each BTL module supports a single memory domain. Each memory domain has
+ * one or more transport layers.
+ */
+struct mca_btl_uct_md_t {
+    /** make this an opal object */
+    opal_object_t super;
+
+    /** UCT memory domain handle */
+    uct_md_h uct_md;
+};
+
+typedef struct mca_btl_uct_md_t mca_btl_uct_md_t;
+
+OBJ_CLASS_DECLARATION(mca_btl_uct_md_t);
+
+
+/**
+ * @brief Connection request structure
+ */
+struct mca_btl_uct_conn_req_t {
+    /** name of the requesting process */
+    opal_process_name_t proc_name;
+
+    /** context id that should be connected */
+    int context_id;
+
+    /** transport index that should be connected */
+    int tl_index;
+
+    /** endpoint address data */
+    uint8_t ep_addr[];
+};
+
+typedef struct mca_btl_uct_conn_req_t mca_btl_uct_conn_req_t;
+
+/**
+ * @brief Transport endpoint structure
+ */
+struct mca_btl_uct_tl_endpoint_t {
+    /** current flags (connected, requested, etc) */
+    volatile int32_t flags;
+
+    /** UCT endpoint handle */
+    uct_ep_h uct_ep;
+};
+
+typedef struct mca_btl_uct_tl_endpoint_t mca_btl_uct_tl_endpoint_t;
+
+/**
+ * @brief Structure to keep track of connection endpoints
+ */
+struct mca_btl_uct_connection_ep_t {
+    /** opal base object */
+    opal_object_t super;
+
+    /** UCT endpoint used for connection */
+    uct_ep_h uct_ep;
+};
+
+typedef struct mca_btl_uct_connection_ep_t mca_btl_uct_connection_ep_t;
+
+OBJ_CLASS_DECLARATION(mca_btl_uct_connection_ep_t);
+
+/**
+ * @brief Context for UCT device interface
+ *
+ * This structure uses atomic locks to protect the UCT worker (which is not thread safe).
+ * In order to make device access fast, pthread mutexes are not used. To deal with recursion
+ * (unavoidable with active messages) we implement an atomic lock using C11 atomics (or
+ * pthread thread-specific values with older compilers).
+ */
+struct mca_btl_uct_device_context_t {
+    /** index of this context */
+    int context_id;
+
+    /** btl module this context is associated with */
+    struct mca_btl_uct_module_t *uct_btl;
+
+    /** mutex for protecting the UCT worker */
+    opal_recursive_mutex_t mutex;
+
+    /** UCT worker handle */
+    uct_worker_h uct_worker;
+
+    /** UCT interface handle */
+    uct_iface_h uct_iface;
+
+    /** complete fragments and rdma operations. this fifo is used to avoid making
+     * callbacks while holding the device lock. */
+    opal_fifo_t completion_fifo;
+};
+
+typedef struct mca_btl_uct_device_context_t mca_btl_uct_device_context_t;
+
+/**
+ * @brief Header for all BTL UCT active messages
+ */
+union mca_btl_uct_am_header_t {
+    /** active message header data */
+    struct mca_btl_uct_am_header_data_t {
+        /** callback tag */
+        mca_btl_base_tag_t tag;
+
+        /** padding */
+        uint8_t padding[7];
+    } data;
+
+    /** header value. this is 64-bits to support using this with uct_ep_am_short */
+    uint64_t value;
+};
+
+typedef union mca_btl_uct_am_header_t mca_btl_uct_am_header_t;
+
+/**
+ * @brief structure to keep track of btl callback
+ *
+ * This structure is passed to various uct functions. It
+ * does the translation between the uct callback and the
+ * btl callback.
+ */
+struct mca_btl_uct_uct_completion_t {
+    /** allocated from a free list */
+    opal_free_list_item_t super;
+
+    /** uct completion structure */
+    uct_completion_t uct_comp;
+
+    /** AM completion context */
+    struct mca_btl_uct_base_frag_t *frag;
+
+    /** btl module associated with the callback */
+    struct mca_btl_base_module_t *btl;
+
+    /** btl endpoint associated with the callback */
+    struct mca_btl_base_endpoint_t *endpoint;
+
+    /** local address */
+    void *local_address;
+
+    /** local registration handle */
+    mca_btl_base_registration_handle_t *local_handle;
+
+    /** user callback function */
+    mca_btl_base_rdma_completion_fn_t cbfunc;
+
+    /** user callback context */
+    void *cbcontext;
+
+    /** user callback data */
+    void *cbdata;
+
+    /** device context */
+    mca_btl_uct_device_context_t *dev_context;
+
+    /** status */
+    int status;
+};
+
+typedef struct mca_btl_uct_uct_completion_t mca_btl_uct_uct_completion_t;
+
+OBJ_CLASS_DECLARATION(mca_btl_uct_uct_completion_t);
+
+/**
+ * @brief Base fragment structure
+ */
+struct mca_btl_uct_base_frag_t {
+    /** btl base fragment */
+    mca_btl_base_descriptor_t base;
+
+    /** segments (used with the base fragment) */
+    mca_btl_base_segment_t segments[2];
+
+    /** module this fragment is associated with */
+    struct mca_btl_uct_module_t *btl;
+
+    /** context this fragment is waiting on */
+    int context_id;
+
+    /** is this frag ready to send (only used when pending) */
+    bool ready;
+
+    /** endpoint this fragment is associated with */
+    struct mca_btl_base_endpoint_t *endpoint;
+
+    /** free list this fragment was allocated from */
+    opal_free_list_t *free_list;
+
+    /** fragment btl/uct header */
+    mca_btl_uct_am_header_t header;
+
+    /** pre-filled UCT io vector */
+    uct_iov_t uct_iov;
+
+    /** completion structure */
+    mca_btl_uct_uct_completion_t comp;
+};
+
+typedef struct mca_btl_uct_base_frag_t mca_btl_uct_base_frag_t;
+
+OBJ_CLASS_DECLARATION(mca_btl_uct_base_frag_t);
+
+struct mca_btl_base_endpoint_t {
+    /** opal base class */
+    opal_object_t super;
+
+    /** endpoint proc */
+    opal_proc_t *ep_proc;
+
+    /** mutex to protect this structure */
+    opal_recursive_mutex_t ep_lock;
+
+    /** cached connection endpoint */
+    mca_btl_uct_connection_ep_t *conn_ep;
+
+    /** endpoints into UCT for this BTL endpoint */
+    mca_btl_uct_tl_endpoint_t uct_eps[2][MCA_BTL_UCT_MAX_WORKERS];
+};
+
+typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
+typedef mca_btl_base_endpoint_t mca_btl_uct_endpoint_t;
+OBJ_CLASS_DECLARATION(mca_btl_uct_endpoint_t);
+
+/**
+ * @brief BTL UCT abstraction of a UCT transport layer
+ */
+struct mca_btl_uct_tl_t {
+    /** make this an opal object */
+    opal_list_item_t super;
+
+    /** relative priority 0 == highest */
+    int priority;
+
+    /** memory domain associated with this tl */
+    mca_btl_uct_md_t *uct_md;
+
+    /** lock protecting tl structures */
+    opal_mutex_t tl_lock;
+
+    /** tl configuration (used for creating device contexts) */
+    uct_iface_config_t *uct_tl_config;
+
+    /** name of this tl (used for creating device contexts) */
+    char *uct_tl_name;
+
+    /** device name for this tl (used for creating device contexts) */
+    char *uct_dev_name;
+
+    /** interface attributes */
+    uct_iface_attr_t uct_iface_attr;
+
+    /** maximum number of device contexts that can be created */
+    int max_device_contexts;
+
+    /** array of device contexts */
+    mca_btl_uct_device_context_t **uct_dev_contexts;
+
+    /** tl index. this is used to differentiate (if there is any difference)
+     * between rdma and am endpoints */
+    int tl_index;
+};
+
+typedef struct mca_btl_uct_tl_t mca_btl_uct_tl_t;
+OBJ_CLASS_DECLARATION(mca_btl_uct_tl_t);
+
+#endif /* !defined(BTL_UCT_TYPES_H) */
diff --git a/opal/mca/btl/uct/configure.m4 b/opal/mca/btl/uct/configure.m4
new file mode 100644
index 00000000000..dbeabe2f5f7
--- /dev/null
+++ b/opal/mca/btl/uct/configure.m4
@@ -0,0 +1,47 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
+#                         University Research and Technology
+#                         Corporation.  All rights reserved.
+# Copyright (c) 2004-2005 The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
+#                         University of Stuttgart.  All rights reserved.
+# Copyright (c) 2004-2006 The Regents of the University of California.
+#                         All rights reserved.
+# Copyright (c) 2006      QLogic Corp. All rights reserved.
+# Copyright (c) 2009      Cisco Systems, Inc.  All rights reserved.
+# Copyright (c) 2011-2018 Los Alamos National Security, LLC.
+#                         All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# OMPI_CHECK_UCX(prefix, [action-if-found], [action-if-not-found])
+# --------------------------------------------------------
+# check if UCX support can be found. sets prefix_{CPPFLAGS,
+# LDFLAGS, LIBS} as needed and runs action-if-found if there is
+# support, otherwise executes action-if-not-found
+
+AC_DEFUN([MCA_opal_btl_uct_CONFIG],[
+    AC_CONFIG_FILES([opal/mca/btl/uct/Makefile])
+
+    OMPI_CHECK_UCX([btl_uct],
+                   [btl_uct_happy="yes"],
+                   [btl_uct_happy="no"])
+
+    AS_IF([test "$btl_uct_happy" = "yes"],
+          [$1
+           btl_uct_LIBS="$btl_uct_LIBS -luct"
+          ],
+          [$2])
+
+    # substitute in the things needed to build ucx
+    AC_SUBST([btl_uct_CPPFLAGS])
+    AC_SUBST([btl_uct_LDFLAGS])
+    AC_SUBST([btl_uct_LIBS])
+])dnl
diff --git a/opal/mca/btl/uct/owner.txt b/opal/mca/btl/uct/owner.txt
new file mode 100644
index 00000000000..4918816bc9e
--- /dev/null
+++ b/opal/mca/btl/uct/owner.txt
@@ -0,0 +1,7 @@
+#
+# owner/status file
+# owner: institution that is responsible for this package
+# status: e.g. 
active, maintenance, unmaintained +# +owner:LANL +status:active From 091d1ca214850e0d1c574211ff7a19c4fd376828 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Mon, 25 Jun 2018 16:05:12 -0600 Subject: [PATCH 2/2] btl/uct: make uct endpoints array a flexible array member Signed-off-by: Nathan Hjelm --- opal/mca/btl/uct/btl_uct_endpoint.c | 16 ++++++++-------- opal/mca/btl/uct/btl_uct_endpoint.h | 10 +++++----- opal/mca/btl/uct/btl_uct_rdma.c | 4 ++-- opal/mca/btl/uct/btl_uct_tl.c | 2 +- opal/mca/btl/uct/btl_uct_types.h | 2 +- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/opal/mca/btl/uct/btl_uct_endpoint.c b/opal/mca/btl/uct/btl_uct_endpoint.c index 576e01f13da..e0dd6eee50a 100644 --- a/opal/mca/btl/uct/btl_uct_endpoint.c +++ b/opal/mca/btl/uct/btl_uct_endpoint.c @@ -17,7 +17,7 @@ static void mca_btl_uct_endpoint_construct (mca_btl_uct_endpoint_t *endpoint) { - memset (endpoint->uct_eps, 0, sizeof (endpoint->uct_eps)); + memset (endpoint->uct_eps, 0, sizeof (endpoint->uct_eps[0]) * mca_btl_uct_component.num_contexts_per_module); endpoint->conn_ep = NULL; OBJ_CONSTRUCT(&endpoint->ep_lock, opal_recursive_mutex_t); } @@ -25,15 +25,13 @@ static void mca_btl_uct_endpoint_construct (mca_btl_uct_endpoint_t *endpoint) static void mca_btl_uct_endpoint_destruct (mca_btl_uct_endpoint_t *endpoint) { for (int tl_index = 0 ; tl_index < 2 ; ++tl_index) { - for (int i = 0 ; i < MCA_BTL_UCT_MAX_WORKERS ; ++i) { - if (NULL != endpoint->uct_eps[tl_index][i].uct_ep) { - uct_ep_destroy (endpoint->uct_eps[tl_index][i].uct_ep); + for (int i = 0 ; i < mca_btl_uct_component.num_contexts_per_module ; ++i) { + if (NULL != endpoint->uct_eps[i][tl_index].uct_ep) { + uct_ep_destroy (endpoint->uct_eps[i][tl_index].uct_ep); } } } - memset (endpoint->uct_eps, 0, sizeof (endpoint->uct_eps)); - OBJ_DESTRUCT(&endpoint->ep_lock); } @@ -43,12 +41,14 @@ OBJ_CLASS_INSTANCE(mca_btl_uct_endpoint_t, opal_object_t, mca_btl_base_endpoint_t *mca_btl_uct_endpoint_create (opal_proc_t *proc) { - mca_btl_uct_endpoint_t *endpoint = OBJ_NEW(mca_btl_uct_endpoint_t); + mca_btl_uct_endpoint_t *endpoint = calloc (1, sizeof (*endpoint) + sizeof (endpoint->uct_eps[0]) * + mca_btl_uct_component.num_contexts_per_module); if (OPAL_UNLIKELY(NULL == endpoint)) { return NULL; } + OBJ_CONSTRUCT(endpoint, mca_btl_uct_endpoint_t); endpoint->ep_proc = proc; return (mca_btl_base_endpoint_t *) endpoint; @@ -295,7 +295,7 @@ static int mca_btl_uct_endpoint_connect_endpoint (mca_btl_uct_module_t *uct_btl, int mca_btl_uct_endpoint_connect (mca_btl_uct_module_t *uct_btl, mca_btl_uct_endpoint_t *endpoint, int context_id, void *ep_addr, int tl_index) { - mca_btl_uct_tl_endpoint_t *tl_endpoint = endpoint->uct_eps[tl_index] + context_id; + mca_btl_uct_tl_endpoint_t *tl_endpoint = endpoint->uct_eps[context_id] + tl_index; mca_btl_uct_device_context_t *tl_context = mca_btl_uct_module_get_rdma_context_specific (uct_btl, context_id); mca_btl_uct_tl_t *tl = (tl_index == uct_btl->rdma_tl->tl_index) ? 
uct_btl->rdma_tl : uct_btl->am_tl; uint8_t *rdma_tl_data = NULL, *conn_tl_data = NULL, *am_tl_data = NULL, *tl_data; diff --git a/opal/mca/btl/uct/btl_uct_endpoint.h b/opal/mca/btl/uct/btl_uct_endpoint.h index d77288d0731..f8d5e6f522d 100644 --- a/opal/mca/btl/uct/btl_uct_endpoint.h +++ b/opal/mca/btl/uct/btl_uct_endpoint.h @@ -37,8 +37,8 @@ static int mca_btl_uct_endpoint_test_am (mca_btl_uct_module_t *module, mca_btl_u int tl_index = module->am_tl->tl_index; int ep_index = context->context_id; - if (OPAL_LIKELY(MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & endpoint->uct_eps[tl_index][ep_index].flags)) { - *ep_handle = endpoint->uct_eps[tl_index][ep_index].uct_ep; + if (OPAL_LIKELY(MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & endpoint->uct_eps[ep_index][tl_index].flags)) { + *ep_handle = endpoint->uct_eps[ep_index][tl_index].uct_ep; return OPAL_SUCCESS; } @@ -65,13 +65,13 @@ static inline int mca_btl_uct_endpoint_check (mca_btl_uct_module_t *module, mca_ int ep_index = context->context_id; int rc; - if (OPAL_LIKELY(MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & endpoint->uct_eps[tl_index][ep_index].flags)) { - *ep_handle = endpoint->uct_eps[tl_index][ep_index].uct_ep; + if (OPAL_LIKELY(MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY & endpoint->uct_eps[ep_index][tl_index].flags)) { + *ep_handle = endpoint->uct_eps[ep_index][tl_index].uct_ep; return OPAL_SUCCESS; } rc = mca_btl_uct_endpoint_connect (module, endpoint, ep_index, NULL, tl_index); - *ep_handle = endpoint->uct_eps[tl_index][ep_index].uct_ep; + *ep_handle = endpoint->uct_eps[ep_index][tl_index].uct_ep; BTL_VERBOSE(("mca_btl_uct_endpoint_connect returned %d", rc)); return rc; } diff --git a/opal/mca/btl/uct/btl_uct_rdma.c b/opal/mca/btl/uct/btl_uct_rdma.c index 6bbbe9f57a4..58f7c504792 100644 --- a/opal/mca/btl/uct/btl_uct_rdma.c +++ b/opal/mca/btl/uct/btl_uct_rdma.c @@ -243,8 +243,8 @@ int mca_btl_uct_flush (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endp do { uct_worker_progress (context->uct_worker); - if (NULL != endpoint && endpoint->uct_eps[tl_index][context->context_id].uct_ep) { - ucs_status = uct_ep_flush (endpoint->uct_eps[tl_index][context->context_id].uct_ep, 0, NULL); + if (NULL != endpoint && endpoint->uct_eps[context->context_id][tl_index].uct_ep) { + ucs_status = uct_ep_flush (endpoint->uct_eps[context->context_id][tl_index].uct_ep, 0, NULL); } else { ucs_status = uct_iface_flush (context->uct_iface, 0, NULL); } diff --git a/opal/mca/btl/uct/btl_uct_tl.c b/opal/mca/btl/uct/btl_uct_tl.c index c4014744fdf..823588ebd6c 100644 --- a/opal/mca/btl/uct/btl_uct_tl.c +++ b/opal/mca/btl/uct/btl_uct_tl.c @@ -165,7 +165,7 @@ static ucs_status_t mca_btl_uct_conn_req_cb (void *arg, void *data, size_t lengt mca_btl_uct_conn_req_t *req = (mca_btl_uct_conn_req_t *) ((uintptr_t) data + 8); struct opal_proc_t *remote_proc = opal_proc_for_name (req->proc_name); mca_btl_base_endpoint_t *endpoint = mca_btl_uct_get_ep (&module->super, remote_proc); - mca_btl_uct_tl_endpoint_t *tl_endpoint = endpoint->uct_eps[req->tl_index] + req->context_id; + mca_btl_uct_tl_endpoint_t *tl_endpoint = endpoint->uct_eps[req->context_id] + req->tl_index; int64_t type = *((int64_t *) data); int32_t ep_flags; int rc; diff --git a/opal/mca/btl/uct/btl_uct_types.h b/opal/mca/btl/uct/btl_uct_types.h index 5b8b4d58ac8..f7731d9e441 100644 --- a/opal/mca/btl/uct/btl_uct_types.h +++ b/opal/mca/btl/uct/btl_uct_types.h @@ -269,7 +269,7 @@ struct mca_btl_base_endpoint_t { mca_btl_uct_connection_ep_t *conn_ep; /** endpoints into UCT for this BTL endpoint */ - mca_btl_uct_tl_endpoint_t 
uct_eps[2][MCA_BTL_UCT_MAX_WORKERS]; + mca_btl_uct_tl_endpoint_t uct_eps[][2]; }; typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t;
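For illustration only (not part of either commit): a minimal, self-contained C sketch of the flexible-array-member allocation pattern the second commit switches to. The type and field names below are simplified stand-ins for the btl/uct structures; the real code sizes the array with mca_btl_uct_component.num_contexts_per_module and indexes it as uct_eps[context_id][tl_index].

/* Standalone sketch of a runtime-sized per-context endpoint table using a
 * C99 flexible array member. Names are hypothetical, not btl/uct types. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct example_tl_endpoint {
    int32_t flags;    /* connection state flags */
    void   *uct_ep;   /* stands in for a uct_ep_h handle */
} example_tl_endpoint_t;

typedef struct example_endpoint {
    void *ep_proc;                       /* owning process (placeholder) */
    /* one row per device context, one column per tl (rdma, am) */
    example_tl_endpoint_t uct_eps[][2];  /* flexible array member */
} example_endpoint_t;

static example_endpoint_t *example_endpoint_create (int num_contexts)
{
    /* allocate the fixed header plus num_contexts rows of the flexible array,
     * mirroring calloc (1, sizeof (*endpoint) + sizeof (endpoint->uct_eps[0]) *
     * num_contexts_per_module) in the patch above */
    return calloc (1, sizeof (example_endpoint_t) +
                   sizeof (example_tl_endpoint_t[2]) * (size_t) num_contexts);
}

int main (void)
{
    example_endpoint_t *ep = example_endpoint_create (8);
    if (NULL == ep) {
        return 1;
    }

    /* indexed as [context_id][tl_index], matching the swapped order above */
    ep->uct_eps[3][1].flags = 0x4;
    printf ("context 3, tl 1 flags: 0x%x\n", (unsigned) ep->uct_eps[3][1].flags);

    free (ep);
    return 0;
}

Sizing the table at allocation time avoids reserving MCA_BTL_UCT_MAX_WORKERS (64) rows per endpoint when only num_contexts_per_module device contexts are actually in use.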