diff --git a/config/opal_check_cuda.m4 b/config/opal_check_cuda.m4 index 67059a8c851..be5aa777b3b 100644 --- a/config/opal_check_cuda.m4 +++ b/config/opal_check_cuda.m4 @@ -117,8 +117,8 @@ AC_MSG_CHECKING([if have cuda support]) if test "$opal_check_cuda_happy" = "yes"; then AC_MSG_RESULT([yes (-I$opal_cuda_incdir)]) CUDA_SUPPORT=1 - opal_datatype_cuda_CPPFLAGS="-I$opal_cuda_incdir" - AC_SUBST([opal_datatype_cuda_CPPFLAGS]) + common_cuda_CPPFLAGS="-I$opal_cuda_incdir" + AC_SUBST([common_cuda_CPPFLAGS]) else AC_MSG_RESULT([no]) CUDA_SUPPORT=0 diff --git a/ompi/mca/coll/cuda/coll_cuda_allreduce.c b/ompi/mca/coll/cuda/coll_cuda_allreduce.c index 3c851c64782..416c9c7fa8f 100644 --- a/ompi/mca/coll/cuda/coll_cuda_allreduce.c +++ b/ompi/mca/coll/cuda/coll_cuda_allreduce.c @@ -17,7 +17,7 @@ #include "ompi/op/op.h" #include "opal/datatype/opal_convertor.h" -#include "opal/datatype/opal_datatype_cuda.h" +#include "opal/mca/common/cuda/common_cuda.h" /* * allreduce_intra diff --git a/ompi/mca/coll/cuda/coll_cuda_exscan.c b/ompi/mca/coll/cuda/coll_cuda_exscan.c index 23f5a5da839..5f736697fe0 100644 --- a/ompi/mca/coll/cuda/coll_cuda_exscan.c +++ b/ompi/mca/coll/cuda/coll_cuda_exscan.c @@ -17,7 +17,7 @@ #include "ompi/op/op.h" #include "opal/datatype/opal_convertor.h" -#include "opal/datatype/opal_datatype_cuda.h" +#include "opal/mca/common/cuda/common_cuda.h" int mca_coll_cuda_exscan(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, diff --git a/ompi/mca/coll/cuda/coll_cuda_reduce.c b/ompi/mca/coll/cuda/coll_cuda_reduce.c index 2bcce13c75c..5d82667b6bb 100644 --- a/ompi/mca/coll/cuda/coll_cuda_reduce.c +++ b/ompi/mca/coll/cuda/coll_cuda_reduce.c @@ -17,7 +17,7 @@ #include "ompi/op/op.h" #include "opal/datatype/opal_convertor.h" -#include "opal/datatype/opal_datatype_cuda.h" +#include "opal/mca/common/cuda/common_cuda.h" /* * reduce_log_inter diff --git a/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c 
b/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c index 6dded294e69..907257b0da8 100644 --- a/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c +++ b/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c @@ -17,7 +17,7 @@ #include "ompi/op/op.h" #include "opal/datatype/opal_convertor.h" -#include "opal/datatype/opal_datatype_cuda.h" +#include "opal/mca/common/cuda/common_cuda.h" /* * reduce_scatter_block diff --git a/ompi/mca/coll/cuda/coll_cuda_scan.c b/ompi/mca/coll/cuda/coll_cuda_scan.c index 6d70a10c7a3..4e7300c12f8 100644 --- a/ompi/mca/coll/cuda/coll_cuda_scan.c +++ b/ompi/mca/coll/cuda/coll_cuda_scan.c @@ -17,7 +17,7 @@ #include "ompi/op/op.h" #include "opal/datatype/opal_convertor.h" -#include "opal/datatype/opal_datatype_cuda.h" +#include "opal/mca/common/cuda/common_cuda.h" /* * scan diff --git a/ompi/mca/coll/libnbc/nbc_internal.h b/ompi/mca/coll/libnbc/nbc_internal.h index 735beaa06e2..4d5f42bc060 100644 --- a/ompi/mca/coll/libnbc/nbc_internal.h +++ b/ompi/mca/coll/libnbc/nbc_internal.h @@ -31,7 +31,7 @@ #include "coll_libnbc.h" #if OPAL_CUDA_SUPPORT #include "opal/datatype/opal_convertor.h" -#include "opal/datatype/opal_datatype_cuda.h" +#include "opal/mca/common/cuda/common_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ #include "ompi/include/ompi/constants.h" #include "ompi/request/request.h" diff --git a/ompi/mca/common/ompio/common_ompio_buffer.c b/ompi/mca/common/ompio/common_ompio_buffer.c index 689e25b32f9..dbd7e30e6b4 100644 --- a/ompi/mca/common/ompio/common_ompio_buffer.c +++ b/ompi/mca/common/ompio/common_ompio_buffer.c @@ -20,7 +20,6 @@ #include "ompi_config.h" #include "opal/datatype/opal_convertor.h" -#include "opal/datatype/opal_datatype_cuda.h" #include "opal/mca/common/cuda/common_cuda.h" #include "opal/util/sys_limits.h" diff --git a/ompi/mca/mtl/base/mtl_base_datatype.h b/ompi/mca/mtl/base/mtl_base_datatype.h index 41559245745..544ca32abc7 100644 --- a/ompi/mca/mtl/base/mtl_base_datatype.h +++ b/ompi/mca/mtl/base/mtl_base_datatype.h @@ 
-25,16 +25,82 @@ #include "ompi/datatype/ompi_datatype.h" #include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_datatype_internal.h" +#if OPAL_CUDA_SUPPORT +#include "opal/mca/common/cuda/common_cuda.h" +#include "opal/datatype/opal_convertor.h" +#endif #ifndef MTL_BASE_DATATYPE_H_INCLUDED #define MTL_BASE_DATATYPE_H_INCLUDED +#if OPAL_CUDA_SUPPORT +static int +ompi_mtl_cuda_datatype_pack(struct opal_convertor_t *convertor, + void **buffer, + size_t *buffer_len, + bool *freeAfter) +{ + + struct iovec iov; + uint32_t iov_count = 1; + int is_cuda = convertor->flags & CONVERTOR_CUDA; + +#if !(OPAL_ENABLE_HETEROGENEOUS_SUPPORT) + if (convertor->pDesc && + !(convertor->flags & CONVERTOR_COMPLETED) && + opal_datatype_is_contiguous_memory_layout(convertor->pDesc, + convertor->count)) { + *freeAfter = false; + *buffer = convertor->pBaseBuf; + *buffer_len = convertor->local_size; + return OPAL_SUCCESS; + } +#endif + + opal_convertor_get_packed_size(convertor, buffer_len); + *freeAfter = false; + if( 0 == *buffer_len ) { + *buffer = NULL; + return OMPI_SUCCESS; + } + iov.iov_len = *buffer_len; + iov.iov_base = NULL; + /* opal_convertor_need_buffers always returns true + * if CONVERTOR_CUDA is set, so unset temporarily + */ + convertor->flags &= ~CONVERTOR_CUDA; + + if (opal_convertor_need_buffers(convertor)) { + if (is_cuda) { + convertor->flags |= CONVERTOR_CUDA; + } + iov.iov_base = opal_cuda_malloc(*buffer_len, convertor); + if (NULL == iov.iov_base) return OMPI_ERR_OUT_OF_RESOURCE; + *freeAfter = true; + } else if (is_cuda) { + convertor->flags |= CONVERTOR_CUDA; + } + + opal_convertor_pack( convertor, &iov, &iov_count, buffer_len ); + + *buffer = iov.iov_base; + + return OMPI_SUCCESS; +} +#endif + __opal_attribute_always_inline__ static inline int ompi_mtl_datatype_pack(struct opal_convertor_t *convertor, void **buffer, size_t *buffer_len, bool *freeAfter) { +#if OPAL_CUDA_SUPPORT + return ompi_mtl_cuda_datatype_pack(convertor, + buffer, + buffer_len, + 
freeAfter); +#endif struct iovec iov; uint32_t iov_count = 1; @@ -71,6 +137,42 @@ ompi_mtl_datatype_pack(struct opal_convertor_t *convertor, return OMPI_SUCCESS; } +#if OPAL_CUDA_SUPPORT +static int +ompi_mtl_cuda_datatype_recv_buf(struct opal_convertor_t *convertor, + void ** buffer, + size_t *buffer_len, + bool *free_on_error) +{ + int is_cuda = convertor->flags & CONVERTOR_CUDA; + opal_convertor_get_packed_size(convertor, buffer_len); + *free_on_error = false; + if( 0 == *buffer_len ) { + *buffer = NULL; + *buffer_len = 0; + return OMPI_SUCCESS; + } + /* opal_convertor_need_buffers always returns true + * if CONVERTOR_CUDA is set, so unset temporarily + */ + convertor->flags &= ~CONVERTOR_CUDA; + if (opal_convertor_need_buffers(convertor)) { + if (is_cuda) { + convertor->flags |= CONVERTOR_CUDA; + } + *buffer = opal_cuda_malloc(*buffer_len, convertor); + *free_on_error = true; + } else { + if (is_cuda) { + convertor->flags |= CONVERTOR_CUDA; + } + *buffer = convertor->pBaseBuf + + convertor->use_desc->desc[convertor->use_desc->used].end_loop.first_elem_disp; + } + return OMPI_SUCCESS; + +} +#endif __opal_attribute_always_inline__ static inline int ompi_mtl_datatype_recv_buf(struct opal_convertor_t *convertor, @@ -78,6 +180,13 @@ ompi_mtl_datatype_recv_buf(struct opal_convertor_t *convertor, size_t *buffer_len, bool *free_on_error) { +#if OPAL_CUDA_SUPPORT + return ompi_mtl_cuda_datatype_recv_buf(convertor, + buffer, + buffer_len, + free_on_error); +#endif + opal_convertor_get_packed_size(convertor, buffer_len); *free_on_error = false; if( 0 == *buffer_len ) { @@ -95,12 +204,48 @@ ompi_mtl_datatype_recv_buf(struct opal_convertor_t *convertor, return OMPI_SUCCESS; } +#if OPAL_CUDA_SUPPORT +static int +ompi_mtl_cuda_datatype_unpack(struct opal_convertor_t *convertor, + void *buffer, + size_t buffer_len) { + struct iovec iov; + uint32_t iov_count = 1; + int is_cuda = convertor->flags & CONVERTOR_CUDA; + /* opal_convertor_need_buffers always returns true + * if 
CONVERTOR_CUDA is set, so unset temporarily + */ + convertor->flags &= ~CONVERTOR_CUDA; + + if (buffer_len > 0 && opal_convertor_need_buffers(convertor)) { + iov.iov_len = buffer_len; + iov.iov_base = buffer; + + if (is_cuda) { + convertor->flags |= CONVERTOR_CUDA; + } + opal_convertor_unpack(convertor, &iov, &iov_count, &buffer_len ); + + opal_cuda_free(buffer, convertor); + } else if (is_cuda) { + convertor->flags |= CONVERTOR_CUDA; + } + + return OMPI_SUCCESS; + +} +#endif __opal_attribute_always_inline__ static inline int ompi_mtl_datatype_unpack(struct opal_convertor_t *convertor, void *buffer, size_t buffer_len) { +#if OPAL_CUDA_SUPPORT + return ompi_mtl_cuda_datatype_unpack(convertor, + buffer, + buffer_len); +#endif struct iovec iov; uint32_t iov_count = 1; diff --git a/ompi/mca/mtl/ofi/configure.m4 b/ompi/mca/mtl/ofi/configure.m4 index 2ab0a084e0c..678247c5d35 100644 --- a/ompi/mca/mtl/ofi/configure.m4 +++ b/ompi/mca/mtl/ofi/configure.m4 @@ -28,6 +28,17 @@ AC_DEFUN([MCA_ompi_mtl_ofi_CONFIG],[ # Check for OFI OPAL_CHECK_OFI + # Check for CUDA + OPAL_CHECK_CUDA + + # Check for cuda support. If so, we require a minimum libfabric version + # of 1.9. FI_HMEM capabilities are only available starting from v1.9 + opal_ofi_happy="yes" + AS_IF([test "$opal_check_cuda_happy" = "yes"], + [OPAL_CHECK_OFI_VERSION_GE([1,9], + [], + [opal_ofi_happy=no])]) + # The OFI MTL requires at least OFI libfabric v1.5. AS_IF([test "$opal_ofi_happy" = "yes"], [OPAL_CHECK_OFI_VERSION_GE([1,5], diff --git a/ompi/mca/mtl/ofi/help-mtl-ofi.txt b/ompi/mca/mtl/ofi/help-mtl-ofi.txt index 56778f63b53..59de33a539d 100644 --- a/ompi/mca/mtl/ofi/help-mtl-ofi.txt +++ b/ompi/mca/mtl/ofi/help-mtl-ofi.txt @@ -77,3 +77,12 @@ recoverable and your application is likely to abort. Error: %s (%d) [message too big] Message size %llu bigger than supported by selected transport. Max = %llu + +[Buffer Memory Registration Failed] +Open MPI failed to register your buffer. 
+This error is fatal, your job will abort + + Buffer Type: %s + Buffer Address: %p + Buffer Length: %lu + Error: %s (%d) diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h index 14203576309..6626c754a97 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.h +++ b/ompi/mca/mtl/ofi/mtl_ofi.h @@ -47,6 +47,11 @@ #include "mtl_ofi_endpoint.h" #include "mtl_ofi_compat.h" +#if OPAL_CUDA_SUPPORT +#include "opal/mca/common/cuda/common_cuda.h" +#include "opal/datatype/opal_convertor.h" +#endif + BEGIN_C_DECLS extern mca_mtl_ofi_module_t ompi_mtl_ofi; @@ -247,6 +252,99 @@ ompi_mtl_ofi_progress(void) __FILE__, __LINE__, string, fi_strerror(-err)); \ } while(0); +/** + * Memory registration functions + */ + +/** Called before any libfabric or registration calls */ +__opal_attribute_always_inline__ static inline void +ompi_mtl_ofi_set_mr_null(ompi_mtl_ofi_request_t *ofi_req) { + ofi_req->mr = NULL; +} + +/** + * Registers user buffer with Libfabric domain if + * buffer is cuda and provider has fi_mr_hmem + */ +static +int ompi_mtl_ofi_register_buffer(struct opal_convertor_t *convertor, + ompi_mtl_ofi_request_t *ofi_req, + void* buffer) { + ofi_req->mr = NULL; + if (ofi_req->length <= 0 || NULL == buffer) { + return OMPI_SUCCESS; + } + +#if OPAL_CUDA_SUPPORT + if (convertor->flags & CONVERTOR_CUDA) { + /* Register buffer */ + int ret; + struct fi_mr_attr attr = {0}; + struct iovec iov = {0}; + + iov.iov_base = buffer; + iov.iov_len = ofi_req->length; + attr.mr_iov = &iov; + attr.iov_count = 1; + attr.access = FI_SEND | FI_RECV; + attr.offset = 0; + attr.context = NULL; + + attr.iface = FI_HMEM_CUDA; + mca_common_cuda_get_device(&attr.device.cuda); + + ret = fi_mr_regattr(ompi_mtl_ofi.domain, &attr, 0, &ofi_req->mr); + + if (ret) { + opal_show_help("help-mtl-ofi.txt", "Buffer Memory Registration Failed", true, + "CUDA", + buffer, (unsigned long) ofi_req->length, + fi_strerror(-ret), ret); + ofi_req->mr = NULL; + return OMPI_ERROR; + } + } +#endif /* OPAL_CUDA_SUPPORT */ + return OMPI_SUCCESS; 
+} + +/** Deregister buffer: closes the request's memory region, if any, and resets ofi_req->mr to NULL on success */ +__opal_attribute_always_inline__ static inline int +ompi_mtl_ofi_deregister_buffer(ompi_mtl_ofi_request_t *ofi_req) { + if (ofi_req->mr) { + int ret; + ret = fi_close(&ofi_req->mr->fid); + if (ret) { + opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, + "fi_close", + ompi_process_info.nodename, __FILE__, __LINE__, + fi_strerror(-ret), ret); + return OMPI_ERROR; + } + ofi_req->mr = NULL; + } + return OMPI_SUCCESS; +} + +/** Deregister and free a buffer */ +static +int ompi_mtl_ofi_deregister_and_free_buffer(ompi_mtl_ofi_request_t *ofi_req) { + int ret = OMPI_SUCCESS; + ret = ompi_mtl_ofi_deregister_buffer(ofi_req); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + if (OPAL_UNLIKELY(NULL != ofi_req->buffer)) { +#if OPAL_CUDA_SUPPORT + opal_cuda_free(ofi_req->buffer, ofi_req->convertor); +#else + free(ofi_req->buffer); +#endif + } + ofi_req->buffer = NULL; + return ret; +} + /* MTL interface functions */ int ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl); @@ -324,10 +422,7 @@ ompi_mtl_ofi_isend_callback(struct fi_cq_tagged_entry *wc, if (0 == ofi_req->completion_count) { /* Request completed */ - if (OPAL_UNLIKELY(NULL != ofi_req->buffer)) { - free(ofi_req->buffer); - ofi_req->buffer = NULL; - } + ompi_mtl_ofi_deregister_and_free_buffer(ofi_req); ofi_req->super.ompi_req->req_status.MPI_ERROR = ofi_req->status.MPI_ERROR; @@ -418,6 +513,8 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl, fi_addr_t src_addr = 0; fi_addr_t sep_peer_fiaddr = 0; + ompi_mtl_ofi_set_mr_null(&ofi_req); + ctxt_id = ompi_mtl_ofi_map_comm_to_ctxt(comm->c_contextid); set_thread_context(ctxt_id); @@ -467,7 +564,15 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl, goto free_request_buffer; } + /** Inject does not currently support device memory + * https://github.com/ofiwg/libfabric/issues/5861 + */ +#if OPAL_CUDA_SUPPORT + if (!(convertor->flags & CONVERTOR_CUDA) + && 
(ompi_mtl_ofi.max_inject_size >= length)) { +#else /* !(OPAL_CUDA_SUPPORT)*/ if (ompi_mtl_ofi.max_inject_size >= length) { +#endif /* OPAL_CUDA_SUPPORT */ if (ofi_cq_data) { MTL_OFI_RETRY_UNTIL_DONE(fi_tinjectdata(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep, start, @@ -495,12 +600,16 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl, goto free_request_buffer; } } else { + ompi_ret = ompi_mtl_ofi_register_buffer(convertor, &ofi_req, start); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) { + return ompi_ret; + } ofi_req.completion_count += 1; if (ofi_cq_data) { MTL_OFI_RETRY_UNTIL_DONE(fi_tsenddata(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep, start, length, - NULL, + (NULL == ofi_req.mr) ? NULL : ofi_req.mr->mem_desc, comm->c_my_rank, sep_peer_fiaddr, match_bits, @@ -509,7 +618,7 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl, MTL_OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep, start, length, - NULL, + (NULL == ofi_req.mr) ? NULL : ofi_req.mr->mem_desc, sep_peer_fiaddr, match_bits, (void *) &ofi_req.ctx), ret); @@ -532,9 +641,7 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl, } free_request_buffer: - if (OPAL_UNLIKELY(NULL != ofi_req.buffer)) { - free(ofi_req.buffer); - } + ompi_mtl_ofi_deregister_and_free_buffer(&ofi_req); return ofi_req.status.MPI_ERROR; } @@ -562,6 +669,8 @@ ompi_mtl_ofi_isend_generic(struct mca_mtl_base_module_t *mtl, ompi_mtl_ofi_request_t *ack_req = NULL; /* For synchronous send */ fi_addr_t sep_peer_fiaddr = 0; + ompi_mtl_ofi_set_mr_null(ofi_req); + ctxt_id = ompi_mtl_ofi_map_comm_to_ctxt(comm->c_contextid); set_thread_context(ctxt_id); @@ -605,11 +714,16 @@ ompi_mtl_ofi_isend_generic(struct mca_mtl_base_module_t *mtl, goto free_request_buffer; } + ompi_ret = ompi_mtl_ofi_register_buffer(convertor, ofi_req, start); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) { + return ompi_ret; + } + if (ofi_cq_data) { MTL_OFI_RETRY_UNTIL_DONE(fi_tsenddata(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep, start, 
length, - NULL, + (NULL == ofi_req->mr) ? NULL : ofi_req->mr->mem_desc, comm->c_my_rank, sep_peer_fiaddr, match_bits, @@ -618,7 +732,7 @@ ompi_mtl_ofi_isend_generic(struct mca_mtl_base_module_t *mtl, MTL_OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep, start, length, - NULL, + (NULL == ofi_req->mr) ? NULL : ofi_req->mr->mem_desc, sep_peer_fiaddr, match_bits, (void *) &ofi_req->ctx), ret); @@ -631,9 +745,8 @@ ompi_mtl_ofi_isend_generic(struct mca_mtl_base_module_t *mtl, } free_request_buffer: - if (OPAL_UNLIKELY(OMPI_SUCCESS != ofi_req->status.MPI_ERROR - && NULL != ofi_req->buffer)) { - free(ofi_req->buffer); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ofi_req->status.MPI_ERROR)) { + ompi_mtl_ofi_deregister_and_free_buffer(ofi_req); } return ofi_req->status.MPI_ERROR; @@ -676,6 +789,8 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc, status->MPI_ERROR = MPI_ERR_TRUNCATE; } + ompi_mtl_ofi_deregister_buffer(ofi_req); + /** * Unpack data into recv buffer if necessary. */ @@ -795,6 +910,8 @@ ompi_mtl_ofi_irecv_generic(struct mca_mtl_base_module_t *mtl, size_t length; bool free_after; + ompi_mtl_ofi_set_mr_null(ofi_req); + ctxt_id = ompi_mtl_ofi_map_comm_to_ctxt(comm->c_contextid); set_thread_context(ctxt_id); @@ -833,18 +950,22 @@ ompi_mtl_ofi_irecv_generic(struct mca_mtl_base_module_t *mtl, ofi_req->remote_addr = remote_addr; ofi_req->match_bits = match_bits; + ompi_ret = ompi_mtl_ofi_register_buffer(convertor, ofi_req, start); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) { + return ompi_ret; + } + MTL_OFI_RETRY_UNTIL_DONE(fi_trecv(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep, start, length, - NULL, + (NULL == ofi_req->mr) ? 
NULL : ofi_req->mr->mem_desc, remote_addr, match_bits, mask_bits, (void *)&ofi_req->ctx), ret); if (OPAL_UNLIKELY(0 > ret)) { - if (NULL != ofi_req->buffer) { - free(ofi_req->buffer); - } + ompi_mtl_ofi_deregister_and_free_buffer(ofi_req); + MTL_OFI_LOG_FI_ERR(ret, "fi_trecv failed"); return ompi_mtl_ofi_get_error(ret); } @@ -866,6 +987,8 @@ ompi_mtl_ofi_mrecv_callback(struct fi_cq_tagged_entry *wc, status->MPI_ERROR = MPI_SUCCESS; status->_ucount = wc->len; + ompi_mtl_ofi_deregister_and_free_buffer(ofi_req); + free(ofi_req); mrecv_req->completion_callback(mrecv_req); @@ -896,6 +1019,8 @@ ompi_mtl_ofi_mrecv_error_callback(struct fi_cq_err_entry *error, status->MPI_ERROR = MPI_ERR_INTERN; } + ompi_mtl_ofi_deregister_and_free_buffer(ofi_req); + free(ofi_req); mrecv_req->completion_callback(mrecv_req); @@ -921,6 +1046,8 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl, uint64_t msgflags = FI_CLAIM | FI_COMPLETION; struct ompi_communicator_t *comm = (*message)->comm; + ompi_mtl_ofi_set_mr_null(ofi_req); + ctxt_id = ompi_mtl_ofi_map_comm_to_ctxt(comm->c_contextid); set_thread_context(ctxt_id); @@ -941,13 +1068,18 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl, ofi_req->status.MPI_ERROR = OMPI_SUCCESS; ofi_req->mrecv_req = mtl_request; + ompi_ret = ompi_mtl_ofi_register_buffer(convertor, ofi_req, start); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) { + return ompi_ret; + } + /** * fi_trecvmsg with FI_CLAIM */ iov.iov_base = start; iov.iov_len = length; msg.msg_iov = &iov; - msg.desc = NULL; + msg.desc = (NULL == ofi_req->mr) ? 
NULL : ofi_req->mr->mem_desc; msg.iov_count = 1; msg.addr = 0; msg.tag = ofi_req->match_bits; @@ -957,6 +1089,7 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl, MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep, &msg, msgflags), ret); if (OPAL_UNLIKELY(0 > ret)) { + ompi_mtl_ofi_deregister_and_free_buffer(ofi_req); MTL_OFI_LOG_FI_ERR(ret, "fi_trecvmsg failed"); return ompi_mtl_ofi_get_error(ret); } diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index 908be25fbc9..a57dd51f571 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -20,6 +20,9 @@ #include "opal/util/argv.h" #include "opal/util/printf.h" #include "opal/mca/common/ofi/common_ofi.h" +#if OPAL_CUDA_SUPPORT +#include "opal/mca/common/cuda/common_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ static int ompi_mtl_ofi_component_open(void); static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority); @@ -297,6 +300,9 @@ ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority) static int ompi_mtl_ofi_component_close(void) { +#if OPAL_CUDA_SUPPORT + mca_common_cuda_fini(); +#endif opal_common_ofi_mca_deregister(); return OMPI_SUCCESS; } @@ -591,6 +597,15 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, exclude_list = opal_argv_split(*opal_common_ofi.prov_exclude, ','); } + /** + * Note: API version 1.5 is the first version that supports + * FI_LOCAL_COMM / FI_REMOTE_COMM checking (and we definitely need + * that checking -- e.g., the shared memory provider supports + * intranode communication (FI_LOCAL_COMM), but not internode + * (FI_REMOTE_COMM), which is insufficient for MTL selection). 
+ */ + fi_version = FI_VERSION(1, 5); + /** * Hints to filter providers * See man fi_getinfo for a list of all filters @@ -608,15 +623,27 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, __FILE__, __LINE__); goto error; } + +#if OPAL_CUDA_SUPPORT + /** If Open MPI is built with CUDA, request device transfer + * capabilities */ + hints->caps |= FI_HMEM; + hints->domain_attr->mr_mode |= FI_MR_HMEM; + /** + * Note: API version 1.9 is the first version that supports FI_HMEM + */ + fi_version = FI_VERSION(1, 9); +#endif /* OPAL_CUDA_SUPPORT */ + /* Make sure to get a RDM provider that can do the tagged matching interface and local communication and remote communication. */ hints->mode = FI_CONTEXT; hints->ep_attr->type = FI_EP_RDM; - hints->caps = FI_TAGGED | FI_LOCAL_COMM | FI_REMOTE_COMM | FI_DIRECTED_RECV; + hints->caps |= FI_TAGGED | FI_LOCAL_COMM | FI_REMOTE_COMM | FI_DIRECTED_RECV; hints->tx_attr->msg_order = FI_ORDER_SAS; hints->rx_attr->msg_order = FI_ORDER_SAS; - hints->rx_attr->op_flags = FI_COMPLETION; - hints->tx_attr->op_flags = FI_COMPLETION; + hints->rx_attr->op_flags = FI_COMPLETION; + hints->tx_attr->op_flags = FI_COMPLETION; if (enable_mpi_threads) { ompi_mtl_ofi.mpi_thread_multiple = true; @@ -660,18 +687,6 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, hints->domain_attr->resource_mgmt = FI_RM_ENABLED; - /** - * FI_VERSION provides binary backward and forward compatibility support - * Specify the version of OFI is coded to, the provider will select struct - * layouts that are compatible with this version. - * - * Note: API version 1.5 is the first version that supports - * FI_LOCAL_COMM / FI_REMOTE_COMM checking (and we definitely need - * that checking -- e.g., some providers are suitable for RXD or - * RXM, but can't provide local communication). 
- */ - fi_version = FI_VERSION(1, 5); - /** * The EFA provider in Libfabric versions prior to 1.10 contains a bug * where the FI_LOCAL_COMM and FI_REMOTE_COMM capabilities are not @@ -762,6 +777,15 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, opal_argv_free(exclude_list); exclude_list = NULL; +#if OPAL_CUDA_SUPPORT + if (!(prov->caps & FI_HMEM)) { + opal_output_verbose(1, opal_common_ofi.output, + "%s:%d: Libfabric provider does not support CUDA buffers\n", + __FILE__, __LINE__); + goto error; + } +#endif /* OPAL_CUDA_SUPPORT */ + /** * Select the format of the OFI tag */ @@ -1037,6 +1061,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, */ ompi_mtl_ofi.any_addr = FI_ADDR_UNSPEC; +#if OPAL_CUDA_SUPPORT + mca_common_cuda_stage_one_init(); +#endif + return &ompi_mtl_ofi.base; error: diff --git a/ompi/mca/mtl/ofi/mtl_ofi_request.h b/ompi/mca/mtl/ofi/mtl_ofi_request.h index 15bbd2b0148..d1c05e9680e 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_request.h +++ b/ompi/mca/mtl/ofi/mtl_ofi_request.h @@ -87,6 +87,10 @@ struct ompi_mtl_ofi_request_t { /** Pointer to Mrecv request to complete */ struct mca_mtl_request_t *mrecv_req; + + /** Stores reference to memory region from registration */ + /* Set to NULL if memory not registered or if non CUDA buffer */ + struct fid_mr *mr; }; typedef struct ompi_mtl_ofi_request_t ompi_mtl_ofi_request_t; diff --git a/ompi/mca/mtl/portals4/mtl_portals4_component.c b/ompi/mca/mtl/portals4/mtl_portals4_component.c index f60cb3cfa18..132358b5638 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_component.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_component.c @@ -428,6 +428,12 @@ ompi_mtl_portals4_component_init(bool enable_progress_threads, id.phys.nid, id.phys.pid)); ompi_mtl_portals4.base.mtl_max_tag = MTL_PORTALS4_MAX_TAG; + + /* Disable opal from checking if buffer being sent is cuda */ +#if OPAL_CUDA_SUPPORT + ompi_mtl_portals4.base.mtl_flags |= MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE; +#endif /* OPAL_CUDA_SUPPORT */ + 
return &ompi_mtl_portals4.base; error: diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index bba219922ad..6f18572d763 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -51,7 +51,7 @@ #include "opal/align.h" #include "opal/util/sys_limits.h" #if OPAL_CUDA_SUPPORT -#include "opal/datatype/opal_datatype_cuda.h" +#include "opal/mca/common/cuda/common_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ #include "opal/util/info_subscriber.h" diff --git a/ompi/mca/pml/cm/pml_cm.h b/ompi/mca/pml/cm/pml_cm.h index b3c06eb83bf..fa563e0b313 100644 --- a/ompi/mca/pml/cm/pml_cm.h +++ b/ompi/mca/pml/cm/pml_cm.h @@ -379,7 +379,16 @@ mca_pml_cm_send(const void *buf, convertor.pBaseBuf = (unsigned char*)buf + datatype->super.true_lb; convertor.count = count; convertor.pDesc = &datatype->super; - } else + +#if OPAL_CUDA_SUPPORT + /* Switches off CUDA detection if + MTL set MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE during init */ + MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count); + convertor.flags |= flags; + /* Sets CONVERTOR_CUDA flag if CUDA buffer */ + opal_convertor_prepare_for_send( &convertor, &datatype->super, count, buf ); +#endif + } else #endif { ompi_proc = ompi_comm_peer_lookup(comm, dst); diff --git a/ompi/mca/pml/cm/pml_cm_sendreq.h b/ompi/mca/pml/cm/pml_cm_sendreq.h index 3560270b99f..c229f3e6899 100644 --- a/ompi/mca/pml/cm/pml_cm_sendreq.h +++ b/ompi/mca/pml/cm/pml_cm_sendreq.h @@ -242,6 +242,14 @@ do { \ (unsigned char*)buf + datatype->super.true_lb; \ (req_send)->req_base.req_convertor.count = count; \ (req_send)->req_base.req_convertor.pDesc = &datatype->super; \ + /* Switches off CUDA detection if \ + MTL set MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE during init */ \ + MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count); \ + (req_send)->req_base.req_convertor.flags |= flags; \ + /* Sets CONVERTOR_CUDA flag if CUDA buffer */ \ + opal_convertor_prepare_for_send( \ + 
&req_send->req_base.req_convertor, \ + &datatype->super, count, buf ); \ } else { \ MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count); \ opal_convertor_copy_and_prepare_for_send( \ diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c index 51239e1210c..3c1e2762e85 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c @@ -50,7 +50,6 @@ #include "pml_ob1_sendreq.h" #include "pml_ob1_hdr.h" #if OPAL_CUDA_SUPPORT -#include "opal/datatype/opal_datatype_cuda.h" #include "opal/mca/common/cuda/common_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index 1b798125c4b..a752f34463d 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -43,7 +43,6 @@ #include "ompi/mca/bml/base/base.h" #if OPAL_CUDA_SUPPORT -#include "opal/datatype/opal_datatype_cuda.h" #include "opal/mca/common/cuda/common_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/Makefile.am b/opal/Makefile.am index e7484cd8c51..8c208632dd2 100644 --- a/opal/Makefile.am +++ b/opal/Makefile.am @@ -22,8 +22,10 @@ # $HEADER$ # +if OPAL_cuda_support SUBDIRS = \ include \ + mca/common/cuda \ datatype \ etc \ util \ @@ -32,21 +34,43 @@ SUBDIRS = \ $(MCA_opal_FRAMEWORK_COMPONENT_STATIC_SUBDIRS) \ . \ $(MCA_opal_FRAMEWORK_COMPONENT_DSO_SUBDIRS) - # libltdl is included by variable because if --disable-dlopen was # used, there will be no generated Makefile in that directory (and # therefore make distclean will fail). DIST_SUBDIRS = \ include \ + mca/common/cuda \ datatype \ etc \ util \ mca/base \ $(MCA_opal_FRAMEWORKS_SUBDIRS) \ $(MCA_opal_FRAMEWORK_COMPONENT_ALL_SUBDIRS) +else +SUBDIRS = \ + include \ + datatype \ + etc \ + util \ + mca/base \ + $(MCA_opal_FRAMEWORKS_SUBDIRS) \ + $(MCA_opal_FRAMEWORK_COMPONENT_STATIC_SUBDIRS) \ + . 
\ + $(MCA_opal_FRAMEWORK_COMPONENT_DSO_SUBDIRS) +# libltdl is included by variable because if --disable-dlopen was +# used, there will be no generated Makefile in that directory (and +# therefore make distclean will fail). +DIST_SUBDIRS = \ + include \ + datatype \ + etc \ + util \ + mca/base \ + $(MCA_opal_FRAMEWORKS_SUBDIRS) \ + $(MCA_opal_FRAMEWORK_COMPONENT_ALL_SUBDIRS) +endif # Build the main OPAL library - lib_LTLIBRARIES = lib@OPAL_LIB_PREFIX@open-pal.la lib@OPAL_LIB_PREFIX@open_pal_la_SOURCES = lib@OPAL_LIB_PREFIX@open_pal_la_LIBADD = \ @@ -62,6 +86,12 @@ lib@OPAL_LIB_PREFIX@open_pal_la_DEPENDENCIES = \ mca/base/libmca_base.la \ util/libopalutil.la \ $(MCA_opal_FRAMEWORK_LIBS) +if OPAL_cuda_support +lib@OPAL_LIB_PREFIX@open_pal_la_LIBADD += \ + mca/common/cuda/libmca_common_cuda.la +lib@OPAL_LIB_PREFIX@open_pal_la_DEPENDENCIES += \ + mca/common/cuda/libmca_common_cuda.la +endif lib@OPAL_LIB_PREFIX@open_pal_la_LDFLAGS = -version-info $(libopen_pal_so_version) \ $(opal_libevent_LDFLAGS) \ $(opal_hwloc_LDFLAGS) \ diff --git a/opal/datatype/Makefile.am b/opal/datatype/Makefile.am index 36d13eff3b5..340800a6be3 100644 --- a/opal/datatype/Makefile.am +++ b/opal/datatype/Makefile.am @@ -78,8 +78,3 @@ opaldir = $(opalincludedir)/$(subdir) opal_HEADERS = $(headers) endif -# If we have cuda support, modify file list and flags -if OPAL_cuda_support -libdatatype_la_SOURCES += opal_datatype_cuda.c -headers += opal_datatype_cuda.h -endif diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index 853e5b1632f..36736a3007f 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -39,7 +39,7 @@ #include "opal/datatype/opal_datatype_prototypes.h" #include "opal/datatype/opal_convertor_internal.h" #if OPAL_CUDA_SUPPORT -#include "opal/datatype/opal_datatype_cuda.h" +#include "opal/mca/common/cuda/common_cuda.h" #define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \ CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) 
#endif diff --git a/opal/datatype/opal_datatype_copy.c b/opal/datatype/opal_datatype_copy.c index c70bdd24dfa..e13d17fa59c 100644 --- a/opal/datatype/opal_datatype_copy.c +++ b/opal/datatype/opal_datatype_copy.c @@ -72,7 +72,7 @@ static size_t opal_datatype_memop_block_size = 128 * 1024; #include "opal_datatype_copy.h" #if OPAL_CUDA_SUPPORT -#include "opal_datatype_cuda.h" +#include "opal/mca/common/cuda/common_cuda.h" #undef MEM_OP_NAME #define MEM_OP_NAME non_overlap_cuda diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c deleted file mode 100644 index 7869f17e909..00000000000 --- a/opal/datatype/opal_datatype_cuda.c +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include -#include -#include - -#include "opal/align.h" -#include "opal/util/output.h" -#include "opal/datatype/opal_convertor.h" -#include "opal/datatype/opal_datatype_cuda.h" - -static bool initialized = false; -int opal_cuda_verbose = 0; -static int opal_cuda_enabled = 0; /* Starts out disabled */ -static int opal_cuda_output = 0; -static void opal_cuda_support_init(void); -static int (*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL; -static opal_common_cuda_function_table_t ftable; - -/* This function allows the common cuda code to register an - * initialization function that gets called the first time an attempt - * is made to send or receive a GPU pointer. This allows us to delay - * some CUDA initialization until after MPI_Init(). - */ -void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)) { - common_cuda_initialization_function = fptr; -} - -/** - * This function is called when a convertor is instantiated. It has to call - * the opal_cuda_support_init() function once to figure out if CUDA support - * is enabled or not. 
If CUDA is not enabled, then short circuit out - * for all future calls. - */ -void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf) -{ - /* Only do the initialization on the first GPU access */ - if (!initialized) { - opal_cuda_support_init(); - } - - /* This is needed to handle case where convertor is not fully initialized - * like when trying to do a sendi with convertor on the statck */ - convertor->cbmemcpy = (memcpy_fct_t)&opal_cuda_memcpy; - - /* If not enabled, then nothing else to do */ - if (!opal_cuda_enabled) { - return; - } - - if (ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) { - convertor->flags |= CONVERTOR_CUDA; - } -} - -/* Checks the type of pointer - * - * @param dest One pointer to check - * @param source Another pointer to check - */ -bool opal_cuda_check_bufs(char *dest, char *src) -{ - /* Only do the initialization on the first GPU access */ - if (!initialized) { - opal_cuda_support_init(); - } - - if (!opal_cuda_enabled) { - return false; - } - - if (ftable.gpu_is_gpu_buffer(dest, NULL) || ftable.gpu_is_gpu_buffer(src, NULL)) { - return true; - } else { - return false; - } -} - -/* - * With CUDA enabled, all contiguous copies will pass through this function. - * Therefore, the first check is to see if the convertor is a GPU buffer. - * Note that if there is an error with any of the CUDA calls, the program - * aborts as there is no recovering. - */ - -/* Checks the type of pointer - * - * @param buf check one pointer providing a convertor. - * Provides aditional information, e.g. managed vs. unmanaged GPU buffer - */ -bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor ) -{ - /* Only do the initialization on the first GPU access */ - if (!initialized) { - opal_cuda_support_init(); - } - - if (!opal_cuda_enabled) { - return false; - } - - return ( ftable.gpu_is_gpu_buffer(buf, convertor)); -} - -/* - * With CUDA enabled, all contiguous copies will pass through this function. 
- * Therefore, the first check is to see if the convertor is a GPU buffer. - * Note that if there is an error with any of the CUDA calls, the program - * aborts as there is no recovering. - */ - -void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t* convertor) -{ - int res; - - if (!(convertor->flags & CONVERTOR_CUDA)) { - return memcpy(dest, src, size); - } - - if (convertor->flags & CONVERTOR_CUDA_ASYNC) { - res = ftable.gpu_cu_memcpy_async(dest, (void *)src, size, convertor); - } else { - res = ftable.gpu_cu_memcpy(dest, (void *)src, size); - } - - if (res != 0) { - opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", - res, dest, src, (int)size); - abort(); - } else { - return dest; - } -} - -/* - * This function is needed in cases where we do not have contiguous - * datatypes. The current code has macros that cannot handle a convertor - * argument to the memcpy call. - */ -void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size) -{ - int res; - res = ftable.gpu_cu_memcpy(dest, src, size); - if (res != 0) { - opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", - res, dest, src, (int)size); - abort(); - } else { - return dest; - } -} - -/* - * In some cases, need an implementation of memmove. This is not fast, but - * it is not often needed. - */ -void *opal_cuda_memmove(void *dest, void *src, size_t size) -{ - int res; - - res = ftable.gpu_memmove(dest, src, size); - if(res != 0){ - opal_output(0, "CUDA: Error in gpu memmove: res=%d, dest=%p, src=%p, size=%d", - res, dest, src, (int)size); - abort(); - } - return dest; -} - -/** - * This function gets called once to check if the program is running in a cuda - * environment. - */ -static void opal_cuda_support_init(void) -{ - if (initialized) { - return; - } - - /* Set different levels of verbosity in the cuda related code. 
*/ - opal_cuda_output = opal_output_open(NULL); - opal_output_set_verbosity(opal_cuda_output, opal_cuda_verbose); - - /* Callback into the common cuda initialization routine. This is only - * set if some work had been done already in the common cuda code.*/ - if (NULL != common_cuda_initialization_function) { - if (0 == common_cuda_initialization_function(&ftable)) { - opal_cuda_enabled = 1; - } - } - - if (1 == opal_cuda_enabled) { - opal_output_verbose(10, opal_cuda_output, - "CUDA: enabled successfully, CUDA device pointers will work"); - } else { - opal_output_verbose(10, opal_cuda_output, - "CUDA: not enabled, CUDA device pointers will not work"); - } - - initialized = true; -} - -/** - * Tell the convertor that copies will be asynchronous CUDA copies. The - * flags are cleared when the convertor is reinitialized. - */ -void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream) -{ - convertor->flags |= CONVERTOR_CUDA_ASYNC; - convertor->stream = stream; -} diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h deleted file mode 100644 index 2789320520a..00000000000 --- a/opal/datatype/opal_datatype_cuda.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2011-2014 NVIDIA Corporation. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef _OPAL_DATATYPE_CUDA_H -#define _OPAL_DATATYPE_CUDA_H - -/* Structure to hold CUDA support functions that gets filled in when the - * common cuda code is initialized. This removes any dependency on - * in the opal cuda datatype code. 
*/ -struct opal_common_cuda_function_table { - int (*gpu_is_gpu_buffer)(const void*, opal_convertor_t*); - int (*gpu_cu_memcpy_async)(void*, const void*, size_t, opal_convertor_t*); - int (*gpu_cu_memcpy)(void*, const void*, size_t); - int (*gpu_memmove)(void*, void*, size_t); -}; -typedef struct opal_common_cuda_function_table opal_common_cuda_function_table_t; - -void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf); -bool opal_cuda_check_bufs(char *dest, char *src); -bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor ); -void* opal_cuda_memcpy(void * dest, const void * src, size_t size, opal_convertor_t* convertor); -void* opal_cuda_memcpy_sync(void * dest, const void * src, size_t size); -void* opal_cuda_memmove(void * dest, void * src, size_t size); -void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)); -void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream); - -#endif diff --git a/opal/datatype/opal_datatype_pack_unpack_predefined.h b/opal/datatype/opal_datatype_pack_unpack_predefined.h index c516feb511d..be98cf2716c 100644 --- a/opal/datatype/opal_datatype_pack_unpack_predefined.h +++ b/opal/datatype/opal_datatype_pack_unpack_predefined.h @@ -54,7 +54,7 @@ #define OPAL_DATATYPE_PACK_UNPACK_PREDEFINED_H_HAS_BEEN_INCLUDED #include "opal_config.h" -#include "opal/datatype/opal_datatype_cuda.h" +#include "opal/mca/common/cuda/common_cuda.h" #include /* Improve predefined pack/unpack performance using mpich methods. 
diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index 038296e81a3..81e6a134486 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -33,7 +33,6 @@ #include "opal/align.h" #include "opal/datatype/opal_convertor.h" -#include "opal/datatype/opal_datatype_cuda.h" #include "opal/util/output.h" #include "opal/util/show_help.h" #include "opal/util/proc.h" @@ -79,7 +78,7 @@ struct cudaFunctionTable { int (*cuPointerGetAttribute)(void *, CUpointer_attribute, CUdeviceptr); int (*cuMemcpyAsync)(CUdeviceptr, CUdeviceptr, size_t, CUstream); int (*cuMemcpy)(CUdeviceptr, CUdeviceptr, size_t); - int (*cuMemAlloc)(CUdeviceptr *, unsigned int); + int (*cuMemAlloc)(CUdeviceptr *, size_t); int (*cuMemFree)(CUdeviceptr buf); int (*cuCtxGetCurrent)(void *cuContext); int (*cuStreamCreate)(CUstream *, int); @@ -501,6 +500,8 @@ static int mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t *fta ftable->gpu_cu_memcpy_async = &mca_common_cuda_cu_memcpy_async; ftable->gpu_cu_memcpy = &mca_common_cuda_cu_memcpy; ftable->gpu_memmove = &mca_common_cuda_memmove; + ftable->gpu_malloc = &mca_common_cuda_malloc; + ftable->gpu_free = &mca_common_cuda_free; opal_output_verbose(30, mca_common_cuda_output, "CUDA: support functions initialized"); @@ -1922,6 +1923,35 @@ static int mca_common_cuda_cu_memcpy(void *dest, const void *src, size_t size) return OPAL_SUCCESS; } +int mca_common_cuda_malloc(void **dptr, size_t size) +{ + int res; + if (size > 0) { + res = cuFunc.cuMemAlloc((CUdeviceptr *)dptr, size); + if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { + opal_output(0, "CUDA: cuMemAlloc failed: res=%d", + res); + return res; + } + } + return 0; +} + +int mca_common_cuda_free(void *dptr) +{ + int res; + if (NULL != dptr) { + res = cuFunc.cuMemFree((CUdeviceptr)dptr); + if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) { + opal_output(0, "CUDA: cuMemFree failed: res=%d", + res); + return res; + } + } + return 0; +} + +
static int mca_common_cuda_memmove(void *dest, void *src, size_t size) { CUdeviceptr tmp; @@ -2069,4 +2099,258 @@ void mca_common_cuda_get_buffer_id(mca_rcache_base_registration_t *reg) true, OPAL_PROC_MY_HOSTNAME, res, dbuf); } } #endif /* OPAL_CUDA_GDR_SUPPORT */ + +static bool initialized = false; +int opal_cuda_verbose = 0; +static int opal_cuda_enabled = 0; /* Starts out disabled */ +static int opal_cuda_output = 0; +static void opal_cuda_support_init(void); +static int (*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL; +static opal_common_cuda_function_table_t ftable; + +/* This function allows the common cuda code to register an + * initialization function that gets called the first time an attempt + * is made to send or receive a GPU pointer. This allows us to delay + * some CUDA initialization until after MPI_Init(). + */ +void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)) { + common_cuda_initialization_function = fptr; +} + +/** + * This function is called when a convertor is instantiated. It has to call + * the opal_cuda_support_init() function once to figure out if CUDA support + * is enabled or not. If CUDA is not enabled, then short circuit out + * for all future calls.
+ */ +void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf) +{ + /* Only do the initialization on the first GPU access */ + if (!initialized) { + opal_cuda_support_init(); + } + + /* This is needed to handle case where convertor is not fully initialized + * like when trying to do a sendi with convertor on the stack */ + convertor->cbmemcpy = (memcpy_fct_t)&opal_cuda_memcpy; + + /* If not enabled, then nothing else to do */ + if (!opal_cuda_enabled) { + return; + } + + if (ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) { + convertor->flags |= CONVERTOR_CUDA; + } +} + +/* Checks the type of pointer + * + * @param dest One pointer to check + * @param src Another pointer to check + */ +bool opal_cuda_check_bufs(char *dest, char *src) +{ + /* Only do the initialization on the first GPU access */ + if (!initialized) { + opal_cuda_support_init(); + } + + if (!opal_cuda_enabled) { + return false; + } + + if (ftable.gpu_is_gpu_buffer(dest, NULL) || ftable.gpu_is_gpu_buffer(src, NULL)) { + return true; + } else { + return false; + } +} + +/* + * With CUDA enabled, all contiguous copies will pass through this function. + * Therefore, the first check is to see if the convertor is a GPU buffer. + * Note that if there is an error with any of the CUDA calls, the program + * aborts as there is no recovering. + */ + +/* Checks the type of pointer + * + * @param buf check one pointer providing a convertor. + * Provides additional information, e.g. managed vs. unmanaged GPU buffer + */ +bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor ) +{ + /* Only do the initialization on the first GPU access */ + if (!initialized) { + opal_cuda_support_init(); + } + + if (!opal_cuda_enabled) { + return false; + } + + return ( ftable.gpu_is_gpu_buffer(buf, convertor)); +} + +/* + * This function allocates a buffer using either cuMemAlloc + * or malloc, depending on if the convertor flag CONVERTOR_CUDA + * is set.
+ * + * @param size Size of buffer to be allocated + * @param convertor The convertor with flags describing if the buf + * should be a Host or Cuda buffer. + * + * @returns void * A pointer to the newly allocated buffer; aborts if the CUDA allocation fails. + */ +void *opal_cuda_malloc(size_t size, opal_convertor_t* convertor) +{ + int res; + void* buffer = NULL; + if (!(convertor->flags & CONVERTOR_CUDA)) { + return malloc(size); + } + res = ftable.gpu_malloc(&buffer, size); + if (res != 0 ) { + opal_output(0, "CUDA: Error in cuMemAlloc: size=%d", + (int)size); + abort(); + } else { + return buffer; + } +} + +/* + * This function frees a buffer using either cuMemFree() or free(), + * depending on if the convertor flag CONVERTOR_CUDA is set. + * + * @param buffer Pointer to buffer to be freed + * @param convertor The convertor with flags describing if the buf + * should be a Host or Cuda buffer. + * + */ +void opal_cuda_free(void *buffer, opal_convertor_t* convertor) +{ + int res; + if (!(convertor->flags & CONVERTOR_CUDA)) { + free(buffer); + return; + } + res = ftable.gpu_free(buffer); + if (res != 0 ) { + opal_output(0, "CUDA: Error in cuMemFree: ptr=%p", + buffer); + abort(); + } + return; +} + +/* + * With CUDA enabled, all contiguous copies will pass through this function. + * Therefore, the first check is to see if the convertor is a GPU buffer. + * Note that if there is an error with any of the CUDA calls, the program + * aborts as there is no recovering.
+ */ + +void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t* convertor) +{ + int res; + + if (!(convertor->flags & CONVERTOR_CUDA)) { + return memcpy(dest, src, size); + } + + if (convertor->flags & CONVERTOR_CUDA_ASYNC) { + res = ftable.gpu_cu_memcpy_async(dest, (void *)src, size, convertor); + } else { + res = ftable.gpu_cu_memcpy(dest, (void *)src, size); + } + + if (res != 0) { + opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", + res, dest, src, (int)size); + abort(); + } else { + return dest; + } +} + +/* + * This function is needed in cases where we do not have contiguous + * datatypes. The current code has macros that cannot handle a convertor + * argument to the memcpy call. + */ +void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size) +{ + int res; + res = ftable.gpu_cu_memcpy(dest, src, size); + if (res != 0) { + opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", + res, dest, src, (int)size); + abort(); + } else { + return dest; + } +} + +/* + * In some cases, need an implementation of memmove. This is not fast, but + * it is not often needed. + */ +void *opal_cuda_memmove(void *dest, void *src, size_t size) +{ + int res; + + res = ftable.gpu_memmove(dest, src, size); + if(res != 0){ + opal_output(0, "CUDA: Error in gpu memmove: res=%d, dest=%p, src=%p, size=%d", + res, dest, src, (int)size); + abort(); + } + return dest; +} + +/** + * This function gets called once to check if the program is running in a cuda + * environment. + */ +static void opal_cuda_support_init(void) +{ + if (initialized) { + return; + } + + /* Set different levels of verbosity in the cuda related code. */ + opal_cuda_output = opal_output_open(NULL); + opal_output_set_verbosity(opal_cuda_output, opal_cuda_verbose); + + /* Callback into the common cuda initialization routine.
This is only + * set if some work had been done already in the common cuda code.*/ + if (NULL != common_cuda_initialization_function) { + if (0 == common_cuda_initialization_function(&ftable)) { + opal_cuda_enabled = 1; + } + } + + if (1 == opal_cuda_enabled) { + opal_output_verbose(10, opal_cuda_output, + "CUDA: enabled successfully, CUDA device pointers will work"); + } else { + opal_output_verbose(10, opal_cuda_output, + "CUDA: not enabled, CUDA device pointers will not work"); + } + + initialized = true; +} + +/** + * Tell the convertor that copies will be asynchronous CUDA copies. The + * flags are cleared when the convertor is reinitialized. + */ +void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream) +{ + convertor->flags |= CONVERTOR_CUDA_ASYNC; + convertor->stream = stream; +} diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index 3ff95405299..695201bc134 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -52,6 +52,9 @@ OPAL_DECLSPEC void mca_common_cuda_unregister(void *ptr, char *msg); OPAL_DECLSPEC void mca_common_wait_stream_synchronize(mca_rcache_common_cuda_reg_t *rget_reg); +OPAL_DECLSPEC int mca_common_cuda_malloc(void **buffer, size_t size); +OPAL_DECLSPEC int mca_common_cuda_free(void *buffer); + OPAL_DECLSPEC int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg, struct mca_btl_base_descriptor_t *, int *done); @@ -108,4 +111,28 @@ static inline int32_t opal_convertor_cuda_need_buffers( opal_convertor_t* pConve return retval; } +/* Structure to hold CUDA support functions that gets filled in when the + * common cuda code is initialized. This removes any dependency on <cuda.h> + * in the opal cuda datatype code.
*/ +struct opal_common_cuda_function_table { + int (*gpu_is_gpu_buffer)(const void*, opal_convertor_t*); + int (*gpu_cu_memcpy_async)(void*, const void*, size_t, opal_convertor_t*); + int (*gpu_cu_memcpy)(void*, const void*, size_t); + int (*gpu_memmove)(void*, void*, size_t); + int (*gpu_malloc)(void**, size_t); + int (*gpu_free)(void*); +}; +typedef struct opal_common_cuda_function_table opal_common_cuda_function_table_t; + +void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf); +bool opal_cuda_check_bufs(char *dest, char *src); +bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor ); +void* opal_cuda_malloc(size_t size, opal_convertor_t* convertor); +void opal_cuda_free(void * buffer, opal_convertor_t* convertor); +void* opal_cuda_memcpy(void * dest, const void * src, size_t size, opal_convertor_t* convertor); +void* opal_cuda_memcpy_sync(void * dest, const void * src, size_t size); +void* opal_cuda_memmove(void * dest, void * src, size_t size); +void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)); +void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream); + #endif /* OPAL_MCA_COMMON_CUDA_H */ diff --git a/opal/mca/common/cuda/configure.m4 b/opal/mca/common/cuda/configure.m4 index 6083e087a3b..5bda090e26f 100644 --- a/opal/mca/common/cuda/configure.m4 +++ b/opal/mca/common/cuda/configure.m4 @@ -27,8 +27,4 @@ AC_DEFUN([MCA_opal_common_cuda_CONFIG],[ [$1], [$2]) - # Copy over the includes needed to build CUDA - common_cuda_CPPFLAGS=$opal_datatype_cuda_CPPFLAGS - AC_SUBST([common_cuda_CPPFLAGS]) - ])dnl