open-mpi · rajachan · Mar 9, 2021 · Feb 16, 2021 · Jan 27, 2021 · Jan 27, 2021
diff --git a/config/opal_check_cuda.m4 b/config/opal_check_cuda.m4
@@ -117,8 +117,8 @@ AC_MSG_CHECKING([if have cuda support])
 if test "$opal_check_cuda_happy" = "yes"; then
     AC_MSG_RESULT([yes (-I$opal_cuda_incdir)])
     CUDA_SUPPORT=1
-    opal_datatype_cuda_CPPFLAGS="-I$opal_cuda_incdir"
-    AC_SUBST([opal_datatype_cuda_CPPFLAGS])
+    common_cuda_CPPFLAGS="-I$opal_cuda_incdir"
+    AC_SUBST([common_cuda_CPPFLAGS])
 else
     AC_MSG_RESULT([no])
     CUDA_SUPPORT=0

diff --git a/ompi/mca/coll/cuda/coll_cuda_allreduce.c b/ompi/mca/coll/cuda/coll_cuda_allreduce.c
@@ -17,7 +17,7 @@
 
 #include "ompi/op/op.h"
 #include "opal/datatype/opal_convertor.h"
-#include "opal/datatype/opal_datatype_cuda.h"
+#include "opal/mca/common/cuda/common_cuda.h"
 
 /*
  *	allreduce_intra

diff --git a/ompi/mca/coll/cuda/coll_cuda_exscan.c b/ompi/mca/coll/cuda/coll_cuda_exscan.c
@@ -17,7 +17,7 @@
 
 #include "ompi/op/op.h"
 #include "opal/datatype/opal_convertor.h"
-#include "opal/datatype/opal_datatype_cuda.h"
+#include "opal/mca/common/cuda/common_cuda.h"
 
 int mca_coll_cuda_exscan(const void *sbuf, void *rbuf, int count,
                          struct ompi_datatype_t *dtype,

diff --git a/ompi/mca/coll/cuda/coll_cuda_reduce.c b/ompi/mca/coll/cuda/coll_cuda_reduce.c
@@ -17,7 +17,7 @@
 
 #include "ompi/op/op.h"
 #include "opal/datatype/opal_convertor.h"
-#include "opal/datatype/opal_datatype_cuda.h"
+#include "opal/mca/common/cuda/common_cuda.h"
 
 /*
  *	reduce_log_inter

diff --git a/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c b/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c
@@ -17,7 +17,7 @@
 
 #include "ompi/op/op.h"
 #include "opal/datatype/opal_convertor.h"
-#include "opal/datatype/opal_datatype_cuda.h"
+#include "opal/mca/common/cuda/common_cuda.h"
 
 /*
  *	reduce_scatter_block

diff --git a/ompi/mca/coll/cuda/coll_cuda_scan.c b/ompi/mca/coll/cuda/coll_cuda_scan.c
@@ -17,7 +17,7 @@
 
 #include "ompi/op/op.h"
 #include "opal/datatype/opal_convertor.h"
-#include "opal/datatype/opal_datatype_cuda.h"
+#include "opal/mca/common/cuda/common_cuda.h"
 
 /*
  *	scan

diff --git a/ompi/mca/coll/libnbc/nbc_internal.h b/ompi/mca/coll/libnbc/nbc_internal.h
@@ -31,7 +31,7 @@
 #include "coll_libnbc.h"
 #if OPAL_CUDA_SUPPORT
 #include "opal/datatype/opal_convertor.h"
-#include "opal/datatype/opal_datatype_cuda.h"
+#include "opal/mca/common/cuda/common_cuda.h"
 #endif /* OPAL_CUDA_SUPPORT */
 #include "ompi/include/ompi/constants.h"
 #include "ompi/request/request.h"

diff --git a/ompi/mca/common/ompio/common_ompio_buffer.c b/ompi/mca/common/ompio/common_ompio_buffer.c
@@ -20,7 +20,6 @@
 #include "ompi_config.h"
 
 #include "opal/datatype/opal_convertor.h"
-#include "opal/datatype/opal_datatype_cuda.h"
 #include "opal/mca/common/cuda/common_cuda.h"
 #include "opal/util/sys_limits.h"
 

diff --git a/ompi/mca/mtl/base/mtl_base_datatype.h b/ompi/mca/mtl/base/mtl_base_datatype.h
@@ -25,16 +25,82 @@
 #include "ompi/datatype/ompi_datatype.h"
 #include "opal/datatype/opal_convertor.h"
 #include "opal/datatype/opal_datatype_internal.h"
+#if OPAL_CUDA_SUPPORT
+#include "opal/mca/common/cuda/common_cuda.h"
+#include "opal/datatype/opal_convertor.h"
+#endif
 
 #ifndef MTL_BASE_DATATYPE_H_INCLUDED
 #define MTL_BASE_DATATYPE_H_INCLUDED
 
+#if OPAL_CUDA_SUPPORT
+/* Backend of ompi_mtl_datatype_pack() for builds with OPAL_CUDA_SUPPORT:
+ * reports where the data to send starts (*buffer) and its size (*buffer_len).
+ * *freeAfter becomes true only when *buffer came from opal_cuda_malloc().
+ * Returns OMPI_SUCCESS (OPAL_SUCCESS on the contiguous shortcut), or
+ * OMPI_ERR_OUT_OF_RESOURCE when that allocation fails. */
+static int
+ompi_mtl_cuda_datatype_pack(struct opal_convertor_t *convertor,
+                            void **buffer,
+                            size_t *buffer_len,
+                            bool *freeAfter)
+{
+    const int had_cuda_flag = convertor->flags & CONVERTOR_CUDA;
+    struct iovec pack_iov;
+    uint32_t pack_iov_cnt = 1;
+    bool need_alloc;
+
+#if !(OPAL_ENABLE_HETEROGENEOUS_SUPPORT)
+    if (convertor->pDesc && !(convertor->flags & CONVERTOR_COMPLETED) &&
+        opal_datatype_is_contiguous_memory_layout(convertor->pDesc,
+                                                  convertor->count)) {
+        *freeAfter = false;
+        *buffer = convertor->pBaseBuf;
+        *buffer_len = convertor->local_size;
+        return OPAL_SUCCESS;
+    }
+#endif
+
+    opal_convertor_get_packed_size(convertor, buffer_len);
+    *freeAfter = false;
+    if (0 == *buffer_len) {
+        *buffer = NULL;
+        return OMPI_SUCCESS;
+    }
+    pack_iov.iov_len = *buffer_len;
+    pack_iov.iov_base = NULL;
+    /* opal_convertor_need_buffers always returns true while CONVERTOR_CUDA is
+     * set, so query it with the flag cleared and restore the flag afterwards */
+    convertor->flags &= ~CONVERTOR_CUDA;
+    need_alloc = opal_convertor_need_buffers(convertor);
+    if (had_cuda_flag) {
+        convertor->flags |= CONVERTOR_CUDA;
+    }
+    if (need_alloc) {
+        pack_iov.iov_base = opal_cuda_malloc(*buffer_len, convertor);
+        if (NULL == pack_iov.iov_base) {
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+        *freeAfter = true;
+    }
+    opal_convertor_pack(convertor, &pack_iov, &pack_iov_cnt, buffer_len);
+    *buffer = pack_iov.iov_base;
+    return OMPI_SUCCESS;
+}
+#endif
+
 __opal_attribute_always_inline__ static inline int
 ompi_mtl_datatype_pack(struct opal_convertor_t *convertor,
                        void **buffer,
                        size_t *buffer_len,
                        bool *freeAfter)
 {
+#if OPAL_CUDA_SUPPORT
+    return ompi_mtl_cuda_datatype_pack(convertor,
+                                       buffer,
+                                       buffer_len,
+                                       freeAfter);
+#endif
     struct iovec iov;
     uint32_t iov_count = 1;
 
@@ -71,13 +137,56 @@ ompi_mtl_datatype_pack(struct opal_convertor_t *convertor,
     return OMPI_SUCCESS;
 }
 
+#if OPAL_CUDA_SUPPORT
+/* Backend of ompi_mtl_datatype_recv_buf() for builds with OPAL_CUDA_SUPPORT.
+ * opal_convertor_need_buffers always returns true while CONVERTOR_CUDA is set,
+ * so it is queried with the flag cleared.  *free_on_error is true only when
+ * *buffer came from opal_cuda_malloc(); failure: OMPI_ERR_OUT_OF_RESOURCE. */
+static int
+ompi_mtl_cuda_datatype_recv_buf(struct opal_convertor_t *convertor,
+                                void **buffer, size_t *buffer_len,
+                                bool *free_on_error)
+{
+    const int is_cuda = convertor->flags & CONVERTOR_CUDA;
+
+    opal_convertor_get_packed_size(convertor, buffer_len);
+    *free_on_error = false;
+    if (0 == *buffer_len) {
+        *buffer = NULL;
+        return OMPI_SUCCESS;
+    }
+    convertor->flags &= ~CONVERTOR_CUDA;
+    const bool need_alloc = opal_convertor_need_buffers(convertor);
+    if (is_cuda) {
+        convertor->flags |= CONVERTOR_CUDA;
+    }
+    if (!need_alloc) {
+        *buffer = convertor->pBaseBuf +
+            convertor->use_desc->desc[convertor->use_desc->used].end_loop.first_elem_disp;
+        return OMPI_SUCCESS;
+    }
+    *buffer = opal_cuda_malloc(*buffer_len, convertor);
+    if (NULL == *buffer) {
+        return OMPI_ERR_OUT_OF_RESOURCE; /* was: success with a NULL buffer */
+    }
+    *free_on_error = true;
+    return OMPI_SUCCESS;
+}
+#endif
 
 __opal_attribute_always_inline__ static inline int
 ompi_mtl_datatype_recv_buf(struct opal_convertor_t *convertor,
                            void ** buffer,
                            size_t *buffer_len,
                            bool *free_on_error)
 {
+#if OPAL_CUDA_SUPPORT
+    return ompi_mtl_cuda_datatype_recv_buf(convertor,
+                                           buffer,
+                                           buffer_len,
+                                           free_on_error);
+#endif
+
     opal_convertor_get_packed_size(convertor, buffer_len);
     *free_on_error = false;
     if( 0 == *buffer_len ) {
@@ -95,12 +204,48 @@ ompi_mtl_datatype_recv_buf(struct opal_convertor_t *convertor,
     return OMPI_SUCCESS;
 }
 
+#if OPAL_CUDA_SUPPORT
+/* Backend of ompi_mtl_datatype_unpack() for builds with OPAL_CUDA_SUPPORT.
+ * When buffer_len is non-zero and the convertor needs buffers, unpacks
+ * `buffer` through the convertor and then releases it with opal_cuda_free();
+ * otherwise nothing is unpacked or freed.  Always returns OMPI_SUCCESS. */
+static int
+ompi_mtl_cuda_datatype_unpack(struct opal_convertor_t *convertor,
+                              void *buffer,
+                              size_t buffer_len)
+{
+    const int had_cuda_flag = convertor->flags & CONVERTOR_CUDA;
+    bool do_unpack;
+
+    /* opal_convertor_need_buffers always returns true while CONVERTOR_CUDA is
+     * set, so query it with the flag cleared and restore the flag afterwards */
+    convertor->flags &= ~CONVERTOR_CUDA;
+    do_unpack = (buffer_len > 0) && opal_convertor_need_buffers(convertor);
+    if (had_cuda_flag) {
+        convertor->flags |= CONVERTOR_CUDA;
+    }
+    if (do_unpack) {
+        struct iovec unpack_iov = { .iov_base = buffer, .iov_len = buffer_len };
+        uint32_t unpack_iov_cnt = 1;
+
+        opal_convertor_unpack(convertor, &unpack_iov, &unpack_iov_cnt,
+                              &buffer_len);
+        opal_cuda_free(buffer, convertor);
+    }
+    return OMPI_SUCCESS;
+}
+#endif
 
 __opal_attribute_always_inline__ static inline int
 ompi_mtl_datatype_unpack(struct opal_convertor_t *convertor,
                          void *buffer,
                          size_t buffer_len)
 {
+#if OPAL_CUDA_SUPPORT
+    return ompi_mtl_cuda_datatype_unpack(convertor,
+                                         buffer,
+                                         buffer_len);
+#endif
     struct iovec iov;
     uint32_t iov_count = 1;
 

diff --git a/ompi/mca/mtl/ofi/configure.m4 b/ompi/mca/mtl/ofi/configure.m4
@@ -28,6 +28,17 @@ AC_DEFUN([MCA_ompi_mtl_ofi_CONFIG],[
     # Check for OFI
     OPAL_CHECK_OFI
 
+    # Check for CUDA
+    OPAL_CHECK_CUDA
+
+    # With CUDA support we require libfabric >= 1.9, the first release that
+    # provides FI_HMEM capabilities. Keep the opal_ofi_happy result from
+    # OPAL_CHECK_OFI: this check may only tighten it, never force it to "yes".
+    AS_IF([test "$opal_ofi_happy" = "yes" && test "$opal_check_cuda_happy" = "yes"],
+          [OPAL_CHECK_OFI_VERSION_GE([1,9],
+                                     [],
+                                     [opal_ofi_happy=no])])
+
     # The OFI MTL requires at least OFI libfabric v1.5.
     AS_IF([test "$opal_ofi_happy" = "yes"],
           [OPAL_CHECK_OFI_VERSION_GE([1,5],

diff --git a/ompi/mca/mtl/ofi/help-mtl-ofi.txt b/ompi/mca/mtl/ofi/help-mtl-ofi.txt
@@ -77,3 +77,12 @@ recoverable and your application is likely to abort.
   Error: %s (%d)
 [message too big]
 Message size %llu bigger than supported by selected transport. Max = %llu
+
+[Buffer Memory Registration Failed]
+Open MPI failed to register your buffer.
+This error is fatal; your job will abort.
+
+  Buffer Type: %s
+  Buffer Address: %p
+  Buffer Length: %d
+  Error: %s (%zd)