diff --git a/config/opal_check_cuda.m4 b/config/opal_check_cuda.m4
index 67059a8c851..be5aa777b3b 100644
--- a/config/opal_check_cuda.m4
+++ b/config/opal_check_cuda.m4
@@ -117,8 +117,8 @@ AC_MSG_CHECKING([if have cuda support])
 if test "$opal_check_cuda_happy" = "yes"; then
     AC_MSG_RESULT([yes (-I$opal_cuda_incdir)])
     CUDA_SUPPORT=1
-    opal_datatype_cuda_CPPFLAGS="-I$opal_cuda_incdir"
-    AC_SUBST([opal_datatype_cuda_CPPFLAGS])
+    common_cuda_CPPFLAGS="-I$opal_cuda_incdir"
+    AC_SUBST([common_cuda_CPPFLAGS])
 else
     AC_MSG_RESULT([no])
     CUDA_SUPPORT=0
diff --git a/ompi/mca/coll/cuda/coll_cuda_allreduce.c b/ompi/mca/coll/cuda/coll_cuda_allreduce.c
index 3c851c64782..416c9c7fa8f 100644
--- a/ompi/mca/coll/cuda/coll_cuda_allreduce.c
+++ b/ompi/mca/coll/cuda/coll_cuda_allreduce.c
@@ -17,7 +17,7 @@
 
 #include "ompi/op/op.h"
 #include "opal/datatype/opal_convertor.h"
-#include "opal/datatype/opal_datatype_cuda.h"
+#include "opal/mca/common/cuda/common_cuda.h"
 
 /*
  *	allreduce_intra
diff --git a/ompi/mca/coll/cuda/coll_cuda_exscan.c b/ompi/mca/coll/cuda/coll_cuda_exscan.c
index 23f5a5da839..5f736697fe0 100644
--- a/ompi/mca/coll/cuda/coll_cuda_exscan.c
+++ b/ompi/mca/coll/cuda/coll_cuda_exscan.c
@@ -17,7 +17,7 @@
 
 #include "ompi/op/op.h"
 #include "opal/datatype/opal_convertor.h"
-#include "opal/datatype/opal_datatype_cuda.h"
+#include "opal/mca/common/cuda/common_cuda.h"
 
 int mca_coll_cuda_exscan(const void *sbuf, void *rbuf, int count,
                          struct ompi_datatype_t *dtype,
diff --git a/ompi/mca/coll/cuda/coll_cuda_reduce.c b/ompi/mca/coll/cuda/coll_cuda_reduce.c
index 2bcce13c75c..5d82667b6bb 100644
--- a/ompi/mca/coll/cuda/coll_cuda_reduce.c
+++ b/ompi/mca/coll/cuda/coll_cuda_reduce.c
@@ -17,7 +17,7 @@
 
 #include "ompi/op/op.h"
 #include "opal/datatype/opal_convertor.h"
-#include "opal/datatype/opal_datatype_cuda.h"
+#include "opal/mca/common/cuda/common_cuda.h"
 
 /*
  *	reduce_log_inter
diff --git a/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c b/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c
index 6dded294e69..907257b0da8 100644
--- a/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c
+++ b/ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c
@@ -17,7 +17,7 @@
 
 #include "ompi/op/op.h"
 #include "opal/datatype/opal_convertor.h"
-#include "opal/datatype/opal_datatype_cuda.h"
+#include "opal/mca/common/cuda/common_cuda.h"
 
 /*
  *	reduce_scatter_block
diff --git a/ompi/mca/coll/cuda/coll_cuda_scan.c b/ompi/mca/coll/cuda/coll_cuda_scan.c
index 6d70a10c7a3..4e7300c12f8 100644
--- a/ompi/mca/coll/cuda/coll_cuda_scan.c
+++ b/ompi/mca/coll/cuda/coll_cuda_scan.c
@@ -17,7 +17,7 @@
 
 #include "ompi/op/op.h"
 #include "opal/datatype/opal_convertor.h"
-#include "opal/datatype/opal_datatype_cuda.h"
+#include "opal/mca/common/cuda/common_cuda.h"
 
 /*
  *	scan
diff --git a/ompi/mca/coll/libnbc/nbc_internal.h b/ompi/mca/coll/libnbc/nbc_internal.h
index 735beaa06e2..4d5f42bc060 100644
--- a/ompi/mca/coll/libnbc/nbc_internal.h
+++ b/ompi/mca/coll/libnbc/nbc_internal.h
@@ -31,7 +31,7 @@
 #include "coll_libnbc.h"
 #if OPAL_CUDA_SUPPORT
 #include "opal/datatype/opal_convertor.h"
-#include "opal/datatype/opal_datatype_cuda.h"
+#include "opal/mca/common/cuda/common_cuda.h"
 #endif /* OPAL_CUDA_SUPPORT */
 #include "ompi/include/ompi/constants.h"
 #include "ompi/request/request.h"
diff --git a/ompi/mca/common/ompio/common_ompio_buffer.c b/ompi/mca/common/ompio/common_ompio_buffer.c
index 689e25b32f9..dbd7e30e6b4 100644
--- a/ompi/mca/common/ompio/common_ompio_buffer.c
+++ b/ompi/mca/common/ompio/common_ompio_buffer.c
@@ -20,7 +20,6 @@
 #include "ompi_config.h"
 
 #include "opal/datatype/opal_convertor.h"
-#include "opal/datatype/opal_datatype_cuda.h"
 #include "opal/mca/common/cuda/common_cuda.h"
 #include "opal/util/sys_limits.h"
 
diff --git a/ompi/mca/mtl/base/mtl_base_datatype.h b/ompi/mca/mtl/base/mtl_base_datatype.h
index 41559245745..544ca32abc7 100644
--- a/ompi/mca/mtl/base/mtl_base_datatype.h
+++ b/ompi/mca/mtl/base/mtl_base_datatype.h
@@ -25,16 +25,82 @@
 #include "ompi/datatype/ompi_datatype.h"
 #include "opal/datatype/opal_convertor.h"
 #include "opal/datatype/opal_datatype_internal.h"
+#if OPAL_CUDA_SUPPORT
+#include "opal/mca/common/cuda/common_cuda.h"
+#include "opal/datatype/opal_convertor.h"
+#endif
 
 #ifndef MTL_BASE_DATATYPE_H_INCLUDED
 #define MTL_BASE_DATATYPE_H_INCLUDED
 
+#if OPAL_CUDA_SUPPORT
+/**
+ * Pack the data described by a convertor into one buffer (CUDA-aware variant).
+ *
+ * @param convertor   convertor describing the data to pack
+ * @param buffer      (out) address of the packed (or in-place) data, NULL if empty
+ * @param buffer_len  (out) number of bytes at *buffer
+ * @param freeAfter   (out) true when *buffer was allocated by opal_cuda_malloc()
+ * @return OMPI_SUCCESS/OPAL_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE if that
+ *         allocation fails
+ */
+static int
+ompi_mtl_cuda_datatype_pack(struct opal_convertor_t *convertor,
+                            void **buffer,
+                            size_t *buffer_len,
+                            bool *freeAfter)
+{
+    const int had_cuda_flag = convertor->flags & CONVERTOR_CUDA;
+    uint32_t n_iov = 1;
+    struct iovec pack_iov;
+    bool staged;
+
+    *freeAfter = false;
+#if !(OPAL_ENABLE_HETEROGENEOUS_SUPPORT)
+    /* Contiguous, not-yet-completed data is used in place from pBaseBuf. */
+    if (convertor->pDesc && !(convertor->flags & CONVERTOR_COMPLETED)
+        && opal_datatype_is_contiguous_memory_layout(convertor->pDesc, convertor->count)) {
+        *buffer = convertor->pBaseBuf;
+        *buffer_len = convertor->local_size;
+        return OPAL_SUCCESS;
+    }
+#endif
+    opal_convertor_get_packed_size(convertor, buffer_len);
+    if (0 == *buffer_len) {
+        *buffer = NULL;
+        return OMPI_SUCCESS;
+    }
+    /* opal_convertor_need_buffers() always returns true while CONVERTOR_CUDA
+     * is set, so query it with the flag cleared and then put the flag back. */
+    convertor->flags &= ~CONVERTOR_CUDA;
+    staged = opal_convertor_need_buffers(convertor);
+    if (had_cuda_flag) {
+        convertor->flags |= CONVERTOR_CUDA;
+    }
+    pack_iov.iov_len = *buffer_len;
+    pack_iov.iov_base = staged ? opal_cuda_malloc(*buffer_len, convertor) : NULL;
+    if (staged && NULL == pack_iov.iov_base) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+    *freeAfter = staged;
+    opal_convertor_pack(convertor, &pack_iov, &n_iov, buffer_len);
+    *buffer = pack_iov.iov_base;
+    return OMPI_SUCCESS;
+}
+#endif
+
 __opal_attribute_always_inline__ static inline int
 ompi_mtl_datatype_pack(struct opal_convertor_t *convertor,
                        void **buffer,
                        size_t *buffer_len,
                        bool *freeAfter)
 {
+#if OPAL_CUDA_SUPPORT
+    return ompi_mtl_cuda_datatype_pack(convertor,
+                                       buffer,
+                                       buffer_len,
+                                       freeAfter);
+#endif
     struct iovec iov;
     uint32_t iov_count = 1;
 
@@ -71,6 +137,42 @@ ompi_mtl_datatype_pack(struct opal_convertor_t *convertor,
     return OMPI_SUCCESS;
 }
 
+#if OPAL_CUDA_SUPPORT
+/**
+ * Select the receive buffer: the convertor's own memory when no intermediate
+ * buffer is needed, otherwise a new opal_cuda_malloc() allocation.
+ * @param buffer         (out) receive address; NULL for a zero-length message
+ * @param buffer_len     (out) packed size reported by the convertor
+ * @param free_on_error  (out) true when *buffer was allocated here
+ * @return OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE if the allocation fails
+ */
+static int
+ompi_mtl_cuda_datatype_recv_buf(struct opal_convertor_t *convertor,
+                                void ** buffer,
+                                size_t *buffer_len,
+                                bool *free_on_error)
+{
+    const int is_cuda = convertor->flags & CONVERTOR_CUDA;
+    int staged;
+    opal_convertor_get_packed_size(convertor, buffer_len);
+    *free_on_error = false;
+    *buffer = NULL;
+    if (0 == *buffer_len) return OMPI_SUCCESS;
+    /* need_buffers() is always true while CONVERTOR_CUDA is set: clear, query, restore. */
+    convertor->flags &= ~CONVERTOR_CUDA;
+    staged = opal_convertor_need_buffers(convertor);
+    if (is_cuda) convertor->flags |= CONVERTOR_CUDA;
+    if (!staged) {
+        *buffer = convertor->pBaseBuf +
+            convertor->use_desc->desc[convertor->use_desc->used].end_loop.first_elem_disp;
+        return OMPI_SUCCESS;
+    }
+    *buffer = opal_cuda_malloc(*buffer_len, convertor);
+    if (NULL == *buffer) return OMPI_ERR_OUT_OF_RESOURCE; /* nothing for the caller to free */
+    *free_on_error = true;
+    return OMPI_SUCCESS;
+}
+#endif
 
 __opal_attribute_always_inline__ static inline int
 ompi_mtl_datatype_recv_buf(struct opal_convertor_t *convertor,
@@ -78,6 +180,13 @@ ompi_mtl_datatype_recv_buf(struct opal_convertor_t *convertor,
                            size_t *buffer_len,
                            bool *free_on_error)
 {
+#if OPAL_CUDA_SUPPORT
+    return ompi_mtl_cuda_datatype_recv_buf(convertor,
+                                           buffer,
+                                           buffer_len,
+                                           free_on_error);
+#endif
+
     opal_convertor_get_packed_size(convertor, buffer_len);
     *free_on_error = false;
     if( 0 == *buffer_len ) {
@@ -95,12 +204,48 @@ ompi_mtl_datatype_recv_buf(struct opal_convertor_t *convertor,
     return OMPI_SUCCESS;
 }
 
+#if OPAL_CUDA_SUPPORT
+/**
+ * Unpack buffer through the convertor and free it, but only when the
+ * convertor needs an intermediate buffer and buffer_len is non-zero.
+ * @param convertor   convertor describing the destination data
+ * @param buffer      buffer holding the bytes to unpack
+ * @param buffer_len  number of bytes in buffer
+ * @return always OMPI_SUCCESS
+ */
+static int
+ompi_mtl_cuda_datatype_unpack(struct opal_convertor_t *convertor,
+                              void *buffer,
+                              size_t buffer_len) {
+    const int restore_cuda = convertor->flags & CONVERTOR_CUDA;
+    bool staged;
+    /* With CONVERTOR_CUDA set opal_convertor_need_buffers() is always true,
+     * so ask with the flag cleared, then restore it before unpacking. */
+    convertor->flags &= ~CONVERTOR_CUDA;
+    staged = buffer_len > 0 && opal_convertor_need_buffers(convertor);
+    if (restore_cuda) {
+        convertor->flags |= CONVERTOR_CUDA;
+    }
+    if (staged) {
+        uint32_t n_iov = 1;
+        struct iovec src = { .iov_base = buffer, .iov_len = buffer_len };
+        opal_convertor_unpack(convertor, &src, &n_iov, &buffer_len);
+        opal_cuda_free(buffer, convertor);
+    }
+    return OMPI_SUCCESS;
+}
+#endif
 
 __opal_attribute_always_inline__ static inline int
 ompi_mtl_datatype_unpack(struct opal_convertor_t *convertor,
                          void *buffer,
                          size_t buffer_len)
 {
+#if OPAL_CUDA_SUPPORT
+    return ompi_mtl_cuda_datatype_unpack(convertor,
+                                         buffer,
+                                         buffer_len);
+#endif
     struct iovec iov;
     uint32_t iov_count = 1;
 
diff --git a/ompi/mca/mtl/ofi/configure.m4 b/ompi/mca/mtl/ofi/configure.m4
index 2ab0a084e0c..678247c5d35 100644
--- a/ompi/mca/mtl/ofi/configure.m4
+++ b/ompi/mca/mtl/ofi/configure.m4
@@ -28,6 +28,17 @@ AC_DEFUN([MCA_ompi_mtl_ofi_CONFIG],[
     # Check for OFI
     OPAL_CHECK_OFI
 
+    # Check for CUDA
+    OPAL_CHECK_CUDA
+
+    # With CUDA support we require libfabric >= 1.9, the first release that
+    # provides FI_HMEM.  Only refine the verdict of OPAL_CHECK_OFI above:
+    # never force opal_ofi_happy back to "yes" when OFI was not found.
+    AS_IF([test "$opal_ofi_happy" = "yes" && test "$opal_check_cuda_happy" = "yes"],
+          [OPAL_CHECK_OFI_VERSION_GE([1,9],
+                                     [],
+                                     [opal_ofi_happy=no])])
+
     # The OFI MTL requires at least OFI libfabric v1.5.
     AS_IF([test "$opal_ofi_happy" = "yes"],
           [OPAL_CHECK_OFI_VERSION_GE([1,5],
diff --git a/ompi/mca/mtl/ofi/help-mtl-ofi.txt b/ompi/mca/mtl/ofi/help-mtl-ofi.txt
index 56778f63b53..59de33a539d 100644
--- a/ompi/mca/mtl/ofi/help-mtl-ofi.txt
+++ b/ompi/mca/mtl/ofi/help-mtl-ofi.txt
@@ -77,3 +77,12 @@ recoverable and your application is likely to abort.
   Error: %s (%d)
 [message too big]
 Message size %llu bigger than supported by selected transport. Max = %llu
+
+[Buffer Memory Registration Failed]
+Open MPI failed to register your buffer.
+This error is fatal; your job will abort.
+
+  Buffer Type: %s
+  Buffer Address: %p
+  Buffer Length: %zu
+  Error: %s (%d)
diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h
index 14203576309..6626c754a97 100644
--- a/ompi/mca/mtl/ofi/mtl_ofi.h
+++ b/ompi/mca/mtl/ofi/mtl_ofi.h
@@ -47,6 +47,11 @@
 #include "mtl_ofi_endpoint.h"
 #include "mtl_ofi_compat.h"
 
+#if OPAL_CUDA_SUPPORT
+#include "opal/mca/common/cuda/common_cuda.h"
+#include "opal/datatype/opal_convertor.h"
+#endif
+
 BEGIN_C_DECLS
 
 extern mca_mtl_ofi_module_t ompi_mtl_ofi;
@@ -247,6 +252,99 @@ ompi_mtl_ofi_progress(void)
                             __FILE__, __LINE__, string, fi_strerror(-err)); \
     } while(0);
 
+/**
+ * Memory registration functions
+ */
+
+/** Reset ofi_req->mr to NULL; called before any libfabric or registration calls */
+__opal_attribute_always_inline__ static inline void
+ompi_mtl_ofi_set_mr_null(ompi_mtl_ofi_request_t *ofi_req) {
+    ofi_req->mr = NULL;
+}
+
+/**
+ * Register a CUDA buffer with the libfabric domain before it is used in a
+ * transfer.  ofi_req->mr is always reset first and is left NULL when nothing
+ * is registered: empty/NULL buffers, convertors without CONVERTOR_CUDA, and
+ * builds without OPAL_CUDA_SUPPORT.
+ *
+ * @param convertor  convertor whose flags tell whether the buffer is CUDA memory
+ * @param ofi_req    request supplying the length and receiving the fid_mr handle
+ * @param buffer     start address of the region to register
+ * @return OMPI_SUCCESS, or OMPI_ERROR when fi_mr_regattr() fails
+ */
+static
+int ompi_mtl_ofi_register_buffer(struct opal_convertor_t *convertor,
+                                 ompi_mtl_ofi_request_t *ofi_req,
+                                 void* buffer) {
+    ofi_req->mr = NULL;
+    if (NULL == buffer || ofi_req->length <= 0) {
+        return OMPI_SUCCESS;
+    }
+#if OPAL_CUDA_SUPPORT
+    if (!(convertor->flags & CONVERTOR_CUDA)) {
+        return OMPI_SUCCESS;
+    }
+
+    struct iovec region = { .iov_base = buffer, .iov_len = ofi_req->length };
+    struct fi_mr_attr mr_attr = {
+        .mr_iov = &region,
+        .iov_count = 1,
+        .access = FI_SEND | FI_RECV,
+        .iface = FI_HMEM_CUDA,
+    };
+    mca_common_cuda_get_device(&mr_attr.device.cuda);
+
+    int rc = fi_mr_regattr(ompi_mtl_ofi.domain, &mr_attr, 0, &ofi_req->mr);
+    if (0 != rc) {
+        opal_show_help("help-mtl-ofi.txt", "Buffer Memory Registration Failed", true,
+                       "CUDA",
+                       buffer, ofi_req->length,
+                       fi_strerror(-rc), rc);
+        ofi_req->mr = NULL;
+        return OMPI_ERROR;
+    }
+#endif /* OPAL_CUDA_SUPPORT */
+    return OMPI_SUCCESS;
+}
+
+/** Deregister the request's memory region, if any.
+ *  @return OMPI_SUCCESS (mr reset to NULL), or OMPI_ERROR if fi_close() fails. */
+__opal_attribute_always_inline__ static inline int
+ompi_mtl_ofi_deregister_buffer(ompi_mtl_ofi_request_t *ofi_req) {
+    if (ofi_req->mr) {
+        int ret = fi_close(&ofi_req->mr->fid);
+        if (ret) {
+            /* Report the integer error code; a struct fid must not go through varargs. */
+            opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, "fi_close",
+                           ompi_process_info.nodename, __FILE__, __LINE__,
+                           fi_strerror(-ret), ret);
+            return OMPI_ERROR;
+        }
+        ofi_req->mr = NULL;
+    }
+    return OMPI_SUCCESS;
+}
+
+/** Deregister the request's memory region, then free ofi_req->buffer and reset it
+ *  to NULL.  A deregistration error is returned as-is and nothing is freed. */
+static
+int ompi_mtl_ofi_deregister_and_free_buffer(ompi_mtl_ofi_request_t *ofi_req) {
+    const int rc = ompi_mtl_ofi_deregister_buffer(ofi_req);
+    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
+        return rc;
+    }
+    if (OPAL_UNLIKELY(NULL != ofi_req->buffer)) {
+#if OPAL_CUDA_SUPPORT
+        opal_cuda_free(ofi_req->buffer, ofi_req->convertor);
+#else
+        free(ofi_req->buffer);
+#endif
+    }
+    ofi_req->buffer = NULL;
+    return rc;
+}
+
 /* MTL interface functions */
 int ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl);
 
@@ -324,10 +422,7 @@ ompi_mtl_ofi_isend_callback(struct fi_cq_tagged_entry *wc,
 
     if (0 == ofi_req->completion_count) {
         /* Request completed */
-        if (OPAL_UNLIKELY(NULL != ofi_req->buffer)) {
-            free(ofi_req->buffer);
-            ofi_req->buffer = NULL;
-        }
+        ompi_mtl_ofi_deregister_and_free_buffer(ofi_req);
 
         ofi_req->super.ompi_req->req_status.MPI_ERROR =
             ofi_req->status.MPI_ERROR;
@@ -418,6 +513,8 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl,
     fi_addr_t src_addr = 0;
     fi_addr_t sep_peer_fiaddr = 0;
 
+    ompi_mtl_ofi_set_mr_null(&ofi_req);
+
     ctxt_id = ompi_mtl_ofi_map_comm_to_ctxt(comm->c_contextid);
     set_thread_context(ctxt_id);
 
@@ -467,7 +564,15 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl,
             goto free_request_buffer;
     }
 
+    /** Inject does not currently support device memory
+     *  https://github.com/ofiwg/libfabric/issues/5861
+     */
+#if OPAL_CUDA_SUPPORT
+    if (!(convertor->flags & CONVERTOR_CUDA)
+        && (ompi_mtl_ofi.max_inject_size >= length)) {
+#else /* !(OPAL_CUDA_SUPPORT)*/
     if (ompi_mtl_ofi.max_inject_size >= length) {
+#endif /* OPAL_CUDA_SUPPORT */
         if (ofi_cq_data) {
             MTL_OFI_RETRY_UNTIL_DONE(fi_tinjectdata(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep,
                                             start,
@@ -495,12 +600,16 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl,
             goto free_request_buffer;
         }
     } else {
+        ompi_ret = ompi_mtl_ofi_register_buffer(convertor, &ofi_req, start);
+        if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) {
+            return ompi_ret;
+        }
         ofi_req.completion_count += 1;
         if (ofi_cq_data) {
             MTL_OFI_RETRY_UNTIL_DONE(fi_tsenddata(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep,
                                           start,
                                           length,
-                                          NULL,
+                                          (NULL == ofi_req.mr) ? NULL : ofi_req.mr->mem_desc,
                                           comm->c_my_rank,
                                           sep_peer_fiaddr,
                                           match_bits,
@@ -509,7 +618,7 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl,
             MTL_OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep,
                                           start,
                                           length,
-                                          NULL,
+                                          (NULL == ofi_req.mr) ? NULL : ofi_req.mr->mem_desc,
                                           sep_peer_fiaddr,
                                           match_bits,
                                           (void *) &ofi_req.ctx), ret);
@@ -532,9 +641,7 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl,
     }
 
 free_request_buffer:
-    if (OPAL_UNLIKELY(NULL != ofi_req.buffer)) {
-        free(ofi_req.buffer);
-    }
+    ompi_mtl_ofi_deregister_and_free_buffer(&ofi_req);
 
     return ofi_req.status.MPI_ERROR;
 }
@@ -562,6 +669,8 @@ ompi_mtl_ofi_isend_generic(struct mca_mtl_base_module_t *mtl,
     ompi_mtl_ofi_request_t *ack_req = NULL; /* For synchronous send */
     fi_addr_t sep_peer_fiaddr = 0;
 
+    ompi_mtl_ofi_set_mr_null(ofi_req);
+
     ctxt_id = ompi_mtl_ofi_map_comm_to_ctxt(comm->c_contextid);
     set_thread_context(ctxt_id);
 
@@ -605,11 +714,16 @@ ompi_mtl_ofi_isend_generic(struct mca_mtl_base_module_t *mtl,
             goto free_request_buffer;
     }
 
+    ompi_ret = ompi_mtl_ofi_register_buffer(convertor, ofi_req, start);
+    if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) {
+        return ompi_ret;
+    }
+
     if (ofi_cq_data) {
         MTL_OFI_RETRY_UNTIL_DONE(fi_tsenddata(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep,
                                       start,
                                       length,
-                                      NULL,
+                                      (NULL == ofi_req->mr) ? NULL : ofi_req->mr->mem_desc,
                                       comm->c_my_rank,
                                       sep_peer_fiaddr,
                                       match_bits,
@@ -618,7 +732,7 @@ ompi_mtl_ofi_isend_generic(struct mca_mtl_base_module_t *mtl,
         MTL_OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ofi_ctxt[ctxt_id].tx_ep,
                                       start,
                                       length,
-                                      NULL,
+                                      (NULL == ofi_req->mr) ? NULL : ofi_req->mr->mem_desc,
                                       sep_peer_fiaddr,
                                       match_bits,
                                       (void *) &ofi_req->ctx), ret);
@@ -631,9 +745,8 @@ ompi_mtl_ofi_isend_generic(struct mca_mtl_base_module_t *mtl,
     }
 
 free_request_buffer:
-    if (OPAL_UNLIKELY(OMPI_SUCCESS != ofi_req->status.MPI_ERROR
-            && NULL != ofi_req->buffer)) {
-        free(ofi_req->buffer);
+    if (OPAL_UNLIKELY(OMPI_SUCCESS != ofi_req->status.MPI_ERROR)) {
+        ompi_mtl_ofi_deregister_and_free_buffer(ofi_req);
     }
 
     return ofi_req->status.MPI_ERROR;
@@ -676,6 +789,8 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
         status->MPI_ERROR = MPI_ERR_TRUNCATE;
     }
 
+    ompi_mtl_ofi_deregister_buffer(ofi_req);
+
     /**
      * Unpack data into recv buffer if necessary.
      */
@@ -795,6 +910,8 @@ ompi_mtl_ofi_irecv_generic(struct mca_mtl_base_module_t *mtl,
     size_t length;
     bool free_after;
 
+    ompi_mtl_ofi_set_mr_null(ofi_req);
+
     ctxt_id = ompi_mtl_ofi_map_comm_to_ctxt(comm->c_contextid);
     set_thread_context(ctxt_id);
 
@@ -833,18 +950,22 @@ ompi_mtl_ofi_irecv_generic(struct mca_mtl_base_module_t *mtl,
     ofi_req->remote_addr = remote_addr;
     ofi_req->match_bits = match_bits;
 
+    ompi_ret = ompi_mtl_ofi_register_buffer(convertor, ofi_req, start);
+    if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) {
+        return ompi_ret;
+    }
+
     MTL_OFI_RETRY_UNTIL_DONE(fi_trecv(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep,
                                       start,
                                       length,
-                                      NULL,
+                                      (NULL == ofi_req->mr) ? NULL : ofi_req->mr->mem_desc,
                                       remote_addr,
                                       match_bits,
                                       mask_bits,
                                       (void *)&ofi_req->ctx), ret);
     if (OPAL_UNLIKELY(0 > ret)) {
-        if (NULL != ofi_req->buffer) {
-            free(ofi_req->buffer);
-        }
+        ompi_mtl_ofi_deregister_and_free_buffer(ofi_req);
+
         MTL_OFI_LOG_FI_ERR(ret, "fi_trecv failed");
         return ompi_mtl_ofi_get_error(ret);
     }
@@ -866,6 +987,8 @@ ompi_mtl_ofi_mrecv_callback(struct fi_cq_tagged_entry *wc,
     status->MPI_ERROR = MPI_SUCCESS;
     status->_ucount = wc->len;
 
+    ompi_mtl_ofi_deregister_and_free_buffer(ofi_req);
+
     free(ofi_req);
 
     mrecv_req->completion_callback(mrecv_req);
@@ -896,6 +1019,8 @@ ompi_mtl_ofi_mrecv_error_callback(struct fi_cq_err_entry *error,
             status->MPI_ERROR = MPI_ERR_INTERN;
     }
 
+    ompi_mtl_ofi_deregister_and_free_buffer(ofi_req);
+
     free(ofi_req);
 
     mrecv_req->completion_callback(mrecv_req);
@@ -921,6 +1046,8 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
     uint64_t msgflags = FI_CLAIM | FI_COMPLETION;
     struct ompi_communicator_t *comm = (*message)->comm;
 
+    ompi_mtl_ofi_set_mr_null(ofi_req);
+
     ctxt_id = ompi_mtl_ofi_map_comm_to_ctxt(comm->c_contextid);
     set_thread_context(ctxt_id);
 
@@ -941,13 +1068,18 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
     ofi_req->status.MPI_ERROR = OMPI_SUCCESS;
     ofi_req->mrecv_req = mtl_request;
 
+    ompi_ret = ompi_mtl_ofi_register_buffer(convertor, ofi_req, start);
+    if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) {
+        return ompi_ret;
+    }
+
     /**
      * fi_trecvmsg with FI_CLAIM
      */
     iov.iov_base = start;
     iov.iov_len = length;
     msg.msg_iov = &iov;
-    msg.desc = NULL;
+    msg.desc = (NULL == ofi_req->mr) ? NULL : ofi_req->mr->mem_desc;
     msg.iov_count = 1;
     msg.addr = 0;
     msg.tag = ofi_req->match_bits;
@@ -957,6 +1089,7 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
 
     MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep, &msg, msgflags), ret);
     if (OPAL_UNLIKELY(0 > ret)) {
+        ompi_mtl_ofi_deregister_and_free_buffer(ofi_req);
         MTL_OFI_LOG_FI_ERR(ret, "fi_trecvmsg failed");
         return ompi_mtl_ofi_get_error(ret);
     }
diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c
index 908be25fbc9..a57dd51f571 100644
--- a/ompi/mca/mtl/ofi/mtl_ofi_component.c
+++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c
@@ -20,6 +20,9 @@
 #include "opal/util/argv.h"
 #include "opal/util/printf.h"
 #include "opal/mca/common/ofi/common_ofi.h"
+#if OPAL_CUDA_SUPPORT
+#include "opal/mca/common/cuda/common_cuda.h"
+#endif /* OPAL_CUDA_SUPPORT */
 
 static int ompi_mtl_ofi_component_open(void);
 static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority);
@@ -297,6 +300,9 @@ ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority)
 static int
 ompi_mtl_ofi_component_close(void)
 {
+#if OPAL_CUDA_SUPPORT
+    mca_common_cuda_fini();
+#endif
     opal_common_ofi_mca_deregister();
     return OMPI_SUCCESS;
 }
@@ -591,6 +597,15 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
         exclude_list = opal_argv_split(*opal_common_ofi.prov_exclude, ',');
     }
 
+    /**
+     * Note: API version 1.5 is the first version that supports
+     * FI_LOCAL_COMM / FI_REMOTE_COMM checking (and we definitely need
+     * that checking -- e.g., the shared memory provider supports
+     * intranode communication (FI_LOCAL_COMM), but not internode
+     * (FI_REMOTE_COMM), which is insufficient for MTL selection).
+     */
+    fi_version = FI_VERSION(1, 5);
+
     /**
      * Hints to filter providers
      * See man fi_getinfo for a list of all filters
@@ -608,15 +623,27 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
                             __FILE__, __LINE__);
         goto error;
     }
+
+#if OPAL_CUDA_SUPPORT
+    /** If Open MPI is built with CUDA, request device transfer
+     *  capabilities */
+    hints->caps |= FI_HMEM;
+    hints->domain_attr->mr_mode |= FI_MR_HMEM;
+    /**
+     * Note: API version 1.9 is the first version that supports FI_HMEM
+     */
+    fi_version = FI_VERSION(1, 9);
+#endif /* OPAL_CUDA_SUPPORT */
+
     /* Make sure to get a RDM provider that can do the tagged matching
        interface and local communication and remote communication. */
     hints->mode               = FI_CONTEXT;
     hints->ep_attr->type      = FI_EP_RDM;
-    hints->caps               = FI_TAGGED | FI_LOCAL_COMM | FI_REMOTE_COMM | FI_DIRECTED_RECV;
+    hints->caps               |= FI_TAGGED | FI_LOCAL_COMM | FI_REMOTE_COMM | FI_DIRECTED_RECV;
     hints->tx_attr->msg_order = FI_ORDER_SAS;
     hints->rx_attr->msg_order = FI_ORDER_SAS;
-    hints->rx_attr->op_flags = FI_COMPLETION;
-    hints->tx_attr->op_flags = FI_COMPLETION;
+    hints->rx_attr->op_flags  = FI_COMPLETION;
+    hints->tx_attr->op_flags  = FI_COMPLETION;
 
     if (enable_mpi_threads) {
         ompi_mtl_ofi.mpi_thread_multiple = true;
@@ -660,18 +687,6 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
 
     hints->domain_attr->resource_mgmt    = FI_RM_ENABLED;
 
-    /**
-     * FI_VERSION provides binary backward and forward compatibility support
-     * Specify the version of OFI is coded to, the provider will select struct
-     * layouts that are compatible with this version.
-     *
-     * Note: API version 1.5 is the first version that supports
-     * FI_LOCAL_COMM / FI_REMOTE_COMM checking (and we definitely need
-     * that checking -- e.g., some providers are suitable for RXD or
-     * RXM, but can't provide local communication).
-     */
-    fi_version = FI_VERSION(1, 5);
-
     /**
      * The EFA provider in Libfabric versions prior to 1.10 contains a bug
      * where the FI_LOCAL_COMM and FI_REMOTE_COMM capabilities are not
@@ -762,6 +777,15 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
     opal_argv_free(exclude_list);
     exclude_list = NULL;
 
+#if OPAL_CUDA_SUPPORT
+    if (!(prov->caps & FI_HMEM)) {
+        opal_output_verbose(1, opal_common_ofi.output,
+                            "%s:%d: Libfabric provider does not support CUDA buffers\n",
+                            __FILE__, __LINE__);
+        goto error;
+    }
+#endif /* OPAL_CUDA_SUPPORT */
+
     /**
      * Select the format of the OFI tag
      */
@@ -1037,6 +1061,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
      */
     ompi_mtl_ofi.any_addr = FI_ADDR_UNSPEC;
 
+#if OPAL_CUDA_SUPPORT
+    mca_common_cuda_stage_one_init();
+#endif
+
     return &ompi_mtl_ofi.base;
 
 error:
diff --git a/ompi/mca/mtl/ofi/mtl_ofi_request.h b/ompi/mca/mtl/ofi/mtl_ofi_request.h
index 15bbd2b0148..d1c05e9680e 100644
--- a/ompi/mca/mtl/ofi/mtl_ofi_request.h
+++ b/ompi/mca/mtl/ofi/mtl_ofi_request.h
@@ -87,6 +87,10 @@ struct ompi_mtl_ofi_request_t {
 
     /** Pointer to Mrecv request to complete */
     struct mca_mtl_request_t *mrecv_req;
+
+    /** Stores reference to memory region from registration */
+    /*  Set to NULL if memory not registered or if non CUDA buffer */
+    struct fid_mr *mr;
 };
 typedef struct ompi_mtl_ofi_request_t ompi_mtl_ofi_request_t;
 
diff --git a/ompi/mca/mtl/portals4/mtl_portals4_component.c b/ompi/mca/mtl/portals4/mtl_portals4_component.c
index f60cb3cfa18..132358b5638 100644
--- a/ompi/mca/mtl/portals4/mtl_portals4_component.c
+++ b/ompi/mca/mtl/portals4/mtl_portals4_component.c
@@ -428,6 +428,12 @@ ompi_mtl_portals4_component_init(bool enable_progress_threads,
                          id.phys.nid, id.phys.pid));
 
     ompi_mtl_portals4.base.mtl_max_tag = MTL_PORTALS4_MAX_TAG;
+
+    /* Disable opal from checking if buffer being sent is cuda */
+#if OPAL_CUDA_SUPPORT
+    ompi_mtl_portals4.base.mtl_flags |= MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE;
+#endif /* OPAL_CUDA_SUPPORT */
+
     return &ompi_mtl_portals4.base;
 
  error:
diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c
index bba219922ad..6f18572d763 100644
--- a/ompi/mca/osc/rdma/osc_rdma_component.c
+++ b/ompi/mca/osc/rdma/osc_rdma_component.c
@@ -51,7 +51,7 @@
 #include "opal/align.h"
 #include "opal/util/sys_limits.h"
 #if OPAL_CUDA_SUPPORT
-#include "opal/datatype/opal_datatype_cuda.h"
+#include "opal/mca/common/cuda/common_cuda.h"
 #endif /* OPAL_CUDA_SUPPORT */
 #include "opal/util/info_subscriber.h"
 
diff --git a/ompi/mca/pml/cm/pml_cm.h b/ompi/mca/pml/cm/pml_cm.h
index b3c06eb83bf..fa563e0b313 100644
--- a/ompi/mca/pml/cm/pml_cm.h
+++ b/ompi/mca/pml/cm/pml_cm.h
@@ -379,7 +379,16 @@ mca_pml_cm_send(const void *buf,
 		convertor.pBaseBuf   = (unsigned char*)buf + datatype->super.true_lb;
 		convertor.count      = count;
 		convertor.pDesc      = &datatype->super;
-	} else
+
+#if OPAL_CUDA_SUPPORT
+        /* Switches off CUDA detection if
+           MTL set MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE during init */
+        MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);
+        convertor.flags      |= flags;
+        /* Sets CONVERTOR_CUDA flag if CUDA buffer */
+        opal_convertor_prepare_for_send( &convertor, &datatype->super, count, buf );
+#endif
+    } else
 #endif
 	{
 		ompi_proc = ompi_comm_peer_lookup(comm, dst);
diff --git a/ompi/mca/pml/cm/pml_cm_sendreq.h b/ompi/mca/pml/cm/pml_cm_sendreq.h
index 3560270b99f..c229f3e6899 100644
--- a/ompi/mca/pml/cm/pml_cm_sendreq.h
+++ b/ompi/mca/pml/cm/pml_cm_sendreq.h
@@ -242,6 +242,14 @@ do {                                                                    \
             (unsigned char*)buf + datatype->super.true_lb;              \
         (req_send)->req_base.req_convertor.count      = count;          \
         (req_send)->req_base.req_convertor.pDesc      = &datatype->super; \
+        /* Switches off CUDA detection if                               \
+         MTL set MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE during init */     \
+        MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);   \
+        (req_send)->req_base.req_convertor.flags |= flags;              \
+        /* Sets CONVERTOR_CUDA flag if CUDA buffer */                   \
+        opal_convertor_prepare_for_send(                                \
+            &req_send->req_base.req_convertor,                          \
+            &datatype->super, count, buf );                             \
     } else {                                                            \
         MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);   \
         opal_convertor_copy_and_prepare_for_send(                       \
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c
index 51239e1210c..3c1e2762e85 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c
+++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c
@@ -50,7 +50,6 @@
 #include "pml_ob1_sendreq.h"
 #include "pml_ob1_hdr.h"
 #if OPAL_CUDA_SUPPORT
-#include "opal/datatype/opal_datatype_cuda.h"
 #include "opal/mca/common/cuda/common_cuda.h"
 #endif /* OPAL_CUDA_SUPPORT */
 
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
index 1b798125c4b..a752f34463d 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
@@ -43,7 +43,6 @@
 #include "ompi/mca/bml/base/base.h"
 
 #if OPAL_CUDA_SUPPORT
-#include "opal/datatype/opal_datatype_cuda.h"
 #include "opal/mca/common/cuda/common_cuda.h"
 #endif /* OPAL_CUDA_SUPPORT */
 
diff --git a/opal/Makefile.am b/opal/Makefile.am
index e7484cd8c51..8c208632dd2 100644
--- a/opal/Makefile.am
+++ b/opal/Makefile.am
@@ -22,8 +22,10 @@
 # $HEADER$
 #
 
+if OPAL_cuda_support
 SUBDIRS = \
 	include \
+        mca/common/cuda \
         datatype \
         etc \
         util \
@@ -32,21 +34,43 @@ SUBDIRS = \
 	$(MCA_opal_FRAMEWORK_COMPONENT_STATIC_SUBDIRS) \
         . \
 	$(MCA_opal_FRAMEWORK_COMPONENT_DSO_SUBDIRS)
-
 # libltdl is included by variable because if --disable-dlopen was
 # used, there will be no generated Makefile in that directory (and
 # therefore make distclean will fail).
 DIST_SUBDIRS = \
 	include \
+        mca/common/cuda \
         datatype \
         etc \
 	util \
 	mca/base \
 	$(MCA_opal_FRAMEWORKS_SUBDIRS) \
 	$(MCA_opal_FRAMEWORK_COMPONENT_ALL_SUBDIRS)
+else
+SUBDIRS = \
+	include \
+        datatype \
+        etc \
+        util \
+	mca/base \
+	$(MCA_opal_FRAMEWORKS_SUBDIRS) \
+	$(MCA_opal_FRAMEWORK_COMPONENT_STATIC_SUBDIRS) \
+        . \
+	$(MCA_opal_FRAMEWORK_COMPONENT_DSO_SUBDIRS)
+# libltdl is included by variable because if --disable-dlopen was
+# used, there will be no generated Makefile in that directory (and
+# therefore make distclean will fail).
+DIST_SUBDIRS = \
+	include \
+        datatype \
+        etc \
+	util \
+	mca/base \
+	$(MCA_opal_FRAMEWORKS_SUBDIRS) \
+	$(MCA_opal_FRAMEWORK_COMPONENT_ALL_SUBDIRS)
+endif
 
 # Build the main OPAL library
-
 lib_LTLIBRARIES = lib@OPAL_LIB_PREFIX@open-pal.la
 lib@OPAL_LIB_PREFIX@open_pal_la_SOURCES =
 lib@OPAL_LIB_PREFIX@open_pal_la_LIBADD = \
@@ -62,6 +86,12 @@ lib@OPAL_LIB_PREFIX@open_pal_la_DEPENDENCIES = \
         mca/base/libmca_base.la \
         util/libopalutil.la \
         $(MCA_opal_FRAMEWORK_LIBS)
+if OPAL_cuda_support
+lib@OPAL_LIB_PREFIX@open_pal_la_LIBADD += \
+        mca/common/cuda/libmca_common_cuda.la
+lib@OPAL_LIB_PREFIX@open_pal_la_DEPENDENCIES += \
+        mca/common/cuda/libmca_common_cuda.la
+endif
 lib@OPAL_LIB_PREFIX@open_pal_la_LDFLAGS = -version-info $(libopen_pal_so_version) \
 	$(opal_libevent_LDFLAGS) \
 	$(opal_hwloc_LDFLAGS) \
diff --git a/opal/datatype/Makefile.am b/opal/datatype/Makefile.am
index 36d13eff3b5..340800a6be3 100644
--- a/opal/datatype/Makefile.am
+++ b/opal/datatype/Makefile.am
@@ -78,8 +78,3 @@ opaldir = $(opalincludedir)/$(subdir)
 opal_HEADERS = $(headers)
 endif
 
-# If we have cuda support, modify file list and flags
-if OPAL_cuda_support
-libdatatype_la_SOURCES += opal_datatype_cuda.c
-headers += opal_datatype_cuda.h
-endif
diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c
index 853e5b1632f..36736a3007f 100644
--- a/opal/datatype/opal_convertor.c
+++ b/opal/datatype/opal_convertor.c
@@ -39,7 +39,7 @@
 #include "opal/datatype/opal_datatype_prototypes.h"
 #include "opal/datatype/opal_convertor_internal.h"
 #if OPAL_CUDA_SUPPORT
-#include "opal/datatype/opal_datatype_cuda.h"
+#include "opal/mca/common/cuda/common_cuda.h"
 #define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \
     CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
 #endif
diff --git a/opal/datatype/opal_datatype_copy.c b/opal/datatype/opal_datatype_copy.c
index c70bdd24dfa..e13d17fa59c 100644
--- a/opal/datatype/opal_datatype_copy.c
+++ b/opal/datatype/opal_datatype_copy.c
@@ -72,7 +72,7 @@ static size_t opal_datatype_memop_block_size = 128 * 1024;
 #include "opal_datatype_copy.h"
 
 #if OPAL_CUDA_SUPPORT
-#include "opal_datatype_cuda.h"
+#include "opal/mca/common/cuda/common_cuda.h"
 
 #undef MEM_OP_NAME
 #define MEM_OP_NAME non_overlap_cuda
diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c
deleted file mode 100644
index 7869f17e909..00000000000
--- a/opal/datatype/opal_datatype_cuda.c
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- * Copyright (c) 2011-2014 NVIDIA Corporation.  All rights reserved.
- * $COPYRIGHT$
- *
- * Additional copyrights may follow
- *
- * $HEADER$
- */
-
-#include "opal_config.h"
-
-#include <errno.h>
-#include <string.h>
-#include <unistd.h>
-
-#include "opal/align.h"
-#include "opal/util/output.h"
-#include "opal/datatype/opal_convertor.h"
-#include "opal/datatype/opal_datatype_cuda.h"
-
-static bool initialized = false;
-int opal_cuda_verbose = 0;
-static int opal_cuda_enabled = 0; /* Starts out disabled */
-static int opal_cuda_output = 0;
-static void opal_cuda_support_init(void);
-static int (*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL;
-static opal_common_cuda_function_table_t ftable;
-
-/* This function allows the common cuda code to register an
- * initialization function that gets called the first time an attempt
- * is made to send or receive a GPU pointer.  This allows us to delay
- * some CUDA initialization until after MPI_Init().
- */
-void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)) {
-    common_cuda_initialization_function = fptr;
-}
-
-/**
- * This function is called when a convertor is instantiated.  It has to call
- * the opal_cuda_support_init() function once to figure out if CUDA support
- * is enabled or not.  If CUDA is not enabled, then short circuit out
- * for all future calls.
- */
-void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf)
-{
-    /* Only do the initialization on the first GPU access */
-    if (!initialized) {
-        opal_cuda_support_init();
-    }
-
-    /* This is needed to handle case where convertor is not fully initialized
-     * like when trying to do a sendi with convertor on the statck */
-    convertor->cbmemcpy = (memcpy_fct_t)&opal_cuda_memcpy;
-
-    /* If not enabled, then nothing else to do */
-    if (!opal_cuda_enabled) {
-        return;
-    }
-
-    if (ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) {
-        convertor->flags |= CONVERTOR_CUDA;
-    }
-}
-
-/* Checks the type of pointer
- *
- * @param dest   One pointer to check
- * @param source Another pointer to check
- */
-bool opal_cuda_check_bufs(char *dest, char *src)
-{
-    /* Only do the initialization on the first GPU access */
-    if (!initialized) {
-        opal_cuda_support_init();
-    }
-
-    if (!opal_cuda_enabled) {
-        return false;
-    }
-
-    if (ftable.gpu_is_gpu_buffer(dest, NULL) || ftable.gpu_is_gpu_buffer(src, NULL)) {
-        return true;
-    } else {
-        return false;
-    }
-}
-
-/*
- * With CUDA enabled, all contiguous copies will pass through this function.
- * Therefore, the first check is to see if the convertor is a GPU buffer.
- * Note that if there is an error with any of the CUDA calls, the program
- * aborts as there is no recovering.
- */
-
-/* Checks the type of pointer
- *
- * @param buf   check one pointer providing a convertor.
- *  Provides aditional information, e.g. managed vs. unmanaged GPU buffer
- */
-bool  opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor )
-{
-    /* Only do the initialization on the first GPU access */
-    if (!initialized) {
-        opal_cuda_support_init();
-    }
-
-    if (!opal_cuda_enabled) {
-        return false;
-    }
-
-    return ( ftable.gpu_is_gpu_buffer(buf, convertor));
-}
-
-/*
- * With CUDA enabled, all contiguous copies will pass through this function.
- * Therefore, the first check is to see if the convertor is a GPU buffer.
- * Note that if there is an error with any of the CUDA calls, the program
- * aborts as there is no recovering.
- */
-
-void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t* convertor)
-{
-    int res;
-
-    if (!(convertor->flags & CONVERTOR_CUDA)) {
-        return memcpy(dest, src, size);
-    }
-
-    if (convertor->flags & CONVERTOR_CUDA_ASYNC) {
-        res = ftable.gpu_cu_memcpy_async(dest, (void *)src, size, convertor);
-    } else {
-        res = ftable.gpu_cu_memcpy(dest, (void *)src, size);
-    }
-
-    if (res != 0) {
-        opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
-                    res, dest, src, (int)size);
-        abort();
-    } else {
-        return dest;
-    }
-}
-
-/*
- * This function is needed in cases where we do not have contiguous
- * datatypes.  The current code has macros that cannot handle a convertor
- * argument to the memcpy call.
- */
-void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size)
-{
-    int res;
-    res = ftable.gpu_cu_memcpy(dest, src, size);
-    if (res != 0) {
-        opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
-                    res, dest, src, (int)size);
-        abort();
-    } else {
-        return dest;
-    }
-}
-
-/*
- * In some cases, need an implementation of memmove.  This is not fast, but
- * it is not often needed.
- */
-void *opal_cuda_memmove(void *dest, void *src, size_t size)
-{
-    int res;
-
-    res = ftable.gpu_memmove(dest, src, size);
-    if(res != 0){
-        opal_output(0, "CUDA: Error in gpu memmove: res=%d, dest=%p, src=%p, size=%d",
-                    res, dest, src, (int)size);
-        abort();
-    }
-    return dest;
-}
-
-/**
- * This function gets called once to check if the program is running in a cuda
- * environment.
- */
-static void opal_cuda_support_init(void)
-{
-    if (initialized) {
-        return;
-    }
-
-    /* Set different levels of verbosity in the cuda related code. */
-    opal_cuda_output = opal_output_open(NULL);
-    opal_output_set_verbosity(opal_cuda_output, opal_cuda_verbose);
-
-    /* Callback into the common cuda initialization routine. This is only
-     * set if some work had been done already in the common cuda code.*/
-    if (NULL != common_cuda_initialization_function) {
-        if (0 == common_cuda_initialization_function(&ftable)) {
-            opal_cuda_enabled = 1;
-        }
-    }
-
-    if (1 == opal_cuda_enabled) {
-        opal_output_verbose(10, opal_cuda_output,
-                            "CUDA: enabled successfully, CUDA device pointers will work");
-    } else {
-        opal_output_verbose(10, opal_cuda_output,
-                            "CUDA: not enabled, CUDA device pointers will not work");
-    }
-
-    initialized = true;
-}
-
-/**
- * Tell the convertor that copies will be asynchronous CUDA copies.  The
- * flags are cleared when the convertor is reinitialized.
- */
-void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream)
-{
-    convertor->flags |= CONVERTOR_CUDA_ASYNC;
-    convertor->stream = stream;
-}
diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h
deleted file mode 100644
index 2789320520a..00000000000
--- a/opal/datatype/opal_datatype_cuda.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2011-2014 NVIDIA Corporation.  All rights reserved.
- * $COPYRIGHT$
- *
- * Additional copyrights may follow
- *
- * $HEADER$
- */
-
-#ifndef _OPAL_DATATYPE_CUDA_H
-#define _OPAL_DATATYPE_CUDA_H
-
-/* Structure to hold CUDA support functions that gets filled in when the
- * common cuda code is initialized.  This removes any dependency on <cuda.h>
- * in the opal cuda datatype code. */
-struct opal_common_cuda_function_table {
-    int (*gpu_is_gpu_buffer)(const void*, opal_convertor_t*);
-    int (*gpu_cu_memcpy_async)(void*, const void*, size_t, opal_convertor_t*);
-    int (*gpu_cu_memcpy)(void*, const void*, size_t);
-    int (*gpu_memmove)(void*, void*, size_t);
-};
-typedef struct opal_common_cuda_function_table opal_common_cuda_function_table_t;
-
-void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf);
-bool opal_cuda_check_bufs(char *dest, char *src);
-bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor );
-void* opal_cuda_memcpy(void * dest, const void * src, size_t size, opal_convertor_t* convertor);
-void* opal_cuda_memcpy_sync(void * dest, const void * src, size_t size);
-void* opal_cuda_memmove(void * dest, void * src, size_t size);
-void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *));
-void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream);
-
-#endif
diff --git a/opal/datatype/opal_datatype_pack_unpack_predefined.h b/opal/datatype/opal_datatype_pack_unpack_predefined.h
index c516feb511d..be98cf2716c 100644
--- a/opal/datatype/opal_datatype_pack_unpack_predefined.h
+++ b/opal/datatype/opal_datatype_pack_unpack_predefined.h
@@ -54,7 +54,7 @@
 #define OPAL_DATATYPE_PACK_UNPACK_PREDEFINED_H_HAS_BEEN_INCLUDED
 
 #include "opal_config.h"
-#include "opal/datatype/opal_datatype_cuda.h"
+#include "opal/mca/common/cuda/common_cuda.h"
 #include <stdint.h>
 
 /*  Improve predefined pack/unpack performance using mpich methods.
diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c
index 038296e81a3..81e6a134486 100644
--- a/opal/mca/common/cuda/common_cuda.c
+++ b/opal/mca/common/cuda/common_cuda.c
@@ -33,7 +33,6 @@
 
 #include "opal/align.h"
 #include "opal/datatype/opal_convertor.h"
-#include "opal/datatype/opal_datatype_cuda.h"
 #include "opal/util/output.h"
 #include "opal/util/show_help.h"
 #include "opal/util/proc.h"
@@ -79,7 +78,7 @@ struct cudaFunctionTable {
     int (*cuPointerGetAttribute)(void *, CUpointer_attribute, CUdeviceptr);
     int (*cuMemcpyAsync)(CUdeviceptr, CUdeviceptr, size_t, CUstream);
     int (*cuMemcpy)(CUdeviceptr, CUdeviceptr, size_t);
-    int (*cuMemAlloc)(CUdeviceptr *, unsigned int);
+    int (*cuMemAlloc)(CUdeviceptr *, size_t);
     int (*cuMemFree)(CUdeviceptr buf);
     int (*cuCtxGetCurrent)(void *cuContext);
     int (*cuStreamCreate)(CUstream *, int);
@@ -501,6 +500,8 @@ static int mca_common_cuda_stage_two_init(opal_common_cuda_function_table_t *fta
     ftable->gpu_cu_memcpy_async = &mca_common_cuda_cu_memcpy_async;
     ftable->gpu_cu_memcpy = &mca_common_cuda_cu_memcpy;
     ftable->gpu_memmove = &mca_common_cuda_memmove;
+    ftable->gpu_malloc = &mca_common_cuda_malloc;
+    ftable->gpu_free = &mca_common_cuda_free;
 
     opal_output_verbose(30, mca_common_cuda_output,
                         "CUDA: support functions initialized");
@@ -1922,6 +1923,35 @@ static int mca_common_cuda_cu_memcpy(void *dest, const void *src, size_t size)
     return OPAL_SUCCESS;
 }
 
+/* Allocate size bytes of device memory with cuMemAlloc and store the result
+ * in *dptr (untouched when size is 0).  Returns 0 or the CUDA error code. */
+int mca_common_cuda_malloc(void **dptr, size_t size)
+{
+    if (size > 0) {
+        int res = cuFunc.cuMemAlloc((CUdeviceptr *)dptr, size);
+        if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
+            opal_output(0, "CUDA: cuMemAlloc failed: res=%d", res);
+            return res;
+        }
+    }
+    return 0;
+}
+
+/* Free device memory with cuMemFree (NULL is ignored); returns 0 or the CUDA error. */
+int mca_common_cuda_free(void *dptr)
+{
+    int status = CUDA_SUCCESS;
+    if (NULL != dptr) {
+        status = cuFunc.cuMemFree((CUdeviceptr)dptr);
+    }
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != status)) {
+        opal_output(0, "CUDA: cuMemFree failed: res=%d", status);
+        return status;
+    }
+    return 0;
+}
+
+
 static int mca_common_cuda_memmove(void *dest, void *src, size_t size)
 {
     CUdeviceptr tmp;
@@ -2069,4 +2099,258 @@ void mca_common_cuda_get_buffer_id(mca_rcache_base_registration_t *reg)
                        true, OPAL_PROC_MY_HOSTNAME, res, dbuf);
     }
 }
+
+static bool initialized = false; /* set once opal_cuda_support_init() has run */
+int opal_cuda_verbose = 0; /* verbosity applied to opal_cuda_output */
+static int opal_cuda_enabled = 0; /* Starts out disabled */
+static int opal_cuda_output = 0; /* stream id returned by opal_output_open() */
+static void opal_cuda_support_init(void);
+static int (*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL;
+static opal_common_cuda_function_table_t ftable; /* hooks handed to the init callback */
+
+/* Registers the common cuda initialization callback.  It is only stored
+ * here; opal_cuda_support_init() invokes it the first time a buffer is
+ * checked, which lets some CUDA setup be delayed until after MPI_Init().
+ */
+void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *))
+{
+    common_cuda_initialization_function = fptr;
+}
+
+/**
+ * This function is called when a convertor is instantiated.  It runs the
+ * one-time opal_cuda_support_init() if that has not happened yet, installs
+ * the CUDA-aware memcpy callback and, if CUDA support is enabled, sets
+ * CONVERTOR_CUDA when pUserBuf is reported to be a GPU buffer.
+ */
+void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf)
+{
+    bool on_gpu = false;
+    /* Lazy one-time setup on first use */
+    if (false == initialized) {
+        opal_cuda_support_init();
+    }
+
+    /* Always install the callback: the convertor may not be fully
+     * initialized, e.g. a sendi with a convertor on the stack */
+    convertor->cbmemcpy = (memcpy_fct_t)&opal_cuda_memcpy;
+
+    /* The pointer type is only queried while CUDA support is enabled */
+    if (opal_cuda_enabled) {
+        on_gpu = (0 != ftable.gpu_is_gpu_buffer(pUserBuf, convertor));
+    }
+    if (on_gpu) {
+        convertor->flags |= CONVERTOR_CUDA;
+    }
+}
+
+/* Reports whether at least one of two pointers is a GPU buffer
+ *
+ * @param dest   One pointer to check
+ * @param src    Another pointer to check
+ */
+bool opal_cuda_check_bufs(char *dest, char *src)
+{
+    /* Lazy one-time setup on first use */
+    if (false == initialized) {
+        opal_cuda_support_init();
+    }
+
+    /* CUDA support not enabled: answer false without querying */
+    if (0 == opal_cuda_enabled) {
+        return false;
+    }
+    /* dest is tested first; src is only queried when dest is not a GPU buffer */
+    if (ftable.gpu_is_gpu_buffer(dest, NULL)) {
+        return true;
+    }
+    return ftable.gpu_is_gpu_buffer(src, NULL) ? true : false;
+}
+
+/*
+ * With CUDA enabled, all contiguous copies will pass through this function.
+ * Therefore, the first check is to see if the convertor is a GPU buffer.
+ * Note that if there is an error with any of the CUDA calls, the program
+ * aborts as there is no recovering.
+ */
+
+/* Checks the type of one pointer
+ *
+ * @param buf   check one pointer providing a convertor.
+ *  Provides additional information, e.g. managed vs. unmanaged GPU buffer
+ */
+bool  opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor )
+{
+    bool is_gpu = false;
+    /* Lazy one-time setup on first use */
+    if (false == initialized) {
+        opal_cuda_support_init();
+    }
+    /* The lookup hook is consulted only while CUDA support is enabled */
+    if (0 != opal_cuda_enabled) {
+        is_gpu = ftable.gpu_is_gpu_buffer(buf, convertor);
+    }
+    return is_gpu;
+}
+
+/*
+ * Allocate a buffer with either malloc() or the registered gpu_malloc
+ * hook (cuMemAlloc), depending on whether the convertor flag
+ * CONVERTOR_CUDA is set.  Aborts if the GPU allocation fails.
+ *
+ * @param size       Size of buffer to be allocated
+ * @param convertor  The convertor with flags describing if the buf
+ *                   should be a Host or Cuda buffer.
+ *
+ * @returns void *   A pointer to the newly allocated buffer.
+ */
+void *opal_cuda_malloc(size_t size, opal_convertor_t* convertor)
+{
+    int res;
+    /* stays NULL if the hook allocates nothing */
+    void* buffer = NULL;
+    if (!(convertor->flags & CONVERTOR_CUDA)) {
+        return malloc(size);
+    }
+    /* gpu_malloc stores the device pointer through its first argument */
+    res = ftable.gpu_malloc(&buffer, size);
+    if (res != 0 ) {
+        opal_output(0, "CUDA: Error in cuMemAlloc: size=%d", (int)size);
+        abort();
+    }
+    return buffer;
+}
+
+/*
+ * Release a buffer with free() when the convertor flag CONVERTOR_CUDA
+ * is clear, or with the registered gpu_free hook when it is set.
+ * A failing GPU free is logged and the process aborts.
+ *
+ * @param buffer     Pointer to buffer to be freed
+ * @param convertor  The convertor whose flags select the Host or
+ *                   Cuda release path.
+ */
+void opal_cuda_free(void *buffer, opal_convertor_t* convertor)
+{
+    if (convertor->flags & CONVERTOR_CUDA) {
+        /* Device memory: release it through the gpu_free hook */
+        int rc = ftable.gpu_free(buffer);
+        if (0 != rc) {
+            opal_output(0, "CUDA: Error in cuMemFree: ptr=%p",
+                        buffer);
+            abort();
+        }
+        return;
+    }
+    /* Host memory: plain libc free */
+    free(buffer);
+}
+
+/*
+ * Copy routine installed as the convertor's cbmemcpy callback.  Host
+ * buffers (CONVERTOR_CUDA clear) take the plain memcpy() path; GPU buffers
+ * go through the asynchronous or synchronous copy hook depending on
+ * CONVERTOR_CUDA_ASYNC.  A failing CUDA copy aborts the program.
+ */
+
+void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_t* convertor)
+{
+    int rc;
+
+    /* Host memory needs no CUDA involvement */
+    if (0 == (convertor->flags & CONVERTOR_CUDA)) {
+        return memcpy(dest, src, size);
+    }
+
+    /* Device memory: pick the copy hook matching the async flag */
+    rc = (convertor->flags & CONVERTOR_CUDA_ASYNC)
+        ? ftable.gpu_cu_memcpy_async(dest, (void *)src, size, convertor)
+        : ftable.gpu_cu_memcpy(dest, (void *)src, size);
+
+    /* No recovery is attempted after a failed copy */
+    if (0 != rc) {
+        opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
+                    rc, dest, src, (int)size);
+        abort();
+    }
+    return dest;
+}
+
+/*
+ * Synchronous GPU copy that takes no convertor argument.  Needed for
+ * non-contiguous datatypes, where the copy macros cannot pass a convertor
+ * to the memcpy call.  Aborts if the copy hook reports an error.
+ */
+void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size)
+{
+    int rc = ftable.gpu_cu_memcpy(dest, src, size);
+
+    if (0 == rc) {
+        return dest;
+    }
+    /* Unrecoverable: report and terminate */
+    opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
+                rc, dest, src, (int)size);
+    abort();
+}
+
+/*
+ * memmove counterpart for GPU buffers, forwarded to the gpu_memmove hook.
+ * This is not fast, but it is not often needed.  Aborts on failure.
+ */
+void *opal_cuda_memmove(void *dest, void *src, size_t size)
+{
+    int rc = ftable.gpu_memmove(dest, src, size);
+
+    /* Unrecoverable: report and terminate */
+    if (0 != rc) {
+        opal_output(0, "CUDA: Error in gpu memmove: res=%d, dest=%p, src=%p, size=%d",
+                    rc, dest, src, (int)size);
+        abort();
+    }
+    return dest;
+}
+
+/**
+ * One-time setup: opens the CUDA output stream and runs the registered
+ * initialization callback (if any) to decide whether CUDA is enabled.
+ */
+static void opal_cuda_support_init(void)
+{
+    /* Repeated calls are no-ops */
+    if (initialized) {
+        return;
+    }
+
+    /* Dedicated output stream whose verbosity follows opal_cuda_verbose */
+    opal_cuda_output = opal_output_open(NULL);
+    opal_output_set_verbosity(opal_cuda_output, opal_cuda_verbose);
+
+    /* Support is enabled only when a callback was registered by the
+     * common cuda code and that callback returns 0 */
+    if (NULL != common_cuda_initialization_function
+        && 0 == common_cuda_initialization_function(&ftable)) {
+        opal_cuda_enabled = 1;
+    }
+
+    if (1 != opal_cuda_enabled) {
+        opal_output_verbose(10, opal_cuda_output,
+                            "CUDA: not enabled, CUDA device pointers will not work");
+    } else {
+        opal_output_verbose(10, opal_cuda_output,
+                            "CUDA: enabled successfully, CUDA device pointers will work");
+    }
+
+    initialized = true;
+}
+
+/**
+ * Marks the convertor for asynchronous CUDA copies and records the stream
+ * to use.  The flags are cleared when the convertor is reinitialized.
+ */
+void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream)
+{
+    convertor->stream = stream;
+    convertor->flags |= CONVERTOR_CUDA_ASYNC;
+}
 #endif /* OPAL_CUDA_GDR_SUPPORT */
diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h
index 3ff95405299..695201bc134 100644
--- a/opal/mca/common/cuda/common_cuda.h
+++ b/opal/mca/common/cuda/common_cuda.h
@@ -52,6 +52,9 @@ OPAL_DECLSPEC void mca_common_cuda_unregister(void *ptr, char *msg);
 
 OPAL_DECLSPEC void mca_common_wait_stream_synchronize(mca_rcache_common_cuda_reg_t *rget_reg);
 
+OPAL_DECLSPEC int mca_common_cuda_malloc(void **buffer, size_t size);
+OPAL_DECLSPEC int mca_common_cuda_free(void *buffer);
+
 OPAL_DECLSPEC int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
                                          struct mca_btl_base_descriptor_t *, int *done);
 
@@ -108,4 +111,28 @@ static inline int32_t opal_convertor_cuda_need_buffers( opal_convertor_t* pConve
     return retval;
 }
 
+/* Structure to hold CUDA support functions that gets filled in when the
+ * common cuda code is initialized.  This removes any dependency on <cuda.h>
+ * in the opal cuda datatype code.  Each hook returns 0 on success. */
+struct opal_common_cuda_function_table {
+    int (*gpu_is_gpu_buffer)(const void*, opal_convertor_t*);
+    int (*gpu_cu_memcpy_async)(void*, const void*, size_t, opal_convertor_t*);
+    int (*gpu_cu_memcpy)(void*, const void*, size_t);
+    int (*gpu_memmove)(void*, void*, size_t);
+    int (*gpu_malloc)(void**, size_t); /* out: device pointer; matches mca_common_cuda_malloc */
+    int (*gpu_free)(void*);
+};
+typedef struct opal_common_cuda_function_table opal_common_cuda_function_table_t;
+
+void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf);
+bool opal_cuda_check_bufs(char *dest, char *src);
+bool opal_cuda_check_one_buf(char *buf, opal_convertor_t *convertor );
+void* opal_cuda_malloc(size_t size, opal_convertor_t* convertor);
+void opal_cuda_free(void * buffer, opal_convertor_t* convertor);
+void* opal_cuda_memcpy(void * dest, const void * src, size_t size, opal_convertor_t* convertor);
+void* opal_cuda_memcpy_sync(void * dest, const void * src, size_t size);
+void* opal_cuda_memmove(void * dest, void * src, size_t size);
+void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *));
+void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream);
+
 #endif /* OPAL_MCA_COMMON_CUDA_H */
diff --git a/opal/mca/common/cuda/configure.m4 b/opal/mca/common/cuda/configure.m4
index 6083e087a3b..5bda090e26f 100644
--- a/opal/mca/common/cuda/configure.m4
+++ b/opal/mca/common/cuda/configure.m4
@@ -27,8 +27,4 @@ AC_DEFUN([MCA_opal_common_cuda_CONFIG],[
           [$1],
           [$2])
 
-    # Copy over the includes needed to build CUDA
-    common_cuda_CPPFLAGS=$opal_datatype_cuda_CPPFLAGS
-    AC_SUBST([common_cuda_CPPFLAGS])
-
 ])dnl