Skip to content

Add CUDA support for the OFI MTL #8536

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Mar 9, 2021
4 changes: 2 additions & 2 deletions config/opal_check_cuda.m4
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,8 @@ AC_MSG_CHECKING([if have cuda support])
if test "$opal_check_cuda_happy" = "yes"; then
AC_MSG_RESULT([yes (-I$opal_cuda_incdir)])
CUDA_SUPPORT=1
opal_datatype_cuda_CPPFLAGS="-I$opal_cuda_incdir"
AC_SUBST([opal_datatype_cuda_CPPFLAGS])
common_cuda_CPPFLAGS="-I$opal_cuda_incdir"
AC_SUBST([common_cuda_CPPFLAGS])
else
AC_MSG_RESULT([no])
CUDA_SUPPORT=0
Expand Down
2 changes: 1 addition & 1 deletion ompi/mca/coll/cuda/coll_cuda_allreduce.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

#include "ompi/op/op.h"
#include "opal/datatype/opal_convertor.h"
#include "opal/datatype/opal_datatype_cuda.h"
#include "opal/mca/common/cuda/common_cuda.h"

/*
* allreduce_intra
Expand Down
2 changes: 1 addition & 1 deletion ompi/mca/coll/cuda/coll_cuda_exscan.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

#include "ompi/op/op.h"
#include "opal/datatype/opal_convertor.h"
#include "opal/datatype/opal_datatype_cuda.h"
#include "opal/mca/common/cuda/common_cuda.h"

int mca_coll_cuda_exscan(const void *sbuf, void *rbuf, int count,
struct ompi_datatype_t *dtype,
Expand Down
2 changes: 1 addition & 1 deletion ompi/mca/coll/cuda/coll_cuda_reduce.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

#include "ompi/op/op.h"
#include "opal/datatype/opal_convertor.h"
#include "opal/datatype/opal_datatype_cuda.h"
#include "opal/mca/common/cuda/common_cuda.h"

/*
* reduce_log_inter
Expand Down
2 changes: 1 addition & 1 deletion ompi/mca/coll/cuda/coll_cuda_reduce_scatter_block.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

#include "ompi/op/op.h"
#include "opal/datatype/opal_convertor.h"
#include "opal/datatype/opal_datatype_cuda.h"
#include "opal/mca/common/cuda/common_cuda.h"

/*
* reduce_scatter_block
Expand Down
2 changes: 1 addition & 1 deletion ompi/mca/coll/cuda/coll_cuda_scan.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

#include "ompi/op/op.h"
#include "opal/datatype/opal_convertor.h"
#include "opal/datatype/opal_datatype_cuda.h"
#include "opal/mca/common/cuda/common_cuda.h"

/*
* scan
Expand Down
2 changes: 1 addition & 1 deletion ompi/mca/coll/libnbc/nbc_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
#include "coll_libnbc.h"
#if OPAL_CUDA_SUPPORT
#include "opal/datatype/opal_convertor.h"
#include "opal/datatype/opal_datatype_cuda.h"
#include "opal/mca/common/cuda/common_cuda.h"
#endif /* OPAL_CUDA_SUPPORT */
#include "ompi/include/ompi/constants.h"
#include "ompi/request/request.h"
Expand Down
1 change: 0 additions & 1 deletion ompi/mca/common/ompio/common_ompio_buffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
#include "ompi_config.h"

#include "opal/datatype/opal_convertor.h"
#include "opal/datatype/opal_datatype_cuda.h"
#include "opal/mca/common/cuda/common_cuda.h"
#include "opal/util/sys_limits.h"

Expand Down
145 changes: 145 additions & 0 deletions ompi/mca/mtl/base/mtl_base_datatype.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,82 @@
#include "ompi/datatype/ompi_datatype.h"
#include "opal/datatype/opal_convertor.h"
#include "opal/datatype/opal_datatype_internal.h"
#if OPAL_CUDA_SUPPORT
#include "opal/mca/common/cuda/common_cuda.h"
#include "opal/datatype/opal_convertor.h"
#endif

#ifndef MTL_BASE_DATATYPE_H_INCLUDED
#define MTL_BASE_DATATYPE_H_INCLUDED

#if OPAL_CUDA_SUPPORT
/**
 * CUDA-aware variant of the MTL pack helper: produce a buffer holding the
 * packed representation of the data described by @p convertor.
 *
 * @param[in,out] convertor  Convertor describing the send data.  Its
 *                           CONVERTOR_CUDA flag is cleared temporarily and
 *                           restored before any allocation or packing.
 * @param[out]    buffer     Start of the data to send (NULL when the packed
 *                           size is zero).
 * @param[out]    buffer_len Number of bytes to send.
 * @param[out]    freeAfter  Set to true only when a staging buffer was
 *                           obtained from opal_cuda_malloc() here.
 *
 * @return OMPI_SUCCESS (OPAL_SUCCESS on the contiguous fast path), or
 *         OMPI_ERR_OUT_OF_RESOURCE when the staging allocation fails.
 */
static int
ompi_mtl_cuda_datatype_pack(struct opal_convertor_t *convertor,
                            void **buffer,
                            size_t *buffer_len,
                            bool *freeAfter)
{
    const int had_cuda_flag = convertor->flags & CONVERTOR_CUDA;
    struct iovec pack_iov;
    uint32_t pack_iov_count = 1;
    bool need_staging;

#if !(OPAL_ENABLE_HETEROGENEOUS_SUPPORT)
    /* Fast path: a not-yet-completed convertor over a contiguous layout
     * hands back the user's base buffer directly, with no packing. */
    if (convertor->pDesc &&
        !(convertor->flags & CONVERTOR_COMPLETED) &&
        opal_datatype_is_contiguous_memory_layout(convertor->pDesc,
                                                  convertor->count)) {
        *buffer = convertor->pBaseBuf;
        *buffer_len = convertor->local_size;
        *freeAfter = false;
        return OPAL_SUCCESS;
    }
#endif

    *freeAfter = false;
    opal_convertor_get_packed_size(convertor, buffer_len);
    if (0 == *buffer_len) {
        *buffer = NULL;
        return OMPI_SUCCESS;
    }

    /* opal_convertor_need_buffers always returns true if CONVERTOR_CUDA is
     * set, so query it with the flag masked out and put the flag back
     * before doing anything else with the convertor. */
    convertor->flags &= ~CONVERTOR_CUDA;
    need_staging = opal_convertor_need_buffers(convertor);
    if (had_cuda_flag) {
        convertor->flags |= CONVERTOR_CUDA;
    }

    pack_iov.iov_len = *buffer_len;
    pack_iov.iov_base = NULL;
    if (need_staging) {
        pack_iov.iov_base = opal_cuda_malloc(*buffer_len, convertor);
        if (NULL == pack_iov.iov_base) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        *freeAfter = true;
    }

    /* NOTE(review): when iov_base is left NULL the convertor is presumably
     * expected to fill it in with a pointer into the user data -- confirm
     * against opal_convertor_pack(). */
    opal_convertor_pack(convertor, &pack_iov, &pack_iov_count, buffer_len);

    *buffer = pack_iov.iov_base;
    return OMPI_SUCCESS;
}
#endif

__opal_attribute_always_inline__ static inline int
ompi_mtl_datatype_pack(struct opal_convertor_t *convertor,
void **buffer,
size_t *buffer_len,
bool *freeAfter)
{
#if OPAL_CUDA_SUPPORT
return ompi_mtl_cuda_datatype_pack(convertor,
buffer,
buffer_len,
freeAfter);
#endif
struct iovec iov;
uint32_t iov_count = 1;

Expand Down Expand Up @@ -71,13 +137,56 @@ ompi_mtl_datatype_pack(struct opal_convertor_t *convertor,
return OMPI_SUCCESS;
}

#if OPAL_CUDA_SUPPORT
static int
ompi_mtl_cuda_datatype_recv_buf(struct opal_convertor_t *convertor,
void ** buffer,
size_t *buffer_len,
bool *free_on_error)
{
int is_cuda = convertor->flags & CONVERTOR_CUDA;
opal_convertor_get_packed_size(convertor, buffer_len);
*free_on_error = false;
if( 0 == *buffer_len ) {
*buffer = NULL;
*buffer_len = 0;
return OMPI_SUCCESS;
}
/* opal_convertor_need_buffers always returns true
* if CONVERTOR_CUDA is set, so unset temporarily
*/
convertor->flags &= ~CONVERTOR_CUDA;
if (opal_convertor_need_buffers(convertor)) {
if (is_cuda) {
convertor->flags |= CONVERTOR_CUDA;
}
*buffer = opal_cuda_malloc(*buffer_len, convertor);
*free_on_error = true;
} else {
if (is_cuda) {
convertor->flags |= CONVERTOR_CUDA;
}
*buffer = convertor->pBaseBuf +
convertor->use_desc->desc[convertor->use_desc->used].end_loop.first_elem_disp;
}
return OMPI_SUCCESS;

}
#endif

__opal_attribute_always_inline__ static inline int
ompi_mtl_datatype_recv_buf(struct opal_convertor_t *convertor,
void ** buffer,
size_t *buffer_len,
bool *free_on_error)
{
#if OPAL_CUDA_SUPPORT
return ompi_mtl_cuda_datatype_recv_buf(convertor,
buffer,
buffer_len,
free_on_error);
#endif

opal_convertor_get_packed_size(convertor, buffer_len);
*free_on_error = false;
if( 0 == *buffer_len ) {
Expand All @@ -95,12 +204,48 @@ ompi_mtl_datatype_recv_buf(struct opal_convertor_t *convertor,
return OMPI_SUCCESS;
}

#if OPAL_CUDA_SUPPORT
/**
 * CUDA-aware variant of the MTL unpack helper: if the receive went through
 * a staging buffer, unpack it into the user's data and release it.
 *
 * @param[in,out] convertor  Convertor describing the receive data.  Its
 *                           CONVERTOR_CUDA flag is cleared temporarily and
 *                           restored before unpacking.
 * @param[in]     buffer     Buffer the message was received into.
 *                           NOTE(review): presumably the pointer produced by
 *                           ompi_mtl_datatype_recv_buf() -- confirm against
 *                           callers.
 * @param[in]     buffer_len Number of bytes held in @p buffer.
 *
 * @return Always OMPI_SUCCESS; the result of opal_convertor_unpack() is not
 *         inspected.
 */
static int
ompi_mtl_cuda_datatype_unpack(struct opal_convertor_t *convertor,
                              void *buffer,
                              size_t buffer_len)
{
    const int had_cuda_flag = convertor->flags & CONVERTOR_CUDA;
    struct iovec unpack_iov;
    uint32_t unpack_iov_count = 1;
    bool staged;

    /* opal_convertor_need_buffers always returns true if CONVERTOR_CUDA is
     * set, so query it with the flag masked out, then put the flag back. */
    convertor->flags &= ~CONVERTOR_CUDA;
    staged = (buffer_len > 0) && opal_convertor_need_buffers(convertor);
    if (had_cuda_flag) {
        convertor->flags |= CONVERTOR_CUDA;
    }

    /* Nothing to unpack or free when no staging buffer was involved. */
    if (!staged) {
        return OMPI_SUCCESS;
    }

    unpack_iov.iov_base = buffer;
    unpack_iov.iov_len = buffer_len;
    opal_convertor_unpack(convertor, &unpack_iov, &unpack_iov_count, &buffer_len);

    opal_cuda_free(buffer, convertor);

    return OMPI_SUCCESS;
}
#endif

__opal_attribute_always_inline__ static inline int
ompi_mtl_datatype_unpack(struct opal_convertor_t *convertor,
void *buffer,
size_t buffer_len)
{
#if OPAL_CUDA_SUPPORT
return ompi_mtl_cuda_datatype_unpack(convertor,
buffer,
buffer_len);
#endif
struct iovec iov;
uint32_t iov_count = 1;

Expand Down
11 changes: 11 additions & 0 deletions ompi/mca/mtl/ofi/configure.m4
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,17 @@ AC_DEFUN([MCA_ompi_mtl_ofi_CONFIG],[
# Check for OFI
OPAL_CHECK_OFI

# Check for CUDA
OPAL_CHECK_CUDA

# If CUDA support is enabled, require libfabric v1.9 or later, because
# FI_HMEM capabilities are only available starting with v1.9.
opal_ofi_happy="yes"
AS_IF([test "$opal_check_cuda_happy" = "yes"],
[OPAL_CHECK_OFI_VERSION_GE([1,9],
[],
[opal_ofi_happy=no])])

# The OFI MTL requires at least OFI libfabric v1.5.
AS_IF([test "$opal_ofi_happy" = "yes"],
[OPAL_CHECK_OFI_VERSION_GE([1,5],
Expand Down
9 changes: 9 additions & 0 deletions ompi/mca/mtl/ofi/help-mtl-ofi.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,12 @@ recoverable and your application is likely to abort.
Error: %s (%d)
[message too big]
Message size %llu bigger than supported by selected transport. Max = %llu

[Buffer Memory Registration Failed]
Open MPI failed to register your buffer.
This error is fatal; your job will abort.

Buffer Type: %s
Buffer Address: %p
Buffer Length: %d
Error: %s (%zd)
Loading