diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h index 40423c5ce0488..618da44c675d8 100644 --- a/flang/include/flang/Runtime/CUDA/allocator.h +++ b/flang/include/flang/Runtime/CUDA/allocator.h @@ -9,6 +9,7 @@ #ifndef FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_ #define FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_ +#include "common.h" #include "flang/Runtime/descriptor.h" #include "flang/Runtime/entry-names.h" @@ -19,16 +20,16 @@ extern "C" { void RTDECL(CUFRegisterAllocator)(); } -void *CUFAllocPinned(std::size_t, std::int64_t); +void *CUFAllocPinned(std::size_t, std::int64_t = kCudaNoStream); void CUFFreePinned(void *); void *CUFAllocDevice(std::size_t, std::int64_t); void CUFFreeDevice(void *); -void *CUFAllocManaged(std::size_t, std::int64_t); +void *CUFAllocManaged(std::size_t, std::int64_t = kCudaNoStream); void CUFFreeManaged(void *); -void *CUFAllocUnified(std::size_t, std::int64_t); +void *CUFAllocUnified(std::size_t, std::int64_t = kCudaNoStream); void CUFFreeUnified(void *); } // namespace Fortran::runtime::cuda diff --git a/flang/runtime/CUDA/allocator.cpp b/flang/runtime/CUDA/allocator.cpp index e41ed77e40ff9..d848f1811dcf3 100644 --- a/flang/runtime/CUDA/allocator.cpp +++ b/flang/runtime/CUDA/allocator.cpp @@ -33,8 +33,7 @@ void RTDEF(CUFRegisterAllocator)() { } } -void *CUFAllocPinned( - std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) { +void *CUFAllocPinned(std::size_t sizeInBytes, std::int64_t) { void *p; CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes)); return p; @@ -42,17 +41,20 @@ void *CUFAllocPinned( void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); } -void *CUFAllocDevice( - std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) { +void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t stream) { void *p; - CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes)); + if (stream >= 0) { + CUDA_REPORT_IF_ERROR( + cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)stream)); + } else { + CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes)); + } return p; } void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); } -void *CUFAllocManaged( - std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) { +void *CUFAllocManaged(std::size_t sizeInBytes, std::int64_t) { void *p; CUDA_REPORT_IF_ERROR( cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal)); @@ -61,10 +63,9 @@ void *CUFAllocManaged( void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); } -void *CUFAllocUnified( - std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) { +void *CUFAllocUnified(std::size_t sizeInBytes, std::int64_t) { // Call alloc managed for the time being. - return CUFAllocManaged(sizeInBytes, asyncId); + return CUFAllocManaged(sizeInBytes); } void CUFFreeUnified(void *p) { diff --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp index 435172890472d..6ea842e775c11 100644 --- a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp +++ b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp @@ -43,6 +43,23 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocate) { EXPECT_FALSE(a->IsAllocated()); } +TEST(AllocatableCUFTest, SimpleStreamDeviceAllocate) { + using Fortran::common::TypeCategory; + RTNAME(CUFRegisterAllocator)(); + // REAL(4), DEVICE, ALLOCATABLE :: a(:) + auto a{createAllocatable(TypeCategory::Real, 4)}; + a->SetAllocIdx(kDeviceAllocatorPos); + EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx()); + EXPECT_FALSE(a->HasAddendum()); + RTNAME(AllocatableSetBounds)(*a, 0, 1, 10); + RTNAME(AllocatableAllocate) + (*a, 1, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_TRUE(a->IsAllocated()); + RTNAME(AllocatableDeallocate) + (*a, /*hasStat=*/false, /*errMsg=*/nullptr, __FILE__, __LINE__); + EXPECT_FALSE(a->IsAllocated()); +} + TEST(AllocatableCUFTest, SimplePinnedAllocate) { using Fortran::common::TypeCategory; RTNAME(CUFRegisterAllocator)();