[flang][cuda] Pass the pinned variable in allocate calls #125310

clementval · 2025-01-31T22:35:47Z

Add the pinned variable in the CUFAllocatableAllocate and CUFPointerAllocate signatures and update CUFOpConversion to pass the value when available.
The pinned variable is currently set with the stat information. More runtime work is needed to set the variable close to the cudaMallocHost calls.

Add the pinned variable in the CUFAllocatableAllocate and CUFPointerAllocate signatures and update CUFOpConversion to pass the value when available.

llvmbot · 2025-01-31T22:36:19Z

@llvm/pr-subscribers-flang-runtime

Author: Valentin Clement (バレンタインクレメン) (clementval)

Changes

Add the pinned variable in the CUFAllocatableAllocate and CUFPointerAllocate signatures and update CUFOpConversion to pass the value when available.
The pinned variable is currently set with the stat information. More runtime work is needed to set the variable close to the cudaMallocHost calls.

Patch is 21.62 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125310.diff

7 Files Affected:

(modified) flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h (+7)
(modified) flang/include/flang/Runtime/CUDA/allocatable.h (+12-10)
(modified) flang/include/flang/Runtime/CUDA/pointer.h (+12-10)
(modified) flang/lib/Optimizer/Transforms/CUFOpConversion.cpp (+12-11)
(modified) flang/runtime/CUDA/allocatable.cpp (+14-9)
(modified) flang/runtime/CUDA/pointer.cpp (+15-9)
(modified) flang/test/Fir/CUDA/cuda-allocate.fir (+21-6)

diff --git a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
index eaa1de76154d93..116b927a86a7a9 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
@@ -401,6 +401,13 @@ constexpr TypeBuilderFunc getModel<bool &>() {
   };
 }
 template <>
+constexpr TypeBuilderFunc getModel<bool *>() {
+  return [](mlir::MLIRContext *context) -> mlir::Type {
+    TypeBuilderFunc f{getModel<bool>()};
+    return fir::ReferenceType::get(f(context));
+  };
+}
+template <>
 constexpr TypeBuilderFunc getModel<unsigned short>() {
   return [](mlir::MLIRContext *context) -> mlir::Type {
     return mlir::IntegerType::get(
diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index 0a96f73b6be44b..822f2d4a2b297d 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -18,28 +18,30 @@ extern "C" {
 
 /// Perform allocation of the descriptor.
 int RTDECL(CUFAllocatableAllocate)(Descriptor &, int64_t stream = -1,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
 int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t stream = -1,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, int64_t stream = -1, bool hasStat = false,
-    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+    const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+    bool hasStat = false, const Descriptor *errMsg = nullptr,
+    const char *sourceFile = nullptr, int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary. Assign data from source.
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, int64_t stream = -1, bool hasStat = false,
-    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+    const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+    bool hasStat = false, const Descriptor *errMsg = nullptr,
+    const char *sourceFile = nullptr, int sourceLine = 0);
 
 /// Perform deallocation of the descriptor with synchronization of it when
 /// necessary.
diff --git a/flang/include/flang/Runtime/CUDA/pointer.h b/flang/include/flang/Runtime/CUDA/pointer.h
index 78c7a1a92b7ea9..7fbd8f8e061f20 100644
--- a/flang/include/flang/Runtime/CUDA/pointer.h
+++ b/flang/include/flang/Runtime/CUDA/pointer.h
@@ -18,28 +18,30 @@ extern "C" {
 
 /// Perform allocation of the descriptor.
 int RTDECL(CUFPointerAllocate)(Descriptor &, int64_t stream = -1,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
 int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t stream = -1,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
 int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
-    const Descriptor &source, int64_t stream = -1, bool hasStat = false,
-    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+    const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+    bool hasStat = false, const Descriptor *errMsg = nullptr,
+    const char *sourceFile = nullptr, int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary. Assign data from source.
 int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
-    const Descriptor &source, int64_t stream = -1, bool hasStat = false,
-    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+    const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+    bool hasStat = false, const Descriptor *errMsg = nullptr,
+    const char *sourceFile = nullptr, int sourceLine = 0);
 
 } // extern "C"
 
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 77aa11f0603f69..549498f5585a68 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -103,7 +103,7 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
   mlir::Value sourceLine;
   if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>)
     sourceLine = fir::factory::locationToLineNo(
-        builder, loc, op.getSource() ? fTy.getInput(6) : fTy.getInput(5));
+        builder, loc, op.getSource() ? fTy.getInput(7) : fTy.getInput(6));
   else
     sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
 
@@ -119,22 +119,28 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
   }
   llvm::SmallVector<mlir::Value> args;
   if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) {
+    mlir::Value pinned =
+        op.getPinned()
+            ? op.getPinned()
+            : builder.createNullConstant(
+                  loc, fir::ReferenceType::get(
+                           mlir::IntegerType::get(op.getContext(), 1)));
     if (op.getSource()) {
       mlir::Value stream =
           op.getStream()
               ? op.getStream()
               : builder.createIntegerConstant(loc, fTy.getInput(2), -1);
-      args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
-                                           op.getSource(), stream, hasStat,
-                                           errmsg, sourceFile, sourceLine);
+      args = fir::runtime::createArguments(
+          builder, loc, fTy, op.getBox(), op.getSource(), stream, pinned,
+          hasStat, errmsg, sourceFile, sourceLine);
     } else {
       mlir::Value stream =
           op.getStream()
               ? op.getStream()
               : builder.createIntegerConstant(loc, fTy.getInput(1), -1);
       args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
-                                           stream, hasStat, errmsg, sourceFile,
-                                           sourceLine);
+                                           stream, pinned, hasStat, errmsg,
+                                           sourceFile, sourceLine);
     }
   } else {
     args =
@@ -153,11 +159,6 @@ struct CUFAllocateOpConversion
   mlir::LogicalResult
   matchAndRewrite(cuf::AllocateOp op,
                   mlir::PatternRewriter &rewriter) const override {
-    // TODO: Pinned is a reference to a logical value that can be set to true
-    // when pinned allocation succeed. This will require a new entry point.
-    if (op.getPinned())
-      return mlir::failure();
-
     auto mod = op->getParentOfType<mlir::ModuleOp>();
     fir::FirOpBuilder builder(rewriter, mod);
     mlir::Location loc = op.getLoc();
diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
index 9be54e8906903d..6df3b06793b3e6 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -23,10 +23,10 @@ extern "C" {
 RT_EXT_API_GROUP_BEGIN
 
 int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream,
-    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
-    int sourceLine) {
+    bool *pinned, bool hasStat, const Descriptor *errMsg,
+    const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFAllocatableAllocate)(
-      desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
+      desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
 #ifndef RT_DEVICE_COMPILATION
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
@@ -41,8 +41,8 @@ int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream,
 }
 
 int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
-    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
-    int sourceLine) {
+    bool *pinned, bool hasStat, const Descriptor *errMsg,
+    const char *sourceFile, int sourceLine) {
   if (desc.HasAddendum()) {
     Terminator terminator{sourceFile, sourceLine};
     // TODO: This require a bit more work to set the correct type descriptor
@@ -53,14 +53,19 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
   // Perform the standard allocation.
   int stat{RTNAME(AllocatableAllocate)(
       desc, hasStat, errMsg, sourceFile, sourceLine)};
+  if (pinned) {
+    // Set pinned according to stat. More infrastructre is needed to set it
+    // closer to the actual allocation call.
+    *pinned = (stat == StatOk);
+  }
   return stat;
 }
 
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, int64_t stream, bool hasStat,
+    const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFAllocatableAllocate)(
-      alloc, stream, hasStat, errMsg, sourceFile, sourceLine)};
+      alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
@@ -70,10 +75,10 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
 }
 
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, int64_t stream, bool hasStat,
+    const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFAllocatableAllocateSync)(
-      alloc, stream, hasStat, errMsg, sourceFile, sourceLine)};
+      alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
diff --git a/flang/runtime/CUDA/pointer.cpp b/flang/runtime/CUDA/pointer.cpp
index 3252410bd8d2c2..d3ebe97b4e4acc 100644
--- a/flang/runtime/CUDA/pointer.cpp
+++ b/flang/runtime/CUDA/pointer.cpp
@@ -21,8 +21,9 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 RT_EXT_API_GROUP_BEGIN
 
-int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool hasStat,
-    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool *pinned,
+    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+    int sourceLine) {
   if (desc.HasAddendum()) {
     Terminator terminator{sourceFile, sourceLine};
     // TODO: This require a bit more work to set the correct type descriptor
@@ -33,14 +34,19 @@ int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool hasStat,
   // Perform the standard allocation.
   int stat{
       RTNAME(PointerAllocate)(desc, hasStat, errMsg, sourceFile, sourceLine)};
+  if (pinned) {
+    // Set pinned according to stat. More infrastructre is needed to set it
+    // closer to the actual allocation call.
+    *pinned = (stat == StatOk);
+  }
   return stat;
 }
 
 int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t stream,
-    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
-    int sourceLine) {
+    bool *pinned, bool hasStat, const Descriptor *errMsg,
+    const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFPointerAllocate)(
-      desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
+      desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
 #ifndef RT_DEVICE_COMPILATION
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
@@ -55,10 +61,10 @@ int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t stream,
 }
 
 int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
-    const Descriptor &source, int64_t stream, bool hasStat,
+    const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFPointerAllocate)(
-      pointer, stream, hasStat, errMsg, sourceFile, sourceLine)};
+      pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
@@ -68,10 +74,10 @@ int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
 }
 
 int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
-    const Descriptor &source, int64_t stream, bool hasStat,
+    const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFPointerAllocateSync)(
-      pointer, stream, hasStat, errMsg, sourceFile, sourceLine)};
+      pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index 08573110821cc2..095ad92d5deb50 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -19,7 +19,7 @@ func.func @_QPsub1() {
 // CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 // CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -47,7 +47,7 @@ func.func @_QPsub3() {
 // CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 
 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: fir.call @_FortranACUFAllocatableDeallocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -87,7 +87,7 @@ func.func @_QPsub5() {
 }
 
 // CHECK-LABEL: func.func @_QPsub5()
-// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
@@ -118,7 +118,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
 // CHECK: %[[B:.*]]:2 = hlfir.declare %[[B_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdataEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
 // CHECK: _FortranAAllocatableSetBounds
 // CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
 func.func @_QPallocate_source() {
@@ -142,7 +142,7 @@ func.func @_QPallocate_source() {
 // CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
 // CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
 fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
@@ -179,7 +179,7 @@ func.func @_QQallocate_stream() {
 // CHECK: %[[STREAM_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
 // CHECK: %[[STREAM:.*]] = fir.declare %[[STREAM_ALLOCA]] {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
 // CHECK: %[[STREAM_LOAD:.*]] = fir.load %[[STREAM]] : !fir.ref<i64>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
 func.func @_QPp_alloc() {
@@ -255,4 +255,19 @@ func.func @_QMmod1Ppointer_source_global() {
 // CHECK-LABE...
[truncated]

llvmbot · 2025-01-31T22:36:19Z

@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタインクレメン) (clementval)

Changes

Add the pinned variable in the CUFAllocatableAllocate and CUFPointerAllocate signatures and update CUFOpConversion to pass the value when available.
The pinned variable is currently set with the stat information. More runtime work is needed to set the variable close to the cudaMallocHost calls.

Patch is 21.62 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125310.diff

7 Files Affected:

(modified) flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h (+7)
(modified) flang/include/flang/Runtime/CUDA/allocatable.h (+12-10)
(modified) flang/include/flang/Runtime/CUDA/pointer.h (+12-10)
(modified) flang/lib/Optimizer/Transforms/CUFOpConversion.cpp (+12-11)
(modified) flang/runtime/CUDA/allocatable.cpp (+14-9)
(modified) flang/runtime/CUDA/pointer.cpp (+15-9)
(modified) flang/test/Fir/CUDA/cuda-allocate.fir (+21-6)

diff --git a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
index eaa1de76154d93..116b927a86a7a9 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
@@ -401,6 +401,13 @@ constexpr TypeBuilderFunc getModel<bool &>() {
   };
 }
 template <>
+constexpr TypeBuilderFunc getModel<bool *>() {
+  return [](mlir::MLIRContext *context) -> mlir::Type {
+    TypeBuilderFunc f{getModel<bool>()};
+    return fir::ReferenceType::get(f(context));
+  };
+}
+template <>
 constexpr TypeBuilderFunc getModel<unsigned short>() {
   return [](mlir::MLIRContext *context) -> mlir::Type {
     return mlir::IntegerType::get(
diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index 0a96f73b6be44b..822f2d4a2b297d 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -18,28 +18,30 @@ extern "C" {
 
 /// Perform allocation of the descriptor.
 int RTDECL(CUFAllocatableAllocate)(Descriptor &, int64_t stream = -1,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
 int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t stream = -1,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, int64_t stream = -1, bool hasStat = false,
-    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+    const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+    bool hasStat = false, const Descriptor *errMsg = nullptr,
+    const char *sourceFile = nullptr, int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary. Assign data from source.
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, int64_t stream = -1, bool hasStat = false,
-    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+    const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+    bool hasStat = false, const Descriptor *errMsg = nullptr,
+    const char *sourceFile = nullptr, int sourceLine = 0);
 
 /// Perform deallocation of the descriptor with synchronization of it when
 /// necessary.
diff --git a/flang/include/flang/Runtime/CUDA/pointer.h b/flang/include/flang/Runtime/CUDA/pointer.h
index 78c7a1a92b7ea9..7fbd8f8e061f20 100644
--- a/flang/include/flang/Runtime/CUDA/pointer.h
+++ b/flang/include/flang/Runtime/CUDA/pointer.h
@@ -18,28 +18,30 @@ extern "C" {
 
 /// Perform allocation of the descriptor.
 int RTDECL(CUFPointerAllocate)(Descriptor &, int64_t stream = -1,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
 int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t stream = -1,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
 int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
-    const Descriptor &source, int64_t stream = -1, bool hasStat = false,
-    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+    const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+    bool hasStat = false, const Descriptor *errMsg = nullptr,
+    const char *sourceFile = nullptr, int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary. Assign data from source.
 int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
-    const Descriptor &source, int64_t stream = -1, bool hasStat = false,
-    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+    const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+    bool hasStat = false, const Descriptor *errMsg = nullptr,
+    const char *sourceFile = nullptr, int sourceLine = 0);
 
 } // extern "C"
 
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 77aa11f0603f69..549498f5585a68 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -103,7 +103,7 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
   mlir::Value sourceLine;
   if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>)
     sourceLine = fir::factory::locationToLineNo(
-        builder, loc, op.getSource() ? fTy.getInput(6) : fTy.getInput(5));
+        builder, loc, op.getSource() ? fTy.getInput(7) : fTy.getInput(6));
   else
     sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
 
@@ -119,22 +119,28 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
   }
   llvm::SmallVector<mlir::Value> args;
   if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) {
+    mlir::Value pinned =
+        op.getPinned()
+            ? op.getPinned()
+            : builder.createNullConstant(
+                  loc, fir::ReferenceType::get(
+                           mlir::IntegerType::get(op.getContext(), 1)));
     if (op.getSource()) {
       mlir::Value stream =
           op.getStream()
               ? op.getStream()
               : builder.createIntegerConstant(loc, fTy.getInput(2), -1);
-      args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
-                                           op.getSource(), stream, hasStat,
-                                           errmsg, sourceFile, sourceLine);
+      args = fir::runtime::createArguments(
+          builder, loc, fTy, op.getBox(), op.getSource(), stream, pinned,
+          hasStat, errmsg, sourceFile, sourceLine);
     } else {
       mlir::Value stream =
           op.getStream()
               ? op.getStream()
               : builder.createIntegerConstant(loc, fTy.getInput(1), -1);
       args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
-                                           stream, hasStat, errmsg, sourceFile,
-                                           sourceLine);
+                                           stream, pinned, hasStat, errmsg,
+                                           sourceFile, sourceLine);
     }
   } else {
     args =
@@ -153,11 +159,6 @@ struct CUFAllocateOpConversion
   mlir::LogicalResult
   matchAndRewrite(cuf::AllocateOp op,
                   mlir::PatternRewriter &rewriter) const override {
-    // TODO: Pinned is a reference to a logical value that can be set to true
-    // when pinned allocation succeed. This will require a new entry point.
-    if (op.getPinned())
-      return mlir::failure();
-
     auto mod = op->getParentOfType<mlir::ModuleOp>();
     fir::FirOpBuilder builder(rewriter, mod);
     mlir::Location loc = op.getLoc();
diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
index 9be54e8906903d..6df3b06793b3e6 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -23,10 +23,10 @@ extern "C" {
 RT_EXT_API_GROUP_BEGIN
 
 int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream,
-    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
-    int sourceLine) {
+    bool *pinned, bool hasStat, const Descriptor *errMsg,
+    const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFAllocatableAllocate)(
-      desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
+      desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
 #ifndef RT_DEVICE_COMPILATION
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
@@ -41,8 +41,8 @@ int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream,
 }
 
 int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
-    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
-    int sourceLine) {
+    bool *pinned, bool hasStat, const Descriptor *errMsg,
+    const char *sourceFile, int sourceLine) {
   if (desc.HasAddendum()) {
     Terminator terminator{sourceFile, sourceLine};
     // TODO: This require a bit more work to set the correct type descriptor
@@ -53,14 +53,19 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
   // Perform the standard allocation.
   int stat{RTNAME(AllocatableAllocate)(
       desc, hasStat, errMsg, sourceFile, sourceLine)};
+  if (pinned) {
+    // Set pinned according to stat. More infrastructre is needed to set it
+    // closer to the actual allocation call.
+    *pinned = (stat == StatOk);
+  }
   return stat;
 }
 
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, int64_t stream, bool hasStat,
+    const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFAllocatableAllocate)(
-      alloc, stream, hasStat, errMsg, sourceFile, sourceLine)};
+      alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
@@ -70,10 +75,10 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
 }
 
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, int64_t stream, bool hasStat,
+    const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFAllocatableAllocateSync)(
-      alloc, stream, hasStat, errMsg, sourceFile, sourceLine)};
+      alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
diff --git a/flang/runtime/CUDA/pointer.cpp b/flang/runtime/CUDA/pointer.cpp
index 3252410bd8d2c2..d3ebe97b4e4acc 100644
--- a/flang/runtime/CUDA/pointer.cpp
+++ b/flang/runtime/CUDA/pointer.cpp
@@ -21,8 +21,9 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 RT_EXT_API_GROUP_BEGIN
 
-int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool hasStat,
-    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool *pinned,
+    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+    int sourceLine) {
   if (desc.HasAddendum()) {
     Terminator terminator{sourceFile, sourceLine};
     // TODO: This require a bit more work to set the correct type descriptor
@@ -33,14 +34,19 @@ int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool hasStat,
   // Perform the standard allocation.
   int stat{
       RTNAME(PointerAllocate)(desc, hasStat, errMsg, sourceFile, sourceLine)};
+  if (pinned) {
+    // Set pinned according to stat. More infrastructre is needed to set it
+    // closer to the actual allocation call.
+    *pinned = (stat == StatOk);
+  }
   return stat;
 }
 
 int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t stream,
-    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
-    int sourceLine) {
+    bool *pinned, bool hasStat, const Descriptor *errMsg,
+    const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFPointerAllocate)(
-      desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
+      desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
 #ifndef RT_DEVICE_COMPILATION
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
@@ -55,10 +61,10 @@ int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t stream,
 }
 
 int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
-    const Descriptor &source, int64_t stream, bool hasStat,
+    const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFPointerAllocate)(
-      pointer, stream, hasStat, errMsg, sourceFile, sourceLine)};
+      pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
@@ -68,10 +74,10 @@ int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
 }
 
 int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
-    const Descriptor &source, int64_t stream, bool hasStat,
+    const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFPointerAllocateSync)(
-      pointer, stream, hasStat, errMsg, sourceFile, sourceLine)};
+      pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index 08573110821cc2..095ad92d5deb50 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -19,7 +19,7 @@ func.func @_QPsub1() {
 // CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 // CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -47,7 +47,7 @@ func.func @_QPsub3() {
 // CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 
 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: fir.call @_FortranACUFAllocatableDeallocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -87,7 +87,7 @@ func.func @_QPsub5() {
 }
 
 // CHECK-LABEL: func.func @_QPsub5()
-// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
@@ -118,7 +118,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
 // CHECK: %[[B:.*]]:2 = hlfir.declare %[[B_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdataEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
 // CHECK: _FortranAAllocatableSetBounds
 // CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
 func.func @_QPallocate_source() {
@@ -142,7 +142,7 @@ func.func @_QPallocate_source() {
 // CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
 // CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
 fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
@@ -179,7 +179,7 @@ func.func @_QQallocate_stream() {
 // CHECK: %[[STREAM_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
 // CHECK: %[[STREAM:.*]] = fir.declare %[[STREAM_ALLOCA]] {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
 // CHECK: %[[STREAM_LOAD:.*]] = fir.load %[[STREAM]] : !fir.ref<i64>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
 func.func @_QPp_alloc() {
@@ -255,4 +255,19 @@ func.func @_QMmod1Ppointer_source_global() {
 // CHECK-LABE...
[truncated]

[flang][cuda] Pass the pinned variable in allocate calls

120f8c7

Add the pinned variable in the CUFAllocatableAllocate and CUFPointerAllocate signatures and update CUFOpConversion to pass the value when available.

clementval requested review from wangzpgi and Renaud-K January 31, 2025 22:35

llvmbot added flang:runtime flang Flang issues not falling into any other category flang:fir-hlfir labels Jan 31, 2025

Renaud-K approved these changes Feb 2, 2025

View reviewed changes

clementval merged commit f1b075d into llvm:main Feb 3, 2025
12 checks passed

clementval deleted the cuf_pinned branch February 3, 2025 02:06

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[flang][cuda] Pass the pinned variable in allocate calls #125310

[flang][cuda] Pass the pinned variable in allocate calls #125310

Uh oh!

clementval commented Jan 31, 2025

Uh oh!

llvmbot commented Jan 31, 2025

Uh oh!

llvmbot commented Jan 31, 2025

Uh oh!

Uh oh!

Uh oh!

[flang][cuda] Pass the pinned variable in allocate calls #125310

[flang][cuda] Pass the pinned variable in allocate calls #125310

Uh oh!

Conversation

clementval commented Jan 31, 2025

Uh oh!

llvmbot commented Jan 31, 2025

Uh oh!

llvmbot commented Jan 31, 2025

Uh oh!

Uh oh!

Uh oh!