-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[flang][cuda] Pass the pinned variable in allocate calls #125310
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Add the pinned variable in the CUFAllocatableAllocate and CUFPointerAllocate signatures and update CUFOpConversion to pass the value when available.
@llvm/pr-subscribers-flang-runtime Author: Valentin Clement (バレンタイン クレメン) (clementval) Changes: Add the pinned variable in the CUFAllocatableAllocate and CUFPointerAllocate signatures and update CUFOpConversion to pass the value when available. Patch is 21.62 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125310.diff 7 Files Affected:
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
index eaa1de76154d93..116b927a86a7a9 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
@@ -401,6 +401,13 @@ constexpr TypeBuilderFunc getModel<bool &>() {
};
}
template <>
+constexpr TypeBuilderFunc getModel<bool *>() {
+ return [](mlir::MLIRContext *context) -> mlir::Type {
+ TypeBuilderFunc f{getModel<bool>()};
+ return fir::ReferenceType::get(f(context));
+ };
+}
+template <>
constexpr TypeBuilderFunc getModel<unsigned short>() {
return [](mlir::MLIRContext *context) -> mlir::Type {
return mlir::IntegerType::get(
diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index 0a96f73b6be44b..822f2d4a2b297d 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -18,28 +18,30 @@ extern "C" {
/// Perform allocation of the descriptor.
int RTDECL(CUFAllocatableAllocate)(Descriptor &, int64_t stream = -1,
- bool hasStat = false, const Descriptor *errMsg = nullptr,
- const char *sourceFile = nullptr, int sourceLine = 0);
+ bool *pinned = nullptr, bool hasStat = false,
+ const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+ int sourceLine = 0);
/// Perform allocation of the descriptor with synchronization of it when
/// necessary.
int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t stream = -1,
- bool hasStat = false, const Descriptor *errMsg = nullptr,
- const char *sourceFile = nullptr, int sourceLine = 0);
+ bool *pinned = nullptr, bool hasStat = false,
+ const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+ int sourceLine = 0);
/// Perform allocation of the descriptor without synchronization. Assign data
/// from source.
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
- const Descriptor &source, int64_t stream = -1, bool hasStat = false,
- const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
- int sourceLine = 0);
+ const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+ bool hasStat = false, const Descriptor *errMsg = nullptr,
+ const char *sourceFile = nullptr, int sourceLine = 0);
/// Perform allocation of the descriptor with synchronization of it when
/// necessary. Assign data from source.
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
- const Descriptor &source, int64_t stream = -1, bool hasStat = false,
- const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
- int sourceLine = 0);
+ const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+ bool hasStat = false, const Descriptor *errMsg = nullptr,
+ const char *sourceFile = nullptr, int sourceLine = 0);
/// Perform deallocation of the descriptor with synchronization of it when
/// necessary.
diff --git a/flang/include/flang/Runtime/CUDA/pointer.h b/flang/include/flang/Runtime/CUDA/pointer.h
index 78c7a1a92b7ea9..7fbd8f8e061f20 100644
--- a/flang/include/flang/Runtime/CUDA/pointer.h
+++ b/flang/include/flang/Runtime/CUDA/pointer.h
@@ -18,28 +18,30 @@ extern "C" {
/// Perform allocation of the descriptor.
int RTDECL(CUFPointerAllocate)(Descriptor &, int64_t stream = -1,
- bool hasStat = false, const Descriptor *errMsg = nullptr,
- const char *sourceFile = nullptr, int sourceLine = 0);
+ bool *pinned = nullptr, bool hasStat = false,
+ const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+ int sourceLine = 0);
/// Perform allocation of the descriptor with synchronization of it when
/// necessary.
int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t stream = -1,
- bool hasStat = false, const Descriptor *errMsg = nullptr,
- const char *sourceFile = nullptr, int sourceLine = 0);
+ bool *pinned = nullptr, bool hasStat = false,
+ const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+ int sourceLine = 0);
/// Perform allocation of the descriptor without synchronization. Assign data
/// from source.
int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
- const Descriptor &source, int64_t stream = -1, bool hasStat = false,
- const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
- int sourceLine = 0);
+ const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+ bool hasStat = false, const Descriptor *errMsg = nullptr,
+ const char *sourceFile = nullptr, int sourceLine = 0);
/// Perform allocation of the descriptor with synchronization of it when
/// necessary. Assign data from source.
int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
- const Descriptor &source, int64_t stream = -1, bool hasStat = false,
- const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
- int sourceLine = 0);
+ const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+ bool hasStat = false, const Descriptor *errMsg = nullptr,
+ const char *sourceFile = nullptr, int sourceLine = 0);
} // extern "C"
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 77aa11f0603f69..549498f5585a68 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -103,7 +103,7 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
mlir::Value sourceLine;
if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>)
sourceLine = fir::factory::locationToLineNo(
- builder, loc, op.getSource() ? fTy.getInput(6) : fTy.getInput(5));
+ builder, loc, op.getSource() ? fTy.getInput(7) : fTy.getInput(6));
else
sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
@@ -119,22 +119,28 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
}
llvm::SmallVector<mlir::Value> args;
if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) {
+ mlir::Value pinned =
+ op.getPinned()
+ ? op.getPinned()
+ : builder.createNullConstant(
+ loc, fir::ReferenceType::get(
+ mlir::IntegerType::get(op.getContext(), 1)));
if (op.getSource()) {
mlir::Value stream =
op.getStream()
? op.getStream()
: builder.createIntegerConstant(loc, fTy.getInput(2), -1);
- args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
- op.getSource(), stream, hasStat,
- errmsg, sourceFile, sourceLine);
+ args = fir::runtime::createArguments(
+ builder, loc, fTy, op.getBox(), op.getSource(), stream, pinned,
+ hasStat, errmsg, sourceFile, sourceLine);
} else {
mlir::Value stream =
op.getStream()
? op.getStream()
: builder.createIntegerConstant(loc, fTy.getInput(1), -1);
args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
- stream, hasStat, errmsg, sourceFile,
- sourceLine);
+ stream, pinned, hasStat, errmsg,
+ sourceFile, sourceLine);
}
} else {
args =
@@ -153,11 +159,6 @@ struct CUFAllocateOpConversion
mlir::LogicalResult
matchAndRewrite(cuf::AllocateOp op,
mlir::PatternRewriter &rewriter) const override {
- // TODO: Pinned is a reference to a logical value that can be set to true
- // when pinned allocation succeed. This will require a new entry point.
- if (op.getPinned())
- return mlir::failure();
-
auto mod = op->getParentOfType<mlir::ModuleOp>();
fir::FirOpBuilder builder(rewriter, mod);
mlir::Location loc = op.getLoc();
diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
index 9be54e8906903d..6df3b06793b3e6 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -23,10 +23,10 @@ extern "C" {
RT_EXT_API_GROUP_BEGIN
int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream,
- bool hasStat, const Descriptor *errMsg, const char *sourceFile,
- int sourceLine) {
+ bool *pinned, bool hasStat, const Descriptor *errMsg,
+ const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFAllocatableAllocate)(
- desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
+ desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
#ifndef RT_DEVICE_COMPILATION
// Descriptor synchronization is only done when the allocation is done
// from the host.
@@ -41,8 +41,8 @@ int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream,
}
int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
- bool hasStat, const Descriptor *errMsg, const char *sourceFile,
- int sourceLine) {
+ bool *pinned, bool hasStat, const Descriptor *errMsg,
+ const char *sourceFile, int sourceLine) {
if (desc.HasAddendum()) {
Terminator terminator{sourceFile, sourceLine};
// TODO: This require a bit more work to set the correct type descriptor
@@ -53,14 +53,19 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
// Perform the standard allocation.
int stat{RTNAME(AllocatableAllocate)(
desc, hasStat, errMsg, sourceFile, sourceLine)};
+ if (pinned) {
+ // Set pinned according to stat. More infrastructure is needed to set it
+ // closer to the actual allocation call.
+ *pinned = (stat == StatOk);
+ }
return stat;
}
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
- const Descriptor &source, int64_t stream, bool hasStat,
+ const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFAllocatableAllocate)(
- alloc, stream, hasStat, errMsg, sourceFile, sourceLine)};
+ alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
Fortran::runtime::DoFromSourceAssign(
@@ -70,10 +75,10 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
}
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
- const Descriptor &source, int64_t stream, bool hasStat,
+ const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFAllocatableAllocateSync)(
- alloc, stream, hasStat, errMsg, sourceFile, sourceLine)};
+ alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
Fortran::runtime::DoFromSourceAssign(
diff --git a/flang/runtime/CUDA/pointer.cpp b/flang/runtime/CUDA/pointer.cpp
index 3252410bd8d2c2..d3ebe97b4e4acc 100644
--- a/flang/runtime/CUDA/pointer.cpp
+++ b/flang/runtime/CUDA/pointer.cpp
@@ -21,8 +21,9 @@ namespace Fortran::runtime::cuda {
extern "C" {
RT_EXT_API_GROUP_BEGIN
-int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool hasStat,
- const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool *pinned,
+ bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+ int sourceLine) {
if (desc.HasAddendum()) {
Terminator terminator{sourceFile, sourceLine};
// TODO: This require a bit more work to set the correct type descriptor
@@ -33,14 +34,19 @@ int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool hasStat,
// Perform the standard allocation.
int stat{
RTNAME(PointerAllocate)(desc, hasStat, errMsg, sourceFile, sourceLine)};
+ if (pinned) {
+ // Set pinned according to stat. More infrastructure is needed to set it
+ // closer to the actual allocation call.
+ *pinned = (stat == StatOk);
+ }
return stat;
}
int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t stream,
- bool hasStat, const Descriptor *errMsg, const char *sourceFile,
- int sourceLine) {
+ bool *pinned, bool hasStat, const Descriptor *errMsg,
+ const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFPointerAllocate)(
- desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
+ desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
#ifndef RT_DEVICE_COMPILATION
// Descriptor synchronization is only done when the allocation is done
// from the host.
@@ -55,10 +61,10 @@ int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t stream,
}
int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
- const Descriptor &source, int64_t stream, bool hasStat,
+ const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFPointerAllocate)(
- pointer, stream, hasStat, errMsg, sourceFile, sourceLine)};
+ pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
Fortran::runtime::DoFromSourceAssign(
@@ -68,10 +74,10 @@ int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
}
int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
- const Descriptor &source, int64_t stream, bool hasStat,
+ const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFPointerAllocateSync)(
- pointer, stream, hasStat, errMsg, sourceFile, sourceLine)};
+ pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
Fortran::runtime::DoFromSourceAssign(
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index 08573110821cc2..095ad92d5deb50 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -19,7 +19,7 @@ func.func @_QPsub1() {
// CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
// CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -47,7 +47,7 @@ func.func @_QPsub3() {
// CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
// CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: fir.call @_FortranACUFAllocatableDeallocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -87,7 +87,7 @@ func.func @_QPsub5() {
}
// CHECK-LABEL: func.func @_QPsub5()
-// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -118,7 +118,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
// CHECK: %[[B:.*]]:2 = hlfir.declare %[[B_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdataEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
// CHECK: _FortranAAllocatableSetBounds
// CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
func.func @_QPallocate_source() {
@@ -142,7 +142,7 @@ func.func @_QPallocate_source() {
// CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
// CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
@@ -179,7 +179,7 @@ func.func @_QQallocate_stream() {
// CHECK: %[[STREAM_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
// CHECK: %[[STREAM:.*]] = fir.declare %[[STREAM_ALLOCA]] {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
// CHECK: %[[STREAM_LOAD:.*]] = fir.load %[[STREAM]] : !fir.ref<i64>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
func.func @_QPp_alloc() {
@@ -255,4 +255,19 @@ func.func @_QMmod1Ppointer_source_global() {
// CHECK-LABE...
[truncated]
|
@llvm/pr-subscribers-flang-fir-hlfir Author: Valentin Clement (バレンタイン クレメン) (clementval) Changes: Add the pinned variable in the CUFAllocatableAllocate and CUFPointerAllocate signatures and update CUFOpConversion to pass the value when available. Patch is 21.62 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125310.diff 7 Files Affected:
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
index eaa1de76154d93..116b927a86a7a9 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
@@ -401,6 +401,13 @@ constexpr TypeBuilderFunc getModel<bool &>() {
};
}
template <>
+constexpr TypeBuilderFunc getModel<bool *>() {
+ return [](mlir::MLIRContext *context) -> mlir::Type {
+ TypeBuilderFunc f{getModel<bool>()};
+ return fir::ReferenceType::get(f(context));
+ };
+}
+template <>
constexpr TypeBuilderFunc getModel<unsigned short>() {
return [](mlir::MLIRContext *context) -> mlir::Type {
return mlir::IntegerType::get(
diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index 0a96f73b6be44b..822f2d4a2b297d 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -18,28 +18,30 @@ extern "C" {
/// Perform allocation of the descriptor.
int RTDECL(CUFAllocatableAllocate)(Descriptor &, int64_t stream = -1,
- bool hasStat = false, const Descriptor *errMsg = nullptr,
- const char *sourceFile = nullptr, int sourceLine = 0);
+ bool *pinned = nullptr, bool hasStat = false,
+ const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+ int sourceLine = 0);
/// Perform allocation of the descriptor with synchronization of it when
/// necessary.
int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t stream = -1,
- bool hasStat = false, const Descriptor *errMsg = nullptr,
- const char *sourceFile = nullptr, int sourceLine = 0);
+ bool *pinned = nullptr, bool hasStat = false,
+ const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+ int sourceLine = 0);
/// Perform allocation of the descriptor without synchronization. Assign data
/// from source.
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
- const Descriptor &source, int64_t stream = -1, bool hasStat = false,
- const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
- int sourceLine = 0);
+ const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+ bool hasStat = false, const Descriptor *errMsg = nullptr,
+ const char *sourceFile = nullptr, int sourceLine = 0);
/// Perform allocation of the descriptor with synchronization of it when
/// necessary. Assign data from source.
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
- const Descriptor &source, int64_t stream = -1, bool hasStat = false,
- const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
- int sourceLine = 0);
+ const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+ bool hasStat = false, const Descriptor *errMsg = nullptr,
+ const char *sourceFile = nullptr, int sourceLine = 0);
/// Perform deallocation of the descriptor with synchronization of it when
/// necessary.
diff --git a/flang/include/flang/Runtime/CUDA/pointer.h b/flang/include/flang/Runtime/CUDA/pointer.h
index 78c7a1a92b7ea9..7fbd8f8e061f20 100644
--- a/flang/include/flang/Runtime/CUDA/pointer.h
+++ b/flang/include/flang/Runtime/CUDA/pointer.h
@@ -18,28 +18,30 @@ extern "C" {
/// Perform allocation of the descriptor.
int RTDECL(CUFPointerAllocate)(Descriptor &, int64_t stream = -1,
- bool hasStat = false, const Descriptor *errMsg = nullptr,
- const char *sourceFile = nullptr, int sourceLine = 0);
+ bool *pinned = nullptr, bool hasStat = false,
+ const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+ int sourceLine = 0);
/// Perform allocation of the descriptor with synchronization of it when
/// necessary.
int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t stream = -1,
- bool hasStat = false, const Descriptor *errMsg = nullptr,
- const char *sourceFile = nullptr, int sourceLine = 0);
+ bool *pinned = nullptr, bool hasStat = false,
+ const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+ int sourceLine = 0);
/// Perform allocation of the descriptor without synchronization. Assign data
/// from source.
int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
- const Descriptor &source, int64_t stream = -1, bool hasStat = false,
- const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
- int sourceLine = 0);
+ const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+ bool hasStat = false, const Descriptor *errMsg = nullptr,
+ const char *sourceFile = nullptr, int sourceLine = 0);
/// Perform allocation of the descriptor with synchronization of it when
/// necessary. Assign data from source.
int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
- const Descriptor &source, int64_t stream = -1, bool hasStat = false,
- const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
- int sourceLine = 0);
+ const Descriptor &source, int64_t stream = -1, bool *pinned = nullptr,
+ bool hasStat = false, const Descriptor *errMsg = nullptr,
+ const char *sourceFile = nullptr, int sourceLine = 0);
} // extern "C"
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 77aa11f0603f69..549498f5585a68 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -103,7 +103,7 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
mlir::Value sourceLine;
if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>)
sourceLine = fir::factory::locationToLineNo(
- builder, loc, op.getSource() ? fTy.getInput(6) : fTy.getInput(5));
+ builder, loc, op.getSource() ? fTy.getInput(7) : fTy.getInput(6));
else
sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
@@ -119,22 +119,28 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
}
llvm::SmallVector<mlir::Value> args;
if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) {
+ mlir::Value pinned =
+ op.getPinned()
+ ? op.getPinned()
+ : builder.createNullConstant(
+ loc, fir::ReferenceType::get(
+ mlir::IntegerType::get(op.getContext(), 1)));
if (op.getSource()) {
mlir::Value stream =
op.getStream()
? op.getStream()
: builder.createIntegerConstant(loc, fTy.getInput(2), -1);
- args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
- op.getSource(), stream, hasStat,
- errmsg, sourceFile, sourceLine);
+ args = fir::runtime::createArguments(
+ builder, loc, fTy, op.getBox(), op.getSource(), stream, pinned,
+ hasStat, errmsg, sourceFile, sourceLine);
} else {
mlir::Value stream =
op.getStream()
? op.getStream()
: builder.createIntegerConstant(loc, fTy.getInput(1), -1);
args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
- stream, hasStat, errmsg, sourceFile,
- sourceLine);
+ stream, pinned, hasStat, errmsg,
+ sourceFile, sourceLine);
}
} else {
args =
@@ -153,11 +159,6 @@ struct CUFAllocateOpConversion
mlir::LogicalResult
matchAndRewrite(cuf::AllocateOp op,
mlir::PatternRewriter &rewriter) const override {
- // TODO: Pinned is a reference to a logical value that can be set to true
- // when pinned allocation succeed. This will require a new entry point.
- if (op.getPinned())
- return mlir::failure();
-
auto mod = op->getParentOfType<mlir::ModuleOp>();
fir::FirOpBuilder builder(rewriter, mod);
mlir::Location loc = op.getLoc();
diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
index 9be54e8906903d..6df3b06793b3e6 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -23,10 +23,10 @@ extern "C" {
RT_EXT_API_GROUP_BEGIN
int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream,
- bool hasStat, const Descriptor *errMsg, const char *sourceFile,
- int sourceLine) {
+ bool *pinned, bool hasStat, const Descriptor *errMsg,
+ const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFAllocatableAllocate)(
- desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
+ desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
#ifndef RT_DEVICE_COMPILATION
// Descriptor synchronization is only done when the allocation is done
// from the host.
@@ -41,8 +41,8 @@ int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t stream,
}
int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
- bool hasStat, const Descriptor *errMsg, const char *sourceFile,
- int sourceLine) {
+ bool *pinned, bool hasStat, const Descriptor *errMsg,
+ const char *sourceFile, int sourceLine) {
if (desc.HasAddendum()) {
Terminator terminator{sourceFile, sourceLine};
// TODO: This require a bit more work to set the correct type descriptor
@@ -53,14 +53,19 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
// Perform the standard allocation.
int stat{RTNAME(AllocatableAllocate)(
desc, hasStat, errMsg, sourceFile, sourceLine)};
+ if (pinned) {
+ // Set pinned according to stat. More infrastructure is needed to set it
+ // closer to the actual allocation call.
+ *pinned = (stat == StatOk);
+ }
return stat;
}
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
- const Descriptor &source, int64_t stream, bool hasStat,
+ const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFAllocatableAllocate)(
- alloc, stream, hasStat, errMsg, sourceFile, sourceLine)};
+ alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
Fortran::runtime::DoFromSourceAssign(
@@ -70,10 +75,10 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
}
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
- const Descriptor &source, int64_t stream, bool hasStat,
+ const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFAllocatableAllocateSync)(
- alloc, stream, hasStat, errMsg, sourceFile, sourceLine)};
+ alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
Fortran::runtime::DoFromSourceAssign(
diff --git a/flang/runtime/CUDA/pointer.cpp b/flang/runtime/CUDA/pointer.cpp
index 3252410bd8d2c2..d3ebe97b4e4acc 100644
--- a/flang/runtime/CUDA/pointer.cpp
+++ b/flang/runtime/CUDA/pointer.cpp
@@ -21,8 +21,9 @@ namespace Fortran::runtime::cuda {
extern "C" {
RT_EXT_API_GROUP_BEGIN
-int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool hasStat,
- const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool *pinned,
+ bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+ int sourceLine) {
if (desc.HasAddendum()) {
Terminator terminator{sourceFile, sourceLine};
// TODO: This require a bit more work to set the correct type descriptor
@@ -33,14 +34,19 @@ int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t stream, bool hasStat,
// Perform the standard allocation.
int stat{
RTNAME(PointerAllocate)(desc, hasStat, errMsg, sourceFile, sourceLine)};
+ if (pinned) {
+ // Set pinned according to stat. More infrastructure is needed to set it
+ // closer to the actual allocation call.
+ *pinned = (stat == StatOk);
+ }
return stat;
}
int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t stream,
- bool hasStat, const Descriptor *errMsg, const char *sourceFile,
- int sourceLine) {
+ bool *pinned, bool hasStat, const Descriptor *errMsg,
+ const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFPointerAllocate)(
- desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
+ desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
#ifndef RT_DEVICE_COMPILATION
// Descriptor synchronization is only done when the allocation is done
// from the host.
@@ -55,10 +61,10 @@ int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t stream,
}
int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
- const Descriptor &source, int64_t stream, bool hasStat,
+ const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFPointerAllocate)(
- pointer, stream, hasStat, errMsg, sourceFile, sourceLine)};
+ pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
Fortran::runtime::DoFromSourceAssign(
@@ -68,10 +74,10 @@ int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
}
int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
- const Descriptor &source, int64_t stream, bool hasStat,
+ const Descriptor &source, int64_t stream, bool *pinned, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFPointerAllocateSync)(
- pointer, stream, hasStat, errMsg, sourceFile, sourceLine)};
+ pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
Fortran::runtime::DoFromSourceAssign(
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index 08573110821cc2..095ad92d5deb50 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -19,7 +19,7 @@ func.func @_QPsub1() {
// CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
// CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -47,7 +47,7 @@ func.func @_QPsub3() {
// CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
// CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: fir.call @_FortranACUFAllocatableDeallocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -87,7 +87,7 @@ func.func @_QPsub5() {
}
// CHECK-LABEL: func.func @_QPsub5()
-// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -118,7 +118,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
// CHECK: %[[B:.*]]:2 = hlfir.declare %[[B_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdataEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
// CHECK: _FortranAAllocatableSetBounds
// CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
func.func @_QPallocate_source() {
@@ -142,7 +142,7 @@ func.func @_QPallocate_source() {
// CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
// CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
@@ -179,7 +179,7 @@ func.func @_QQallocate_stream() {
// CHECK: %[[STREAM_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
// CHECK: %[[STREAM:.*]] = fir.declare %[[STREAM_ALLOCA]] {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
// CHECK: %[[STREAM_LOAD:.*]] = fir.load %[[STREAM]] : !fir.ref<i64>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM_LOAD]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i64, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
func.func @_QPp_alloc() {
@@ -255,4 +255,19 @@ func.func @_QMmod1Ppointer_source_global() {
// CHECK-LABE...
[truncated]
|
Add the pinned variable in the CUFAllocatableAllocate and CUFPointerAllocate signatures and update CUFOpConversion to pass the value when available.
The pinned variable is currently set with the stat information. More runtime work is needed to set the variable close to the cudaMallocHost calls.