
Commit 4e40b71

[flang][cuda] Add specialized gpu.launch_func conversion (#113493)
1 parent 76bdc60 commit 4e40b71
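
The new cuf-gpu-convert-to-llvm pass rewrites gpu.launch_func operations produced for CUDA Fortran (CUF) kernel launches into calls to the _FortranACUFLaunchKernel runtime entry point, packing the kernel operands into an argument array. A minimal before/after sketch of the conversion (SSA value names are illustrative; the kernel symbol and runtime entry point follow the new test below):

// Before: launch of a CUF kernel with one device-pointer argument.
gpu.launch_func @cuda_device_mod::@_QMmod1Psub1
    blocks in (%gx, %gy, %gz) threads in (%bx, %by, %bz) : i64
    dynamic_shared_memory_size %shmem args(%devptr : !llvm.ptr)

// After: the kernel address is taken with llvm.mlir.addressof, the operands
// are packed into %args, and the launch becomes a runtime call.
%kernel = llvm.mlir.addressof @_QMmod1Psub1 : !llvm.ptr
llvm.call @_FortranACUFLaunchKernel(%kernel, %gx, %gy, %gz, %bx, %by, %bz,
    %shmem, %args, %null)
    : (!llvm.ptr, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr) -> ()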

9 files changed: +328, -1 lines changed

flang/include/flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
//===------- Optimizer/Transforms/CUFGPUToLLVMConversion.h ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef FORTRAN_OPTIMIZER_TRANSFORMS_CUFGPUTOLLVMCONVERSION_H_
#define FORTRAN_OPTIMIZER_TRANSFORMS_CUFGPUTOLLVMCONVERSION_H_

#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassRegistry.h"
#include "mlir/Transforms/DialectConversion.h"

namespace fir {
class LLVMTypeConverter;
}

namespace cuf {

void populateCUFGPUToLLVMConversionPatterns(
    const fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
    mlir::PatternBenefit benefit = 1);

} // namespace cuf

#endif // FORTRAN_OPTIMIZER_TRANSFORMS_CUFGPUTOLLVMCONVERSION_H_

flang/include/flang/Optimizer/Transforms/Passes.h

Lines changed: 1 addition & 0 deletions
@@ -41,6 +41,7 @@ namespace fir {
 #define GEN_PASS_DECL_CFGCONVERSION
 #define GEN_PASS_DECL_CUFADDCONSTRUCTOR
 #define GEN_PASS_DECL_CUFDEVICEGLOBAL
+#define GEN_PASS_DECL_CUFGPUTOLLVMCONVERSION
 #define GEN_PASS_DECL_CUFOPCONVERSION
 #define GEN_PASS_DECL_EXTERNALNAMECONVERSION
 #define GEN_PASS_DECL_MEMREFDATAFLOWOPT

flang/include/flang/Optimizer/Transforms/Passes.td

Lines changed: 7 additions & 0 deletions
@@ -443,4 +443,11 @@ def CUFAddConstructor : Pass<"cuf-add-constructor", "mlir::ModuleOp"> {
   ];
 }
 
+def CUFGPUToLLVMConversion : Pass<"cuf-gpu-convert-to-llvm", "mlir::ModuleOp"> {
+  let summary = "Convert some GPU operations lowered from CUF to runtime calls";
+  let dependentDialects = [
+    "mlir::LLVM::LLVMDialect"
+  ];
+}
+
 #endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES

flang/lib/Optimizer/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ add_flang_library(FIRTransforms
   CUFAddConstructor.cpp
   CUFDeviceGlobal.cpp
   CUFOpConversion.cpp
+  CUFGPUToLLVMConversion.cpp
   ArrayValueCopy.cpp
   ExternalNameConversion.cpp
   MemoryUtils.cpp
flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp

Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,180 @@
//===-- CUFGPUToLLVMConversion.cpp ----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h"
#include "flang/Common/Fortran.h"
#include "flang/Optimizer/CodeGen/TypeConverter.h"
#include "flang/Optimizer/Support/DataLayout.h"
#include "flang/Runtime/CUDA/common.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/FormatVariadic.h"

namespace fir {
#define GEN_PASS_DEF_CUFGPUTOLLVMCONVERSION
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir

using namespace fir;
using namespace mlir;
using namespace Fortran::runtime;

namespace {

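// Create a struct containing all the kernel operands and an array of
// pointers to each of its fields; the runtime launch entry point receives
// this array as the kernel argument list.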
static mlir::Value createKernelArgArray(mlir::Location loc,
                                        mlir::ValueRange operands,
                                        mlir::PatternRewriter &rewriter) {

  auto *ctx = rewriter.getContext();
  llvm::SmallVector<mlir::Type> structTypes(operands.size(), nullptr);

  for (auto [i, arg] : llvm::enumerate(operands))
    structTypes[i] = arg.getType();

  auto structTy = mlir::LLVM::LLVMStructType::getLiteral(ctx, structTypes);
  auto ptrTy = mlir::LLVM::LLVMPointerType::get(rewriter.getContext());
  mlir::Type i32Ty = rewriter.getI32Type();
  auto one = rewriter.create<mlir::LLVM::ConstantOp>(
      loc, i32Ty, rewriter.getIntegerAttr(i32Ty, 1));
  mlir::Value argStruct =
      rewriter.create<mlir::LLVM::AllocaOp>(loc, ptrTy, structTy, one);
  auto size = rewriter.create<mlir::LLVM::ConstantOp>(
      loc, i32Ty, rewriter.getIntegerAttr(i32Ty, structTypes.size()));
  mlir::Value argArray =
      rewriter.create<mlir::LLVM::AllocaOp>(loc, ptrTy, ptrTy, size);

  for (auto [i, arg] : llvm::enumerate(operands)) {
    auto indice = rewriter.create<mlir::LLVM::ConstantOp>(
        loc, i32Ty, rewriter.getIntegerAttr(i32Ty, i));
    mlir::Value structMember = rewriter.create<LLVM::GEPOp>(
        loc, ptrTy, structTy, argStruct, mlir::ArrayRef<mlir::Value>({indice}));
    rewriter.create<LLVM::StoreOp>(loc, arg, structMember);
    mlir::Value arrayMember = rewriter.create<LLVM::GEPOp>(
        loc, ptrTy, structTy, argArray, mlir::ArrayRef<mlir::Value>({indice}));
    rewriter.create<LLVM::StoreOp>(loc, structMember, arrayMember);
  }
  return argArray;
}

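/// Convert a gpu.launch_func operation into a call to the CUFLaunchKernel
/// runtime entry point, forwarding the kernel address, grid and block sizes,
/// dynamic shared-memory size, and the packed argument array. Launches that
/// carry a cluster size are not handled by this pattern.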
struct GPULaunchKernelConversion
    : public mlir::ConvertOpToLLVMPattern<mlir::gpu::LaunchFuncOp> {
  explicit GPULaunchKernelConversion(
      const fir::LLVMTypeConverter &typeConverter, mlir::PatternBenefit benefit)
      : mlir::ConvertOpToLLVMPattern<mlir::gpu::LaunchFuncOp>(typeConverter,
                                                              benefit) {}

  using OpAdaptor = typename mlir::gpu::LaunchFuncOp::Adaptor;

  mlir::LogicalResult
  matchAndRewrite(mlir::gpu::LaunchFuncOp op, OpAdaptor adaptor,
                  mlir::ConversionPatternRewriter &rewriter) const override {

    if (op.hasClusterSize()) {
      return mlir::failure();
    }

    mlir::Location loc = op.getLoc();
    auto *ctx = rewriter.getContext();
    mlir::ModuleOp mod = op->getParentOfType<mlir::ModuleOp>();
    mlir::Value dynamicMemorySize = op.getDynamicSharedMemorySize();
    mlir::Type i32Ty = rewriter.getI32Type();
    if (!dynamicMemorySize)
      dynamicMemorySize = rewriter.create<mlir::LLVM::ConstantOp>(
          loc, i32Ty, rewriter.getIntegerAttr(i32Ty, 0));

    mlir::Value kernelArgs =
        createKernelArgArray(loc, adaptor.getKernelOperands(), rewriter);

    auto ptrTy = mlir::LLVM::LLVMPointerType::get(rewriter.getContext());
    auto kernel = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(op.getKernelName());
    mlir::Value kernelPtr;
    if (!kernel) {
      auto funcOp = mod.lookupSymbol<mlir::func::FuncOp>(op.getKernelName());
      if (!funcOp)
        return mlir::failure();
      kernelPtr =
          rewriter.create<LLVM::AddressOfOp>(loc, ptrTy, funcOp.getName());
    } else {
      kernelPtr =
          rewriter.create<LLVM::AddressOfOp>(loc, ptrTy, kernel.getName());
    }

    auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
        RTNAME_STRING(CUFLaunchKernel));

    auto llvmIntPtrType = mlir::IntegerType::get(
        ctx, this->getTypeConverter()->getPointerBitwidth(0));
    auto voidTy = mlir::LLVM::LLVMVoidType::get(ctx);
    auto funcTy = mlir::LLVM::LLVMFunctionType::get(
        voidTy,
        {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
         llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
        /*isVarArg=*/false);

    auto cufLaunchKernel = mlir::SymbolRefAttr::get(
        mod.getContext(), RTNAME_STRING(CUFLaunchKernel));
    if (!funcOp) {
      mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
      rewriter.setInsertionPointToStart(mod.getBody());
      auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
          loc, RTNAME_STRING(CUFLaunchKernel), funcTy);
      launchKernelFuncOp.setVisibility(mlir::SymbolTable::Visibility::Private);
    }

    mlir::Value nullPtr = rewriter.create<LLVM::ZeroOp>(loc, ptrTy);

    rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
        op, funcTy, cufLaunchKernel,
        mlir::ValueRange{kernelPtr, adaptor.getGridSizeX(),
                         adaptor.getGridSizeY(), adaptor.getGridSizeZ(),
                         adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
                         adaptor.getBlockSizeZ(), dynamicMemorySize, kernelArgs,
                         nullPtr});

    return mlir::success();
  }
};

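/// Pass driver: marks gpu.launch_func as illegal and the LLVM dialect as
/// legal, then applies the conversion pattern above to the module.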
class CUFGPUToLLVMConversion
    : public fir::impl::CUFGPUToLLVMConversionBase<CUFGPUToLLVMConversion> {
public:
  void runOnOperation() override {
    auto *ctx = &getContext();
    mlir::RewritePatternSet patterns(ctx);
    mlir::ConversionTarget target(*ctx);

    mlir::Operation *op = getOperation();
    mlir::ModuleOp module = mlir::dyn_cast<mlir::ModuleOp>(op);
    if (!module)
      return signalPassFailure();

    std::optional<mlir::DataLayout> dl =
        fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false);
    fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
                                         /*forceUnifiedTBAATree=*/false, *dl);
    cuf::populateCUFGPUToLLVMConversionPatterns(typeConverter, patterns);
    target.addIllegalOp<mlir::gpu::LaunchFuncOp>();
    target.addLegalDialect<mlir::LLVM::LLVMDialect>();
    if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
                                                  std::move(patterns)))) {
      mlir::emitError(mlir::UnknownLoc::get(ctx),
                      "error in CUF GPU op conversion\n");
      signalPassFailure();
    }
  }
};
} // namespace

void cuf::populateCUFGPUToLLVMConversionPatterns(
    const fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
    mlir::PatternBenefit benefit) {
  patterns.add<GPULaunchKernelConversion>(converter, benefit);
}
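
When the module does not already declare the runtime entry point, the pattern inserts a private llvm.func declaration with the function type built above. On a target with 64-bit pointers this corresponds roughly to the following declaration (a sketch; the integer widths follow the data layout's pointer bitwidth rather than being fixed):

llvm.func @_FortranACUFLaunchKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64,
    i32, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}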

flang/lib/Optimizer/Transforms/CUFOpConversion.cpp

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@
 #include "flang/Runtime/CUDA/descriptor.h"
 #include "flang/Runtime/CUDA/memory.h"
 #include "flang/Runtime/allocatable.h"
+#include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"

flang/runtime/CUDA/registration.cpp

Lines changed: 5 additions & 1 deletion
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Runtime/CUDA/registration.h"
+#include "../terminator.h"
+#include "flang/Runtime/CUDA/common.h"
 
 #include "cuda_runtime.h"

@@ -31,5 +33,7 @@ void RTDEF(CUFRegisterFunction)(
   __cudaRegisterFunction(module, fctSym, fctName, fctName, -1, (uint3 *)0,
       (uint3 *)0, (dim3 *)0, (dim3 *)0, (int *)0);
 }
-}
+
+} // extern "C"
+
 } // namespace Fortran::runtime::cuda
Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
// RUN: fir-opt --cuf-gpu-convert-to-llvm %s | FileCheck %s

module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 ([email protected]:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
  llvm.func @_QMmod1Phost_sub() {
    %0 = llvm.mlir.constant(1 : i32) : i32
    %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
    %2 = llvm.mlir.constant(40 : i64) : i64
    %3 = llvm.mlir.constant(16 : i32) : i32
    %4 = llvm.mlir.constant(25 : i32) : i32
    %5 = llvm.mlir.constant(21 : i32) : i32
    %6 = llvm.mlir.constant(17 : i32) : i32
    %7 = llvm.mlir.constant(1 : index) : i64
    %8 = llvm.mlir.constant(27 : i32) : i32
    %9 = llvm.mlir.constant(6 : i32) : i32
    %10 = llvm.mlir.constant(1 : i32) : i32
    %11 = llvm.mlir.constant(0 : i32) : i32
    %12 = llvm.mlir.constant(10 : index) : i64
    %13 = llvm.mlir.addressof @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 : !llvm.ptr
    %14 = llvm.call @_FortranACUFMemAlloc(%2, %11, %13, %6) : (i64, i32, !llvm.ptr, i32) -> !llvm.ptr
    %15 = llvm.mlir.constant(10 : index) : i64
    %16 = llvm.mlir.constant(1 : index) : i64
    %17 = llvm.alloca %15 x i32 : (i64) -> !llvm.ptr
    %18 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %19 = llvm.insertvalue %17, %18[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %20 = llvm.insertvalue %17, %19[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %21 = llvm.mlir.constant(0 : index) : i64
    %22 = llvm.insertvalue %21, %20[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %23 = llvm.insertvalue %15, %22[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %24 = llvm.insertvalue %16, %23[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %25 = llvm.extractvalue %24[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %26 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %27 = llvm.insertvalue %25, %26[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %28 = llvm.insertvalue %25, %27[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %29 = llvm.mlir.constant(0 : index) : i64
    %30 = llvm.insertvalue %29, %28[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %31 = llvm.mlir.constant(10 : index) : i64
    %32 = llvm.insertvalue %31, %30[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %33 = llvm.mlir.constant(1 : index) : i64
    %34 = llvm.insertvalue %33, %32[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %35 = llvm.mlir.constant(1 : index) : i64
    %36 = llvm.mlir.constant(11 : index) : i64
    %37 = llvm.mlir.constant(1 : index) : i64
    llvm.br ^bb1(%35 : i64)
  ^bb1(%38: i64): // 2 preds: ^bb0, ^bb2
    %39 = llvm.icmp "slt" %38, %36 : i64
    llvm.cond_br %39, ^bb2, ^bb3
  ^bb2: // pred: ^bb1
    %40 = llvm.mlir.constant(-1 : index) : i64
    %41 = llvm.add %38, %40 : i64
    %42 = llvm.extractvalue %34[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %43 = llvm.getelementptr %42[%41] : (!llvm.ptr, i64) -> !llvm.ptr, i32
    llvm.store %11, %43 : i32, !llvm.ptr
    %44 = llvm.add %38, %37 : i64
    llvm.br ^bb1(%44 : i64)
  ^bb3: // pred: ^bb1
    %45 = llvm.call @_FortranACUFDataTransferPtrPtr(%14, %25, %2, %11, %13, %5) : (!llvm.ptr, !llvm.ptr, i64, i32, !llvm.ptr, i32) -> !llvm.struct<()>
    gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr)
    %46 = llvm.call @_FortranACUFDataTransferPtrPtr(%25, %14, %2, %10, %13, %4) : (!llvm.ptr, !llvm.ptr, i64, i32, !llvm.ptr, i32) -> !llvm.struct<()>
    %47 = llvm.call @_FortranAioBeginExternalListOutput(%9, %13, %8) {fastmathFlags = #llvm.fastmath<contract>} : (i32, !llvm.ptr, i32) -> !llvm.ptr
    %48 = llvm.mlir.constant(9 : i32) : i32
    %49 = llvm.mlir.zero : !llvm.ptr
    %50 = llvm.getelementptr %49[1] : (!llvm.ptr) -> !llvm.ptr, i32
    %51 = llvm.ptrtoint %50 : !llvm.ptr to i64
    %52 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
    %53 = llvm.insertvalue %51, %52[1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
    %54 = llvm.mlir.constant(20240719 : i32) : i32
    %55 = llvm.insertvalue %54, %53[2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
    %56 = llvm.mlir.constant(1 : i32) : i32
    %57 = llvm.trunc %56 : i32 to i8
    %58 = llvm.insertvalue %57, %55[3] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
    %59 = llvm.trunc %48 : i32 to i8
    %60 = llvm.insertvalue %59, %58[4] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
    %61 = llvm.mlir.constant(0 : i32) : i32
    %62 = llvm.trunc %61 : i32 to i8
    %63 = llvm.insertvalue %62, %60[5] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
    %64 = llvm.mlir.constant(0 : i32) : i32
    %65 = llvm.trunc %64 : i32 to i8
    %66 = llvm.insertvalue %65, %63[6] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
    %67 = llvm.mlir.constant(0 : i64) : i64
    %68 = llvm.mlir.constant(1 : i64) : i64
    %69 = llvm.insertvalue %68, %66[7, 0, 0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
    %70 = llvm.insertvalue %12, %69[7, 0, 1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
    %71 = llvm.insertvalue %51, %70[7, 0, 2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
    %72 = llvm.mul %51, %12 : i64
    %73 = llvm.insertvalue %25, %71[0] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
    llvm.store %73, %1 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>, !llvm.ptr
    llvm.return
  }
  llvm.func @_QMmod1Psub1(!llvm.ptr) -> ()
  llvm.mlir.global linkonce constant @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5() {addr_space = 0 : i32} : !llvm.array<2 x i8> {
    %0 = llvm.mlir.constant("a\00") : !llvm.array<2 x i8>
    llvm.return %0 : !llvm.array<2 x i8>
  }
  llvm.func @_FortranAioBeginExternalListOutput(i32, !llvm.ptr, i32) -> !llvm.ptr attributes {fir.io, fir.runtime, sym_visibility = "private"}
  llvm.func @_FortranACUFMemAlloc(i64, i32, !llvm.ptr, i32) -> !llvm.ptr attributes {fir.runtime, sym_visibility = "private"}
  llvm.func @_FortranACUFDataTransferPtrPtr(!llvm.ptr, !llvm.ptr, i64, i32, !llvm.ptr, i32) -> !llvm.struct<()> attributes {fir.runtime, sym_visibility = "private"}
  llvm.func @_FortranACUFMemFree(!llvm.ptr, i32, !llvm.ptr, i32) -> !llvm.struct<()> attributes {fir.runtime, sym_visibility = "private"}
  gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]
}

// CHECK-LABEL: _QMmod1Phost_sub

// CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1 : !llvm.ptr
// CHECK: llvm.call @_FortranACUFLaunchKernel(%[[KERNEL_PTR]], {{.*}})

flang/tools/fir-opt/fir-opt.cpp

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ int main(int argc, char **argv) {
   DialectRegistry registry;
   fir::support::registerDialects(registry);
   registry.insert<mlir::gpu::GPUDialect>();
+  registry.insert<mlir::NVVM::NVVMDialect>();
   fir::support::addFIRExtensions(registry);
   return failed(MlirOptMain(argc, argv, "FIR modular optimizer driver\n",
                             registry));
