diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 53fe9c0d2f6f0..e99e079bf18ea 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -3359,8 +3359,8 @@ void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef args) { assert(args.size() == 2); mlir::Value barrier = convertPtrToNVVMSpace( builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); - mlir::NVVM::MBarrierInitSharedOp::create(builder, loc, barrier, - fir::getBase(args[1]), {}); + mlir::NVVM::MBarrierInitOp::create(builder, loc, barrier, + fir::getBase(args[1]), {}); auto kind = mlir::NVVM::ProxyKindAttr::get( builder.getContext(), mlir::NVVM::ProxyKind::async_shared); auto space = mlir::NVVM::SharedSpaceAttr::get( diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 8f355217899b3..2a936a609895d 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -431,7 +431,7 @@ end subroutine ! CHECK: %[[COUNT:.*]] = arith.constant 256 : i32 ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> -! CHECK: nvvm.mbarrier.init.shared %[[SHARED_PTR]], %[[COUNT]] : !llvm.ptr<3>, i32 +! CHECK: nvvm.mbarrier.init %[[SHARED_PTR]], %[[COUNT]] : !llvm.ptr<3>, i32 ! CHECK: nvvm.fence.proxy {kind = #nvvm.proxy_kind, space = #nvvm.shared_space} ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref) -> !llvm.ptr diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 4f483859ac18d..b572ef9c1d07b 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -579,7 +579,8 @@ def NVVM_PMEventOp : NVVM_PTXBuilder_Op<"pmevent">, /// mbarrier.init instruction with generic pointer type def NVVM_MBarrierInitOp : NVVM_PTXBuilder_Op<"mbarrier.init">, - Arguments<(ins LLVM_AnyPointer:$addr, I32:$count, PtxPredicate:$predicate)> { + Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr, + I32:$count, PtxPredicate:$predicate)> { let summary = "MBarrier Initialization Op"; let description = [{ The `nvvm.mbarrier.init` operation initializes an *mbarrier object* at the specified @@ -592,48 +593,35 @@ def NVVM_MBarrierInitOp : NVVM_PTXBuilder_Op<"mbarrier.init">, - Transaction count (tx-count): 0 The operation takes the following operands: - - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic - addressing, but the address must still be in the shared memory space. + - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr` + must be a pointer to generic or shared::cta memory. When it is generic, the + underlying address must be within the shared::cta memory space; otherwise + the behavior is undefined. - `count`: Integer specifying the number of threads that will participate in barrier synchronization. Must be in the range [1, 2²⁰ - 1]. - `predicate`: Optional predicate for conditional execution. [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init) }]; - string llvmBuilder = [{ - createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_init, {$addr, $count}); - }]; let assemblyFormat = "$addr `,` $count (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)"; + let extraClassDeclaration = [{ bool hasIntrinsic() { if(getPredicate()) return false; return true; } - }]; - let extraClassDefinition = [{ - std::string $cppClass::getPtx() { return std::string("mbarrier.init.b64 [%0], %1;"); } - }]; -} -/// mbarrier.init instruction with shared pointer type -def NVVM_MBarrierInitSharedOp : NVVM_PTXBuilder_Op<"mbarrier.init.shared", [NVVMRequiresSM<80>, DeclareOpInterfaceMethods]>, - Arguments<(ins LLVM_PointerShared:$addr, I32:$count, PtxPredicate:$predicate)> { - let summary = "Shared MBarrier Initialization Op"; - let description = [{ - This Op is the same as `nvvm.mbarrier.init` except that the *mbarrier object* - should be accessed using a shared-memory pointer instead of a generic-memory pointer. - - [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init) + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); }]; + string llvmBuilder = [{ - createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_init_shared, {$addr, $count}); - }]; - let assemblyFormat = "$addr `,` $count (`,` `predicate` `=` $predicate^)? attr-dict `:` type(operands)"; - let extraClassDeclaration = "bool hasIntrinsic() { return !getPredicate(); }"; - let extraClassDefinition = [{ - std::string $cppClass::getPtx() { return std::string("mbarrier.init.shared.b64 [%0], %1;"); } + auto [id, args] = NVVM::MBarrierInitOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, id, args); }]; } def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">, - Arguments<(ins LLVM_AnyPointer:$addr)> { + Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr)> { let summary = "MBarrier Invalidation Operation"; let description = [{ The `nvvm.mbarrier.inval` operation invalidates an *mbarrier object* at the @@ -644,30 +632,27 @@ def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">, It is undefined behavior if the *mbarrier object* is already invalid. The operation takes the following operand: - - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic - addressing, but the address must still be in the shared memory space. + - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr` + must be a pointer to generic or shared::cta memory. When it is generic, the + underlying address must be within the shared::cta memory space; otherwise + the behavior is undefined. [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval) }]; - string llvmBuilder = [{ - createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_inval, {$addr}); - }]; - let assemblyFormat = "$addr attr-dict `:` type(operands)"; -} -def NVVM_MBarrierInvalSharedOp : NVVM_Op<"mbarrier.inval.shared">, - Arguments<(ins LLVM_PointerShared:$addr)> { - let summary = "Shared MBarrier Invalidation Operation"; - let description = [{ - This Op is the same as `nvvm.mbarrier.inval` except that the *mbarrier object* - should be accessed using a shared-memory pointer instead of a generic-memory pointer. + let assemblyFormat = "$addr attr-dict `:` type(operands)"; - [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval) + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); }]; + string llvmBuilder = [{ - createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_inval_shared, {$addr}); + auto [id, args] = NVVM::MBarrierInvalOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, id, args); }]; - let assemblyFormat = "$addr attr-dict `:` type(operands)"; } def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">, diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp index a9efada28a320..ec182f1db48ac 100644 --- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp +++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -846,13 +846,8 @@ struct NVGPUMBarrierInitLowering Value barrier = getMbarrierPtr(b, mbarrierType, adaptor.getBarriers(), adaptor.getMbarId(), rewriter); Value count = truncToI32(b, adaptor.getCount()); - if (isMbarrierShared(mbarrierType)) { - rewriter.replaceOpWithNewOp( - op, barrier, count, adaptor.getPredicate()); - } else { - rewriter.replaceOpWithNewOp(op, barrier, count, - adaptor.getPredicate()); - } + rewriter.replaceOpWithNewOp(op, barrier, count, + adaptor.getPredicate()); return success(); } }; diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index f0de4dbcc1d4b..53a6f43c0bbcf 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -1607,10 +1607,53 @@ void Tcgen05MmaSmemDescOp::createSmemDescriptor(Operation &op, mt.mapValue(thisOp.getRes()) = smemDesc; } +//===----------------------------------------------------------------------===// +// getPtx methods +//===----------------------------------------------------------------------===// + +std::string NVVM::MBarrierInitOp::getPtx() { + unsigned addressSpace = + llvm::cast(getAddr().getType()).getAddressSpace(); + return (addressSpace == NVVMMemorySpace::Shared) + ? std::string("mbarrier.init.shared.b64 [%0], %1;") + : std::string("mbarrier.init.b64 [%0], %1;"); +} + //===----------------------------------------------------------------------===// // getIntrinsicID/getIntrinsicIDAndArgs methods //===----------------------------------------------------------------------===// +mlir::NVVM::IDArgPair MBarrierInitOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast(op); + unsigned addressSpace = + llvm::cast(thisOp.getAddr().getType()) + .getAddressSpace(); + llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared) + ? llvm::Intrinsic::nvvm_mbarrier_init_shared + : llvm::Intrinsic::nvvm_mbarrier_init; + + // Fill the Intrinsic Args + llvm::SmallVector args; + args.push_back(mt.lookupValue(thisOp.getAddr())); + args.push_back(mt.lookupValue(thisOp.getCount())); + + return {id, std::move(args)}; +} + +mlir::NVVM::IDArgPair MBarrierInvalOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast(op); + unsigned addressSpace = + llvm::cast(thisOp.getAddr().getType()) + .getAddressSpace(); + llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared) + ? llvm::Intrinsic::nvvm_mbarrier_inval_shared + : llvm::Intrinsic::nvvm_mbarrier_inval; + + return {id, {mt.lookupValue(thisOp.getAddr())}}; +} + #define CP_ASYNC_ID_IMPL(mod, size, suffix) \ llvm::Intrinsic::nvvm_cp_async_##mod##_shared_global_##size##suffix diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index 5755ca9258283..8cce6308018e2 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -486,7 +486,7 @@ func.func @mbarrier() { // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.init.shared %[[barPtr]] + // CHECK: nvvm.mbarrier.init %[[barPtr]] nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> @@ -516,7 +516,7 @@ func.func @mbarrier_nocomplete() { // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.init.shared %[[barPtr]] + // CHECK: nvvm.mbarrier.init %[[barPtr]] nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> @@ -592,7 +592,7 @@ func.func @mbarrier_txcount() { // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.init.shared %[[barPtr]] + // CHECK: nvvm.mbarrier.init %[[barPtr]] nvgpu.mbarrier.init %barrier[%c0], %num_threads : !barrierType %tidxreg = nvvm.read.ptx.sreg.tid.x : i32 @@ -643,7 +643,7 @@ func.func @mbarrier_txcount_pred() { // CHECK: %[[barStr:.+]] = builtin.unrealized_conversion_cast %[[barMemref]] : memref<1xi64, 3> to !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[base:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr:.+]] = llvm.getelementptr %[[base]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.init.shared %[[barPtr]], {{.*}}, predicate = %[[P]] + // CHECK: nvvm.mbarrier.init %[[barPtr]], {{.*}}, predicate = %[[P]] nvgpu.mbarrier.init %barrier[%c0], %mine, predicate = %pred : !barrierType %txcount = arith.constant 256 : index diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index 6960e83be3573..fbc4c0af60360 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -8,7 +8,7 @@ // CHECK-LABEL: @init_mbarrier llvm.func @init_mbarrier(%barrier_gen : !llvm.ptr, %barrier : !llvm.ptr<3>, %count : i32, %pred : i1) { //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.init.shared.b64 [$0], $1;", "r,r,b" - nvvm.mbarrier.init.shared %barrier, %count, predicate = %pred : !llvm.ptr<3>, i32, i1 + nvvm.mbarrier.init %barrier, %count, predicate = %pred : !llvm.ptr<3>, i32, i1 //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.init.b64 [$0], $1;", "l,r,b" nvvm.mbarrier.init %barrier_gen, %count, predicate = %pred : !llvm.ptr, i32, i1 llvm.return diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir index 0243f5eb8c862..2505e56407c2b 100644 --- a/mlir/test/Dialect/LLVMIR/nvvm.mlir +++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir @@ -419,8 +419,8 @@ llvm.func private @mbarrier_init_generic(%barrier: !llvm.ptr) { llvm.func private @mbarrier_init_shared(%barrier: !llvm.ptr<3>) { %count = nvvm.read.ptx.sreg.ntid.x : i32 - // CHECK: nvvm.mbarrier.init.shared %{{.*}}, %{{.*}} : !llvm.ptr<3>, i32 - nvvm.mbarrier.init.shared %barrier, %count : !llvm.ptr<3>, i32 + // CHECK: nvvm.mbarrier.init %{{.*}}, %{{.*}} : !llvm.ptr<3>, i32 + nvvm.mbarrier.init %barrier, %count : !llvm.ptr<3>, i32 llvm.return } @@ -433,8 +433,8 @@ llvm.func private @mbarrier_inval_generic(%barrier: !llvm.ptr) { llvm.func private @mbarrier_inval_shared(%barrier: !llvm.ptr<3>) { - // CHECK: nvvm.mbarrier.inval.shared %{{.*}} : !llvm.ptr<3> - nvvm.mbarrier.inval.shared %barrier : !llvm.ptr<3> + // CHECK: nvvm.mbarrier.inval %{{.*}} : !llvm.ptr<3> + nvvm.mbarrier.inval %barrier : !llvm.ptr<3> llvm.return }