diff --git a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
index e0f4c71051e50..46f29c6dd8b92 100644
--- a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
+++ b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
@@ -16,7 +16,9 @@ namespace mlir {
 class LLVMTypeConverter;
 class ConversionTarget;
 class RewritePatternSet;
-class Pass;
+
+template <typename OpT>
+class OperationPass;
 
 namespace gpu {
 class GPUModuleOp;
@@ -43,6 +45,14 @@ void populateGpuSubgroupReduceOpLoweringPattern(LLVMTypeConverter &converter,
 /// Collect a set of patterns to convert WMMA ops from GPU dialect to NVVM.
 void populateGpuWMMAToNVVMConversionPatterns(LLVMTypeConverter &converter,
                                              RewritePatternSet &patterns);
+
+/// Creates a pass that lowers GPU dialect operations to NVVM counterparts. The
+/// index bitwidth used for the lowering of the device side index computations
+/// is configurable.
+std::unique_ptr<OperationPass<gpu::GPUModuleOp>> createLowerGpuOpsToNVVMOpsPass(
+    unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
+    bool hasRedux = false);
+
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_GPUTONVVM_GPUTONVVMPASS_H_
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 3218760931b8c..ed37abf85275b 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -486,6 +486,7 @@ def LowerHostCodeToLLVMPass : Pass<"lower-host-to-llvm", "ModuleOp"> {
 
 def ConvertGpuOpsToNVVMOps : Pass<"convert-gpu-to-nvvm", "gpu::GPUModuleOp"> {
   let summary = "Generate NVVM operations for gpu operations";
+  let constructor = "mlir::createLowerGpuOpsToNVVMOpsPass()";
   let dependentDialects = [
     "cf::ControlFlowDialect",
     "memref::MemRefDialect",
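With the factory re-exposed in the header above and wired up as the pass
constructor in Passes.td, downstream pipelines can once again build this pass
programmatically rather than through the textual pass registry. A minimal
sketch of such usage (hypothetical snippet; it assumes an mlir::PassManager
`pm` rooted at builtin.module and uses only the entry points declared in this
patch):

    // Nest the GPU-to-NVVM lowering under gpu.module, requesting 32-bit
    // device-side index computations and no redux lowering.
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(
        mlir::createLowerGpuOpsToNVVMOpsPass(/*indexBitwidth=*/32,
                                             /*hasRedux=*/false));
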
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 764b6a779b98c..06469dc82b3fc 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -210,7 +210,11 @@ struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
 /// code.
 struct LowerGpuOpsToNVVMOpsPass
     : public impl::ConvertGpuOpsToNVVMOpsBase<LowerGpuOpsToNVVMOpsPass> {
-  using Base::Base;
+  LowerGpuOpsToNVVMOpsPass() = default;
+  LowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth, bool hasRedux = false) {
+    this->indexBitwidth = indexBitwidth;
+    this->hasRedux = hasRedux;
+  }
 
   void runOnOperation() override {
     gpu::GPUModuleOp m = getOperation();
@@ -374,3 +378,8 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
                                     "__nv_tanh");
   populateOpPatterns<math::TanOp>(converter, patterns, "__nv_tanf", "__nv_tan");
 }
+
+std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
+mlir::createLowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth, bool hasRedux) {
+  return std::make_unique<LowerGpuOpsToNVVMOpsPass>(indexBitwidth, hasRedux);
+}
diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
index 24c4c4c43a93d..a7fd5a25e6831 100644
--- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
@@ -52,7 +52,7 @@ void mlir::sparse_tensor::buildSparseCompiler(
     pm.addPass(createSparseGPUCodegenPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createStripDebugInfoPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
-    pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps());
+    pm.addNestedPass<gpu::GPUModuleOp>(createLowerGpuOpsToNVVMOpsPass());
   }
 
 // TODO(springerm): Add sparse support to the BufferDeallocation pass and add
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
index 0cb06b7bf1d20..2c1ae3ee840d0 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
@@ -1,5 +1,6 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm -debug-only=serialize-to-isa \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{dump-ptx}))' \
 // RUN: 2>&1 | FileCheck %s
 
 // CHECK: Generated by LLVM NVPTX Back-End
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
index 80972f244ec02..8eb90fd3ca994 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
@@ -2,9 +2,10 @@
 // NOTE: this test requires gpu-sm80
 //
 // RUN: mlir-opt \
-// RUN:   --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse)" \
+// RUN:   --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse,gpu.module(gpu-to-cubin{chip=sm_80 features=+ptx71}))" \
 // RUN: %s \
-// RUN: | mlir-opt --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71" \
+// RUN: | mlir-opt --convert-vector-to-scf --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
+// RUN:   --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_c_runner_utils \
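The test updates that follow all apply the same mechanical change: the bundled
-test-lower-to-nvvm invocation is expanded back into its explicit stages
(kernel outlining, per-gpu.module lowering to NVVM plus cubin serialization,
then host-side lowering). A rough C++ equivalent of that common recipe, as a
sketch only (gpu-to-cubin is elided because it is only registered in
CUDA-enabled builds; `pm` is assumed to be an mlir::PassManager on
builtin.module):

    // Stage 1: outline gpu.launch bodies into gpu.module/gpu.func.
    pm.addPass(mlir::createGpuKernelOutliningPass());
    // Stage 2: per gpu.module, strip debug info and lower GPU ops to NVVM.
    mlir::OpPassManager &gpuPm = pm.nest<mlir::gpu::GPUModuleOp>();
    gpuPm.addPass(mlir::createStripDebugInfoPass());
    gpuPm.addPass(mlir::createLowerGpuOpsToNVVMOpsPass());
    // (A cubin serialization pass would be appended here on CUDA builds.)
    // Stage 3: lower host-side GPU ops (gpu.launch_func etc.) to LLVM.
    pm.addPass(mlir::createGpuToLLVMConversionPass());
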
diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
index 8c991493a2b01..8571c5ca5f3dc 100644
--- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
+++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
@@ -1,7 +1,9 @@
 // RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" -canonicalize |\
 // RUN: mlir-opt -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if |\
 // RUN: mlir-opt -lower-affine -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm \
-// RUN:   -convert-arith-to-llvm -test-lower-to-nvvm | \
+// RUN:   -convert-arith-to-llvm -gpu-kernel-outlining |\
+// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
+// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_cuda_runtime \
 // RUN:   -shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
index f26c18c4ae3dd..c671c1843862f 100644
--- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
+++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
@@ -2,7 +2,9 @@
 // everything on the same thread.
 // RUN: mlir-opt %s -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
 // RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN:   -test-lower-to-nvvm | \
+// RUN:   -gpu-kernel-outlining |\
+// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
+// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_cuda_runtime \
 // RUN:   -shared-libs=%mlir_c_runner_utils \
@@ -13,7 +15,9 @@
 // RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write" \
 // RUN:   -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
 // RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN:   -test-lower-to-nvvm | \
+// RUN:   -gpu-kernel-outlining |\
+// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
+// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_cuda_runtime \
 // RUN:   -shared-libs=%mlir_c_runner_utils \
@@ -23,7 +27,9 @@
 // RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" \
 // RUN:   -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
 // RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN:   -test-lower-to-nvvm | \
+// RUN:   -gpu-kernel-outlining |\
+// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
+// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_cuda_runtime \
 // RUN:   -shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
index 591bf1b4fd182..535ba52d66f00 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{chip=sm_70}))' \
+// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
index 51bd23f817b33..c4ca46521eeb4 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
@@ -3,7 +3,9 @@
 // Similar to the wmma-matmul-f32 but but with the memref bare pointer lowering convention.
 // This test also uses gpu.memcpy operations (instead of gpu.host_register).
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70" \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm{use-bare-ptr-memref-call-conv=1},gpu-to-cubin{chip=sm_70}))' \
+// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm="use-bare-pointers-for-host=1 use-bare-pointers-for-kernels=1" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --entry-point-result=void \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
index 0307b3d504be9..ae410dce281b1 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{chip=sm_70}))' \
+// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
index b131b8682ddee..f4324a14a36b6 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
@@ -8,7 +10,9 @@
 
 // Same as above but with the memref bare pointer lowering convention.
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm="kernel-bare-ptr-calling-convention=1" \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm{use-bare-ptr-memref-call-conv=1},gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm="use-bare-pointers-for-kernels=1" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
index 155423db7e050..0a8d38f145279 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
index e5047b6efa3bf..bcd785d35291c 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
index 163e9fdba60c1..aa4b0e8820479 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
index 381db2639c371..2e7d046c39214 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
index 23c6c117e67f3..32cfa27c8988a 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
index 3c5a100b5b90d..30767b9495b6f 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/async.mlir b/mlir/test/Integration/GPU/CUDA/async.mlir
index d2a5127a34c3b..e6dd91ace9743 100644
--- a/mlir/test/Integration/GPU/CUDA/async.mlir
+++ b/mlir/test/Integration/GPU/CUDA/async.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)' \
-// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm \
 // RUN: | mlir-opt -async-to-async-runtime -async-runtime-ref-counting \
 // RUN: | mlir-opt -convert-async-to-llvm -convert-func-to-llvm \
 // RUN: | mlir-cpu-runner \
diff --git a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
index a5d04f7322b49..afcb674858c86 100644
--- a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
+++ b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
@@ -1,5 +1,8 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
index 7657bf4732d32..444e2877c822c 100644
--- a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
+++ b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/printf.mlir b/mlir/test/Integration/GPU/CUDA/printf.mlir
index 1a35d1e78b094..fce773974d5ba 100644
--- a/mlir/test/Integration/GPU/CUDA/printf.mlir
+++ b/mlir/test/Integration/GPU/CUDA/printf.mlir
@@ -1,5 +1,6 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/shuffle.mlir b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
index 40fcea857d5b4..6a784ca32f9ef 100644
--- a/mlir/test/Integration/GPU/CUDA/shuffle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/two-modules.mlir b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
index 5a9acdf3d8da6..5f6e5d75aff5b 100644
--- a/mlir/test/Integration/GPU/CUDA/two-modules.mlir
+++ b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
@@ -1,5 +1,7 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
index 99e19dae0d72b..5db6f56fb4b38 100644
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -65,7 +65,7 @@ struct TestLowerToNVVMOptions
       llvm::cl::init("nvptx64-nvidia-cuda")};
   PassOptions::Option<std::string> cubinChip{
       *this, "cubin-chip", llvm::cl::desc("Chip to use to serialize to cubin."),
-      llvm::cl::init("sm_50")};
+      llvm::cl::init("sm_80")};
   PassOptions::Option<std::string> cubinFeatures{
       *this, "cubin-features",
       llvm::cl::desc("Features to use to serialize to cubin."),
@@ -126,14 +126,13 @@ void buildGpuPassPipeline(OpPassManager &pm,
   // TODO: C++20 designated initializers.
   // The following pass is inconsistent.
-  // TODO: fix inconsistence.
-  ConvertGpuOpsToNVVMOpsOptions convertGpuOpsToNVVMOpsOptions;
-  convertGpuOpsToNVVMOpsOptions.useBarePtrCallConv =
-      options.kernelUseBarePtrCallConv;
-  convertGpuOpsToNVVMOpsOptions.indexBitwidth = options.kernelIndexBitWidth;
-  convertGpuOpsToNVVMOpsOptions.useOpaquePointers = true;
+  // ConvertGpuOpsToNVVMOpsOptions convertGpuOpsToNVVMOpsOptions;
+  // convertGpuOpsToNVVMOpsOptions.indexBitwidth =
+  //     options.kernelIndexBitWidth;
   pm.addNestedPass<gpu::GPUModuleOp>(
-      createConvertGpuOpsToNVVMOps(convertGpuOpsToNVVMOpsOptions));
+      // TODO: fix inconsistence.
+      createLowerGpuOpsToNVVMOpsPass(/*indexBitWidth=*/
+                                     options.kernelIndexBitWidth));
 
   // TODO: C++20 designated initializers.
   ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
   convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
   pm.addNestedPass<gpu::GPUModuleOp>(
       createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
 
   pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
+
+  // TODO: C++20 designated initializers.
+  GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions;
+  // Note: hostBarePtrCallConv must be false for now otherwise
+  // gpu::HostRegister is ill-defined: it wants unranked memrefs but can't
+  // lower the to bare ptr.
+  gpuToLLVMConversionOptions.hostBarePtrCallConv =
+      options.hostUseBarePtrCallConv;
+  gpuToLLVMConversionOptions.kernelBarePtrCallConv =
+      options.kernelUseBarePtrCallConv;
+  gpuToLLVMConversionOptions.useOpaquePointers = true;
+
+  // TODO: something useful here.
+  // gpuToLLVMConversionOptions.gpuBinaryAnnotation = "";
+  pm.addNestedPass<gpu::GPUModuleOp>(
+      createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));
+
   // Convert vector to LLVM (always needed).
   // TODO: C++20 designated initializers.
   ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
@@ -155,6 +170,11 @@ void buildGpuPassPipeline(OpPassManager &pm,
 
   // Finally we can reconcile unrealized casts.
   pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
+
+#if MLIR_GPU_TO_CUBIN_PASS_ENABLE
+  pm.addNestedPass<gpu::GPUModuleOp>(createGpuSerializeToCubinPass(
+      options.cubinTriple, options.cubinChip, options.cubinFeatures));
+#endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE
 }
 
 void buildLowerToNVVMPassPipeline(OpPassManager &pm,
@@ -231,16 +251,22 @@
   //===----------------------------------------------------------------------===//
   // Host post-GPUModule-specific stuff.
   //===----------------------------------------------------------------------===//
-  // Attach an NVVM target to all the GPU modules with the provided target
-  // options.
+  // Convert vector to LLVM (always needed).
   // TODO: C++20 designated initializers.
-  GpuNVVMAttachTargetOptions nvvmTargetOptions;
-  nvvmTargetOptions.triple = options.cubinTriple;
-  nvvmTargetOptions.chip = options.cubinChip;
-  nvvmTargetOptions.features = options.cubinFeatures;
-  pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
+  ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
+  convertVectorToLLVMPassOptions.reassociateFPReductions = true;
+  pm.addNestedPass<func::FuncOp>(
+      createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
 
-  // Convert GPU to LLVM.
+  ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt3;
+  // Must be 64b on the host, things don't compose properly around
+  // gpu::LaunchOp and gpu::HostRegisterOp.
+  // TODO: fix GPU layering.
+  convertIndexToLLVMPassOpt3.indexBitwidth = options.hostIndexBitWidth;
+  pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt3));
+
+  // This must happen after cubin translation otherwise gpu.launch_func is
+  // illegal if no cubin annotation is present.
   // TODO: C++20 designated initializers.
   GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions;
   // Note: hostBarePtrCallConv must be false for now otherwise
@@ -251,28 +277,10 @@
   gpuToLLVMConversionOptions.kernelBarePtrCallConv =
       options.kernelUseBarePtrCallConv;
   gpuToLLVMConversionOptions.useOpaquePointers = true;
-
   // TODO: something useful here.
   // gpuToLLVMConversionOptions.gpuBinaryAnnotation = "";
   pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));
 
-  // Serialize all GPU modules to binaries.
-  pm.addPass(createGpuModuleToBinaryPass());
-
-  // Convert vector to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
-  convertVectorToLLVMPassOptions.reassociateFPReductions = true;
-  pm.addNestedPass<func::FuncOp>(
-      createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
-
-  ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt3;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertIndexToLLVMPassOpt3.indexBitwidth = options.hostIndexBitWidth;
-  pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt3));
-
   // Convert Func to LLVM (always needed).
   // TODO: C++20 designated initializers.
   ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions2;