diff --git a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
index 46f29c6dd8b92..e0f4c71051e50 100644
--- a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
+++ b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h
@@ -16,9 +16,7 @@ namespace mlir {
 class LLVMTypeConverter;
 class ConversionTarget;
 class RewritePatternSet;
-
-template <typename OpT>
-class OperationPass;
+class Pass;
 
 namespace gpu {
 class GPUModuleOp;
@@ -45,14 +43,6 @@ void populateGpuSubgroupReduceOpLoweringPattern(LLVMTypeConverter &converter,
 /// Collect a set of patterns to convert WMMA ops from GPU dialect to NVVM.
 void populateGpuWMMAToNVVMConversionPatterns(LLVMTypeConverter &converter,
                                              RewritePatternSet &patterns);
-
-/// Creates a pass that lowers GPU dialect operations to NVVM counterparts. The
-/// index bitwidth used for the lowering of the device side index computations
-/// is configurable.
-std::unique_ptr<OperationPass<gpu::GPUModuleOp>> createLowerGpuOpsToNVVMOpsPass(
-    unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
-    bool hasRedux = false);
-
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_GPUTONVVM_GPUTONVVMPASS_H_
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index ed37abf85275b..3218760931b8c 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -486,7 +486,6 @@ def LowerHostCodeToLLVMPass : Pass<"lower-host-to-llvm", "ModuleOp"> {
 
 def ConvertGpuOpsToNVVMOps : Pass<"convert-gpu-to-nvvm", "gpu::GPUModuleOp"> {
   let summary = "Generate NVVM operations for gpu operations";
-  let constructor = "mlir::createLowerGpuOpsToNVVMOpsPass()";
   let dependentDialects = [
     "cf::ControlFlowDialect",
     "memref::MemRefDialect",
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 06469dc82b3fc..764b6a779b98c 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -210,11 +210,7 @@ struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
 /// code.
 struct LowerGpuOpsToNVVMOpsPass
     : public impl::ConvertGpuOpsToNVVMOpsBase<LowerGpuOpsToNVVMOpsPass> {
-  LowerGpuOpsToNVVMOpsPass() = default;
-  LowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth, bool hasRedux = false) {
-    this->indexBitwidth = indexBitwidth;
-    this->hasRedux = hasRedux;
-  }
+  using Base::Base;
 
   void runOnOperation() override {
     gpu::GPUModuleOp m = getOperation();
@@ -378,8 +374,3 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
                                    "__nv_tanh");
   populateOpPatterns<math::TanOp>(converter, patterns, "__nv_tanf", "__nv_tan");
 }
-
-std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
-mlir::createLowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth, bool hasRedux) {
-  return std::make_unique<LowerGpuOpsToNVVMOpsPass>(indexBitwidth, hasRedux);
-}
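
With the hand-written factory removed, callers construct the pass through the TableGen-generated constructor and options struct instead. A minimal migration sketch for downstream code, assuming the generated ConvertGpuOpsToNVVMOpsOptions exposes the indexBitwidth and hasRedux options that the old constructor set (the TestLowerToNVVM.cpp hunks below use the same struct):

    // Before: dedicated factory with positional arguments.
    //   pm.addNestedPass<gpu::GPUModuleOp>(
    //       createLowerGpuOpsToNVVMOpsPass(/*indexBitwidth=*/32, /*hasRedux=*/true));
    // After: fill the generated options struct and call the generated factory.
    ConvertGpuOpsToNVVMOpsOptions opts;
    opts.indexBitwidth = 32;
    opts.hasRedux = true;
    pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps(opts));
    // Default-configured form, as used in SparseTensorPipelines.cpp below:
    //   pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps());
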
diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
index a7fd5a25e6831..24c4c4c43a93d 100644
--- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
@@ -52,7 +52,7 @@ void mlir::sparse_tensor::buildSparseCompiler(
     pm.addPass(createSparseGPUCodegenPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createStripDebugInfoPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
-    pm.addNestedPass<gpu::GPUModuleOp>(createLowerGpuOpsToNVVMOpsPass());
+    pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps());
   }
 
   // TODO(springerm): Add sparse support to the BufferDeallocation pass and add
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
index 2c1ae3ee840d0..0cb06b7bf1d20 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
@@ -1,6 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{dump-ptx}))' \
+// RUN: | mlir-opt -test-lower-to-nvvm -debug-only=serialize-to-isa \
 // RUN: 2>&1 | FileCheck %s
 
 // CHECK: Generated by LLVM NVPTX Back-End
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
index 8eb90fd3ca994..80972f244ec02 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
@@ -2,10 +2,9 @@
 // NOTE: this test requires gpu-sm80
 //
 // RUN: mlir-opt \
-// RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse,gpu.module(gpu-to-cubin{chip=sm_80 features=+ptx71}))" \
+// RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse)" \
 // RUN: %s \
-// RUN: | mlir-opt --convert-vector-to-scf --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
-// RUN: --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
+// RUN: | mlir-opt --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71" \
 // RUN: | mlir-cpu-runner \
 // RUN: --shared-libs=%mlir_cuda_runtime \
 // RUN: --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
index 8571c5ca5f3dc..8c991493a2b01 100644
--- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
+++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
@@ -1,9 +1,7 @@
 // RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" -canonicalize |\
 // RUN: mlir-opt -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if |\
 // RUN: mlir-opt -lower-affine -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm \
-// RUN: -convert-arith-to-llvm -gpu-kernel-outlining |\
-// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
-// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
+// RUN: -convert-arith-to-llvm -test-lower-to-nvvm | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN: -shared-libs=%mlir_cuda_runtime \
 // RUN: -shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
index c671c1843862f..f26c18c4ae3dd 100644
--- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
+++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
@@ -2,9 +2,7 @@
 // everything on the same thread.
 // RUN: mlir-opt %s -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
 // RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN: -gpu-kernel-outlining |\
-// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
-// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
+// RUN: -test-lower-to-nvvm | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN: -shared-libs=%mlir_cuda_runtime \
 // RUN: -shared-libs=%mlir_c_runner_utils \
@@ -15,9 +13,7 @@
 // RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write" \
 // RUN: -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
 // RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN: -gpu-kernel-outlining |\
-// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
-// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
+// RUN: -test-lower-to-nvvm | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN: -shared-libs=%mlir_cuda_runtime \
 // RUN: -shared-libs=%mlir_c_runner_utils \
@@ -27,9 +23,7 @@
 // RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" \
 // RUN: -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
 // RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN: -gpu-kernel-outlining |\
-// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
-// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
+// RUN: -test-lower-to-nvvm | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN: -shared-libs=%mlir_cuda_runtime \
 // RUN: -shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
index 535ba52d66f00..591bf1b4fd182 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{chip=sm_70}))' \
-// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
 // RUN: | mlir-cpu-runner \
 // RUN: --shared-libs=%mlir_cuda_runtime \
 // RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
index c4ca46521eeb4..51bd23f817b33 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
@@ -3,9 +3,7 @@
 // Similar to the wmma-matmul-f32 but but with the memref bare pointer lowering convention.
 // This test also uses gpu.memcpy operations (instead of gpu.host_register).
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm{use-bare-ptr-memref-call-conv=1},gpu-to-cubin{chip=sm_70}))' \
-// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm="use-bare-pointers-for-host=1 use-bare-pointers-for-kernels=1" \
+// RUN: | mlir-opt -test-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70" \
 // RUN: | mlir-cpu-runner \
 // RUN: --shared-libs=%mlir_cuda_runtime \
 // RUN: --entry-point-result=void \
diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
index ae410dce281b1..0307b3d504be9 100644
--- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{chip=sm_70}))' \
-// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
 // RUN: | mlir-cpu-runner \
 // RUN: --shared-libs=%mlir_cuda_runtime \
 // RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
index f4324a14a36b6..b131b8682ddee 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN: --shared-libs=%mlir_cuda_runtime \
 // RUN: --shared-libs=%mlir_runner_utils \
@@ -10,9 +8,7 @@
 
 // Same as above but with the memref bare pointer lowering convention.
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm{use-bare-ptr-memref-call-conv=1},gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm="use-bare-pointers-for-kernels=1" \
+// RUN: | mlir-opt -test-lower-to-nvvm="kernel-bare-ptr-calling-convention=1" \
 // RUN: | mlir-cpu-runner \
 // RUN: --shared-libs=%mlir_cuda_runtime \
 // RUN: --shared-libs=%mlir_runner_utils \
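
Across these RUN-line updates the hand-rolled pipelines map onto -test-lower-to-nvvm options in a consistent way: gpu-to-cubin{chip=...} becomes cubin-chip=..., convert-gpu-to-nvvm{use-bare-ptr-memref-call-conv=1} plus gpu-to-llvm="use-bare-pointers-for-kernels=1" become kernel-bare-ptr-calling-convention=1, use-bare-pointers-for-host=1 becomes host-bare-ptr-calling-convention=1, and gpu-to-cubin{dump-ptx} is replaced by running the pipeline under -debug-only=serialize-to-isa. An illustrative combined invocation (options taken from the hunks above, not from any one test):

    // RUN: ... | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 kernel-bare-ptr-calling-convention=1"
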
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
index 0a8d38f145279..155423db7e050 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN: --shared-libs=%mlir_cuda_runtime \
 // RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
index bcd785d35291c..e5047b6efa3bf 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN: --shared-libs=%mlir_cuda_runtime \
 // RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
index aa4b0e8820479..163e9fdba60c1 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN: --shared-libs=%mlir_cuda_runtime \
 // RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
index 2e7d046c39214..381db2639c371 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN: --shared-libs=%mlir_cuda_runtime \
 // RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
index 32cfa27c8988a..23c6c117e67f3 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN: --shared-libs=%mlir_cuda_runtime \
 // RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
index 30767b9495b6f..3c5a100b5b90d 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN: --shared-libs=%mlir_cuda_runtime \
 // RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/async.mlir b/mlir/test/Integration/GPU/CUDA/async.mlir
index e6dd91ace9743..d2a5127a34c3b 100644
--- a/mlir/test/Integration/GPU/CUDA/async.mlir
+++ b/mlir/test/Integration/GPU/CUDA/async.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)' \
+// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary \
 // RUN: | mlir-opt -async-to-async-runtime -async-runtime-ref-counting \
 // RUN: | mlir-opt -convert-async-to-llvm -convert-func-to-llvm \
 // RUN: | mlir-cpu-runner \
diff --git a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
index afcb674858c86..a5d04f7322b49 100644
--- a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
+++ b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
@@ -1,8 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN: --shared-libs=%mlir_cuda_runtime \
 // RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
index 444e2877c822c..7657bf4732d32 100644
--- a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
+++ b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN: --shared-libs=%mlir_cuda_runtime \
 // RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/printf.mlir b/mlir/test/Integration/GPU/CUDA/printf.mlir
index fce773974d5ba..1a35d1e78b094 100644
--- a/mlir/test/Integration/GPU/CUDA/printf.mlir
+++ b/mlir/test/Integration/GPU/CUDA/printf.mlir
@@ -1,6 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN: --shared-libs=%mlir_cuda_runtime \
 // RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/shuffle.mlir b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
index 6a784ca32f9ef..40fcea857d5b4 100644
--- a/mlir/test/Integration/GPU/CUDA/shuffle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN: --shared-libs=%mlir_cuda_runtime \
 // RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/two-modules.mlir b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
index 5f6e5d75aff5b..5a9acdf3d8da6 100644
--- a/mlir/test/Integration/GPU/CUDA/two-modules.mlir
+++ b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
@@ -1,7 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
-// RUN: | mlir-opt -gpu-to-llvm \
+// RUN: | mlir-opt -test-lower-to-nvvm \
 // RUN: | mlir-cpu-runner \
 // RUN: --shared-libs=%mlir_cuda_runtime \
 // RUN: --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
index 5db6f56fb4b38..99e19dae0d72b 100644
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -65,7 +65,7 @@ struct TestLowerToNVVMOptions
       llvm::cl::init("nvptx64-nvidia-cuda")};
   PassOptions::Option<std::string> cubinChip{
       *this, "cubin-chip", llvm::cl::desc("Chip to use to serialize to cubin."),
-      llvm::cl::init("sm_80")};
+      llvm::cl::init("sm_50")};
   PassOptions::Option<std::string> cubinFeatures{
       *this, "cubin-features",
       llvm::cl::desc("Features to use to serialize to cubin."),
@@ -126,13 +126,14 @@ void buildGpuPassPipeline(OpPassManager &pm,
 
   // TODO: C++20 designated initializers.
   // The following pass is inconsistent.
-  // ConvertGpuOpsToNVVMOpsOptions convertGpuOpsToNVVMOpsOptions;
-  // convertGpuOpsToNVVMOpsOptions.indexBitwidth =
-  //     options.kernelIndexBitWidth;
+  // TODO: fix inconsistence.
+  ConvertGpuOpsToNVVMOpsOptions convertGpuOpsToNVVMOpsOptions;
+  convertGpuOpsToNVVMOpsOptions.useBarePtrCallConv =
+      options.kernelUseBarePtrCallConv;
+  convertGpuOpsToNVVMOpsOptions.indexBitwidth = options.kernelIndexBitWidth;
+  convertGpuOpsToNVVMOpsOptions.useOpaquePointers = true;
   pm.addNestedPass<gpu::GPUModuleOp>(
-      // TODO: fix inconsistence.
-      createLowerGpuOpsToNVVMOpsPass(/*indexBitWidth=*/
-                                     options.kernelIndexBitWidth));
+      createConvertGpuOpsToNVVMOps(convertGpuOpsToNVVMOpsOptions));
 
   // TODO: C++20 designated initializers.
   ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
@@ -141,22 +142,6 @@
       createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
   pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
 
-  // TODO: C++20 designated initializers.
-  GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions;
-  // Note: hostBarePtrCallConv must be false for now otherwise
-  // gpu::HostRegister is ill-defined: it wants unranked memrefs but can't
-  // lower the to bare ptr.
-  gpuToLLVMConversionOptions.hostBarePtrCallConv =
-      options.hostUseBarePtrCallConv;
-  gpuToLLVMConversionOptions.kernelBarePtrCallConv =
-      options.kernelUseBarePtrCallConv;
-  gpuToLLVMConversionOptions.useOpaquePointers = true;
-
-  // TODO: something useful here.
-  // gpuToLLVMConversionOptions.gpuBinaryAnnotation = "";
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));
-
   // Convert vector to LLVM (always needed).
   // TODO: C++20 designated initializers.
   ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
@@ -170,11 +155,6 @@
 
   // Finally we can reconcile unrealized casts.
   pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
-
-#if MLIR_GPU_TO_CUBIN_PASS_ENABLE
-  pm.addNestedPass<gpu::GPUModuleOp>(createGpuSerializeToCubinPass(
-      options.cubinTriple, options.cubinChip, options.cubinFeatures));
-#endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE
 }
 
 void buildLowerToNVVMPassPipeline(OpPassManager &pm,
@@ -251,22 +231,16 @@
   //===----------------------------------------------------------------------===//
   // Host post-GPUModule-specific stuff.
   //===----------------------------------------------------------------------===//
-  // Convert vector to LLVM (always needed).
+  // Attach an NVVM target to all the GPU modules with the provided target
+  // options.
   // TODO: C++20 designated initializers.
-  ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
-  convertVectorToLLVMPassOptions.reassociateFPReductions = true;
-  pm.addNestedPass<func::FuncOp>(
-      createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
+  GpuNVVMAttachTargetOptions nvvmTargetOptions;
+  nvvmTargetOptions.triple = options.cubinTriple;
+  nvvmTargetOptions.chip = options.cubinChip;
+  nvvmTargetOptions.features = options.cubinFeatures;
+  pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
 
-  ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt3;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertIndexToLLVMPassOpt3.indexBitwidth = options.hostIndexBitWidth;
-  pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt3));
-
-  // This must happen after cubin translation otherwise gpu.launch_func is
-  // illegal if no cubin annotation is present.
+  // Convert GPU to LLVM.
   // TODO: C++20 designated initializers.
   GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions;
   // Note: hostBarePtrCallConv must be false for now otherwise
@@ -277,10 +251,28 @@
   gpuToLLVMConversionOptions.kernelBarePtrCallConv =
       options.kernelUseBarePtrCallConv;
   gpuToLLVMConversionOptions.useOpaquePointers = true;
+
   // TODO: something useful here.
   // gpuToLLVMConversionOptions.gpuBinaryAnnotation = "";
   pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));
 
+  // Serialize all GPU modules to binaries.
+  pm.addPass(createGpuModuleToBinaryPass());
+
+  // Convert vector to LLVM (always needed).
+  // TODO: C++20 designated initializers.
+  ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
+  convertVectorToLLVMPassOptions.reassociateFPReductions = true;
+  pm.addNestedPass<func::FuncOp>(
+      createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
+
+  ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt3;
+  // Must be 64b on the host, things don't compose properly around
+  // gpu::LaunchOp and gpu::HostRegisterOp.
+  // TODO: fix GPU layering.
+  convertIndexToLLVMPassOpt3.indexBitwidth = options.hostIndexBitWidth;
+  pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt3));
+
   // Convert Func to LLVM (always needed).
   // TODO: C++20 designated initializers.
   ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions2;
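
Taken together, the host side of the test pipeline now serializes GPU code through the target-attribute flow rather than the per-module cubin pass. A condensed sketch of the resulting ordering, using only names that appear in the hunks above (illustrative, not the exact final file):

    // 1. Record the NVVM target (triple/chip/features) on every gpu.module.
    GpuNVVMAttachTargetOptions nvvmTargetOptions;
    nvvmTargetOptions.chip = options.cubinChip;
    pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
    // 2. Lower host-side GPU ops (e.g. gpu.launch_func) to LLVM runtime calls.
    pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));
    // 3. Serialize every annotated gpu.module to a binary; this replaces the
    //    MLIR_GPU_TO_CUBIN_PASS_ENABLE-guarded gpu-to-cubin stage.
    pm.addPass(createGpuModuleToBinaryPass());
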