
Commit ad44112

[MLIR] Make SM_90 integration tests use TargetAttr (#65926)
The 'TargetAttr' workflow was recently introduced for serializing 'MLIR->LLVM->PTX'. #65857 removes the previous gpu::Serialization* passes because they duplicate that functionality. This PR drops the gpu::Serialization* passes from the SM_90 integration tests and enables the 'TargetAttr' workflow instead. It also moves the transform-dialect-specific test to a new folder.
1 parent 0954dc3 commit ad44112
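For context, the 'TargetAttr' workflow splits serialization into two steps: --nvvm-attach-target annotates each gpu.module with an NVVM target attribute, and --gpu-module-to-binary later serializes the module according to that attribute. A minimal sketch of the annotated IR is shown below; the module and kernel names are illustrative and not taken from this commit:

gpu.module @main_kernel [#nvvm.target<O = 3, chip = "sm_90", features = "+ptx80">] {
  // Illustrative kernel body: unchanged by the attach pass. --gpu-module-to-binary
  // reads the #nvvm.target attribute above and emits PTX for sm_90, replacing the
  // old gpu-to-cubin / gpu::Serialization* flow.
  gpu.func @main_kernel() kernel {
    gpu.return
  }
}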

3 files changed: +129 −121 lines changed


mlir/test/Integration/GPU/CUDA/sm90/tmaload.mlir renamed to mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir

Lines changed: 15 additions & 12 deletions
@@ -1,17 +1,20 @@
- // RUN: mlir-opt %s --convert-nvgpu-to-nvvm -gpu-kernel-outlining \
- // RUN: -convert-scf-to-cf -convert-nvvm-to-llvm \
- // RUN: -convert-vector-to-llvm \
- // RUN: -convert-math-to-llvm \
- // RUN: -expand-strided-metadata \
- // RUN: -lower-affine \
- // RUN: -convert-index-to-llvm=index-bitwidth=32 \
- // RUN: -convert-arith-to-llvm \
- // RUN: -finalize-memref-to-llvm \
- // RUN: -convert-func-to-llvm \
- // RUN: -canonicalize \
- // RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm{use-opaque-pointers=1},lower-affine,convert-scf-to-cf,convert-vector-to-llvm,convert-math-to-llvm,expand-strided-metadata,lower-affine,convert-index-to-llvm{index-bitwidth=32},convert-arith-to-llvm,reconcile-unrealized-casts,gpu-to-cubin{chip=sm_90 features=+ptx80 dump-ptx}))' \
+ // RUN: mlir-opt %s --convert-nvgpu-to-nvvm \
+ // RUN: -gpu-kernel-outlining \
+ // RUN: -convert-nvvm-to-llvm \
+ // RUN: -convert-nvgpu-to-nvvm \
+ // RUN: -convert-scf-to-cf \
+ // RUN: -convert-vector-to-llvm \
+ // RUN: -convert-index-to-llvm=index-bitwidth=32 \
+ // RUN: -convert-arith-to-llvm \
+ // RUN: -finalize-memref-to-llvm='use-opaque-pointers=1' \
+ // RUN: -convert-func-to-llvm \
+ // RUN: -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
+ // RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
+ // RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
  // RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK-PTX

+ // Basic PTX check to make sure we are generating the right instructions.
+
  // CHECK-PTX: mbarrier.init.shared.b64
  // CHECK-PTX: mbarrier.arrive.expect_tx.shared.b64
  // CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes

mlir/test/Integration/GPU/CUDA/sm90/tmaload-transform.mlir

Lines changed: 0 additions & 109 deletions
This file was deleted.
New file · Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
// RUN: mlir-opt %s \
// RUN: -test-transform-dialect-interpreter \
// RUN: -test-transform-dialect-erase-schedule \
// RUN: -convert-nvgpu-to-nvvm -gpu-kernel-outlining \
// RUN: -convert-scf-to-cf -convert-nvvm-to-llvm \
// RUN: -convert-vector-to-llvm \
// RUN: -convert-math-to-llvm \
// RUN: -expand-strided-metadata \
// RUN: -lower-affine \
// RUN: -convert-index-to-llvm=index-bitwidth=32 \
// RUN: -convert-arith-to-llvm \
// RUN: -finalize-memref-to-llvm \
// RUN: -convert-func-to-llvm \
// RUN: -canonicalize \
// RUN: -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
// RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK-PTX

// Basic PTX check to make sure we are generating the right instructions.
// CHECK-PTX: mbarrier.init.shared.b64
// CHECK-PTX: mbarrier.arrive.expect_tx.shared.b64
// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes
// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes
// CHECK-PTX: mbarrier.arrive.expect_tx.shared.b64
// CHECK-PTX: mbarrier.try_wait.parity.shared.b64

// RUN: mlir-opt %s \
// RUN: -test-transform-dialect-interpreter \
// RUN: -test-transform-dialect-erase-schedule \
// RUN: -convert-nvgpu-to-nvvm -gpu-kernel-outlining \
// RUN: -convert-scf-to-cf -convert-nvvm-to-llvm \
// RUN: -convert-vector-to-llvm \
// RUN: -convert-math-to-llvm \
// RUN: -expand-strided-metadata \
// RUN: -lower-affine \
// RUN: -convert-index-to-llvm=index-bitwidth=32 \
// RUN: -convert-arith-to-llvm \
// RUN: -finalize-memref-to-llvm \
// RUN: -convert-func-to-llvm \
// RUN: -canonicalize \
// RUN: -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
// RUN: --entry-point-result=void \
// RUN: | FileCheck %s

// CHECK: [GPU] TMA LOADED lhs[45][7] 7.000000
// CHECK: [GPU] TMA LOADED rhs[7][0] 3.000000

memref.global "private" @bufferLhsGlobal : memref<64x8xf32, 3>
memref.global "private" @bufferRhsGlobal : memref<8x128xf32, 3>
func.func @main() {
  %c10000000 = arith.constant 10000000 : index
  %c6144 = arith.constant 6144 : index
  %c45 = arith.constant 45 : index
  %c7 = arith.constant 7 : index
  %c64 = arith.constant 64 : index
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %c8 = arith.constant 8 : index
  %c128 = arith.constant 128 : index
  %cst = arith.constant 3.000000e+00 : f32
  %alloc = memref.alloc() : memref<64x8xf32>
  %alloc_0 = memref.alloc() : memref<8x128xf32>
  scf.for %arg0 = %c0 to %c8 step %c1 {
    scf.for %arg1 = %c0 to %c128 step %c1 {
      memref.store %cst, %alloc_0[%arg0, %arg1] : memref<8x128xf32>
    }
  }
  scf.for %arg0 = %c0 to %c64 step %c1 {
    scf.for %arg1 = %c0 to %c8 step %c1 {
      %5 = arith.index_cast %arg1 : index to i64
      %6 = arith.uitofp %5 : i64 to f32
      memref.store %6, %alloc[%arg0, %arg1] : memref<64x8xf32>
    }
  }
  %0 = gpu.wait async
  %memref, %asyncToken = gpu.alloc async [%0] () : memref<64x8xf32>
  %memref_1, %asyncToken_2 = gpu.alloc async [%0] () : memref<8x128xf32>
  %1 = gpu.memcpy async [%0] %memref, %alloc : memref<64x8xf32>, memref<64x8xf32>
  %2 = gpu.memcpy async [%0] %memref_1, %alloc_0 : memref<8x128xf32>, memref<8x128xf32>

  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
      threads(%tx, %ty, %tz) in (%block_x = %c128, %block_y = %c1, %block_z = %c1) {
    %out = memref.get_global @bufferLhsGlobal : memref<64x8xf32, 3>
    %out_1 = memref.get_global @bufferRhsGlobal : memref<8x128xf32, 3>
    linalg.copy ins(%memref: memref<64x8xf32>) outs(%out: memref<64x8xf32, 3>)
    linalg.copy ins(%memref_1: memref<8x128xf32>) outs(%out_1: memref<8x128xf32, 3>)

    %6 = gpu.thread_id x
    %10 = arith.cmpi eq, %6, %c0 : index
    scf.if %10 {
      %11 = memref.load %out[%c45, %c7] : memref<64x8xf32, 3>
      %12 = memref.load %out_1[%c7, %c0] : memref<8x128xf32, 3>
      gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A" %11 : f32
      gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A" %12 : f32
    }
    gpu.terminator
  }

  return
}

transform.sequence failures(propagate) {
^bb1(%arg1: !transform.any_op):
  %copy = transform.structured.match ops{["linalg.copy"]} in %arg1
    : (!transform.any_op) -> !transform.any_op
  transform.nvgpu.rewrite_copy_as_tma %copy
    : (!transform.any_op) -> ()
}
