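// Test for TMA (Tensor Memory Accelerator) loads on sm_90: a 64x8xf32 lhs
// tile and an 8x128xf32 rhs tile are copied from global to shared memory,
// and the linalg.copy ops are rewritten into TMA operations by the transform
// script at the end of the file.
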
// RUN: mlir-opt %s \
// RUN:   -test-transform-dialect-interpreter \
// RUN:   -test-transform-dialect-erase-schedule \
// RUN:   -convert-nvgpu-to-nvvm -gpu-kernel-outlining \
// RUN:   -convert-scf-to-cf -convert-nvvm-to-llvm \
// RUN:   -convert-vector-to-llvm \
// RUN:   -convert-math-to-llvm \
// RUN:   -expand-strided-metadata \
// RUN:   -lower-affine \
// RUN:   -convert-index-to-llvm=index-bitwidth=32 \
// RUN:   -convert-arith-to-llvm \
// RUN:   -finalize-memref-to-llvm \
// RUN:   -convert-func-to-llvm \
// RUN:   -canonicalize \
// RUN:   -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
// RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
// RUN:  2>&1 | FileCheck %s --check-prefixes=CHECK-PTX
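// The pipeline above compiles the GPU module to a binary and dumps the
// generated PTX via -debug-only=serialize-to-isa for the checks below.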

// Basic PTX check to make sure we are generating the right instructions.
// CHECK-PTX: mbarrier.init.shared.b64
// CHECK-PTX: mbarrier.arrive.expect_tx.shared.b64
// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes
// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes
// CHECK-PTX: mbarrier.arrive.expect_tx.shared.b64
// CHECK-PTX: mbarrier.try_wait.parity.shared.b64

// RUN: mlir-opt %s \
// RUN:   -test-transform-dialect-interpreter \
// RUN:   -test-transform-dialect-erase-schedule \
// RUN:   -convert-nvgpu-to-nvvm -gpu-kernel-outlining \
// RUN:   -convert-scf-to-cf -convert-nvvm-to-llvm \
// RUN:   -convert-vector-to-llvm \
// RUN:   -convert-math-to-llvm \
// RUN:   -expand-strided-metadata \
// RUN:   -lower-affine \
// RUN:   -convert-index-to-llvm=index-bitwidth=32 \
// RUN:   -convert-arith-to-llvm \
// RUN:   -finalize-memref-to-llvm \
// RUN:   -convert-func-to-llvm \
// RUN:   -canonicalize \
// RUN:   -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
// RUN:  | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
// RUN:  | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts \
// RUN:  | mlir-cpu-runner \
// RUN:    --shared-libs=%mlir_cuda_runtime \
// RUN:    --shared-libs=%mlir_runner_utils \
// RUN:    --entry-point-result=void \
// RUN:  | FileCheck %s
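// The pipeline above runs the same lowering but executes the module with
// mlir-cpu-runner against the CUDA runtime wrappers; FileCheck verifies the
// values printed from the device.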

// CHECK: [GPU] TMA LOADED lhs[45][7] 7.000000
// CHECK: [GPU] TMA LOADED rhs[7][0] 3.000000

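// Destination buffers for the TMA loads, placed in shared memory
// (memory space 3).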
memref.global "private" @bufferLhsGlobal : memref<64x8xf32, 3>
memref.global "private" @bufferRhsGlobal : memref<8x128xf32, 3>
func.func @main() {
  %c10000000 = arith.constant 10000000 : index
  %c6144 = arith.constant 6144 : index
  %c45 = arith.constant 45 : index
  %c7 = arith.constant 7 : index
  %c64 = arith.constant 64 : index
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %c8 = arith.constant 8 : index
  %c128 = arith.constant 128 : index
  %cst = arith.constant 3.000000e+00 : f32
  %alloc = memref.alloc() : memref<64x8xf32>
  %alloc_0 = memref.alloc() : memref<8x128xf32>
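  // Fill the 8x128 rhs host buffer with 3.0.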
  scf.for %arg0 = %c0 to %c8 step %c1 {
    scf.for %arg1 = %c0 to %c128 step %c1 {
      memref.store %cst, %alloc_0[%arg0, %arg1] : memref<8x128xf32>
    }
  }
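  // Fill the 64x8 lhs host buffer so each element equals its column index.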
  scf.for %arg0 = %c0 to %c64 step %c1 {
    scf.for %arg1 = %c0 to %c8 step %c1 {
      %5 = arith.index_cast %arg1 : index to i64
      %6 = arith.uitofp %5 : i64 to f32
      memref.store %6, %alloc[%arg0, %arg1] : memref<64x8xf32>
    }
  }
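  // Allocate device buffers and copy both host buffers to the device.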
  %0 = gpu.wait async
  %memref, %asyncToken = gpu.alloc async [%0] () : memref<64x8xf32>
  %memref_1, %asyncToken_2 = gpu.alloc async [%0] () : memref<8x128xf32>
  %1 = gpu.memcpy async [%0] %memref, %alloc : memref<64x8xf32>, memref<64x8xf32>
  %2 = gpu.memcpy async [%0] %memref_1, %alloc_0 : memref<8x128xf32>, memref<8x128xf32>

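  // Launch a single block of 128 threads.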
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
             threads(%tx, %ty, %tz) in (%block_x = %c128, %block_y = %c1, %block_z = %c1) {
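    // Copy both tiles from global to shared memory. The transform script at
    // the end of the file rewrites these linalg.copy ops into TMA (nvgpu)
    // operations, which lower to the cp.async.bulk.tensor instructions
    // checked above.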
    %out = memref.get_global @bufferLhsGlobal : memref<64x8xf32, 3>
    %out_1 = memref.get_global @bufferRhsGlobal : memref<8x128xf32, 3>
    linalg.copy ins(%memref: memref<64x8xf32>) outs(%out: memref<64x8xf32, 3>)
    linalg.copy ins(%memref_1: memref<8x128xf32>) outs(%out_1: memref<8x128xf32, 3>)

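    // Thread 0 reads one element back from each shared buffer and prints it:
    // lhs[45][7] holds its column index (7.0) and rhs[7][0] holds 3.0.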
    %6 = gpu.thread_id x
    %10 = arith.cmpi eq, %6, %c0 : index
    scf.if %10 {
      %11 = memref.load %out[%c45, %c7] : memref<64x8xf32, 3>
      %12 = memref.load %out_1[%c7, %c0] : memref<8x128xf32, 3>
      gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A" %11 : f32
      gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A" %12 : f32
    }
    gpu.terminator
  }

  return
}

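// Transform script: match every linalg.copy in the payload and rewrite it as
// a TMA load. The schedule itself is removed by
// -test-transform-dialect-erase-schedule before lowering.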
transform.sequence failures(propagate) {
^bb1(%arg1: !transform.any_op):
  %copy = transform.structured.match ops{["linalg.copy"]} in %arg1
    : (!transform.any_op) -> !transform.any_op
  transform.nvgpu.rewrite_copy_as_tma %copy
    : (!transform.any_op) -> ()
}