-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[mlir][test][sve] Add e2e test for linalg.pack + linalg.unpack #129696
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
banach-space
merged 1 commit into
llvm:main
from
banach-space:andrzej/sve/pack_unpack_e2e
Mar 5, 2025
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
185 changes: 185 additions & 0 deletions
185
mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/pack-unpack-scalable-inner-tile.mlir
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,185 @@ | ||
// DEFINE: %{compile} = mlir-opt %s \ | ||
// DEFINE: -transform-interpreter -test-transform-dialect-erase-schedule \ | ||
// DEFINE: --lower-vector-mask |\ | ||
// DEFINE: mlir-opt -arm-sve-legalize-vector-storage -convert-vector-to-llvm="enable-arm-sve"\ | ||
// DEFINE: -test-lower-to-llvm -o %t | ||
// DEFINE: %{entry_point} = main | ||
// DEFINE: %{run} = mlir-cpu-runner %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve"\ | ||
// DEFINE: -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%native_mlir_arm_runner_utils | ||
|
||
// RUN: rm -f %t && %{compile} && %{run} | FileCheck %s | ||
|
||
/// End-to-end test for linalg.pack + linalg.unpack where one of the inner tile sizes is | ||
/// scalable. | ||
/// NOTE: Vectorization has not been enabled yet! | ||
|
||
|
||
/// The main entry point | ||
func.func @main() { | ||
// Set vscale to 2 (vector width = 256). This will have an identical effect to: | ||
// * qemu-aarch64 -cpu max,sve-max-vq=2 (...) | ||
// (If your platform supports it, you can play with other values as well) | ||
%c256 = arith.constant 256 : i32 | ||
func.call @setArmVLBits(%c256) : (i32) -> () | ||
|
||
// Dynamic/scalable tile size (vscale x 4) | ||
%c4 = arith.constant 4 : index | ||
%vs = vector.vscale | ||
%tile_size = arith.muli %c4, %vs : index | ||
|
||
vector.print str "\nINNER TILE SIZE (run-time value): " | ||
vector.print %tile_size : index | ||
|
||
// Input matrix. The values and dimension have been selected so that this | ||
// matrix can be viewed as: | ||
// +--------+--------+--------+ | ||
// | | | | | ||
// | 4x4 | 4x4 | 4x4 | | ||
// | | | | | ||
// +--------+--------+--------+ | ||
// | | | | | ||
// | 3x4 | 3x4 | 3x4 | | ||
// | | | | | ||
// +--------+--------+--------+ | ||
// This way, after packing, there will be "incomplete" tiles that will | ||
// contain the padding value. After unpacking, the padding value should be | ||
// gone. | ||
%A_before = arith.constant dense<[ | ||
[1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], | ||
[1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], | ||
[1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], | ||
[1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], | ||
[4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6], | ||
[4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6], | ||
[4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6] | ||
]> : tensor<7x12xi32> | ||
|
||
// STEP 1: PACK + UNPACK | ||
// TODO: We should change the order to: Pack+print, Unpack+print. However, that causes the | ||
// bufferization to fail with: | ||
// * 'tensor.cast' op not bufferizable under the given constraints: cannot avoid RaW conflict | ||
// Investigate and either fix or remove this comment (if impossible to work-around). | ||
%A_pack = func.call @pack_main(%A_before, %tile_size) : (tensor<7x12xi32>, index) -> tensor<2x?x4x?xi32> | ||
%A_unpack = func.call @unpack_main(%A_pack, %tile_size) : (tensor<2x?x4x?xi32>, index) -> tensor<7x12xi32> | ||
|
||
// STEP 2: Print the matrices | ||
vector.print str "\nINPUT MATRIX (before packing)\n" | ||
%A_before_cast = tensor.cast %A_before : tensor<7x12xi32> to tensor<*xi32> | ||
call @printMemrefI32(%A_before_cast) : (tensor<*xi32>) -> () | ||
|
||
vector.print str "\nINPUT MATRIX (after packing)\n" | ||
%A_pack_cast = tensor.cast %A_pack : tensor<2x?x4x?xi32> to tensor<*xi32> | ||
// There ought to be at least one pad value inserted into a tile | ||
// CHECK-LABEL: (after packing) | ||
// CHECK: 123 | ||
call @printMemrefI32(%A_pack_cast) : (tensor<*xi32>) -> () | ||
|
||
vector.print str "\nINPUT MATRIX (after unpacking)\n" | ||
%A_unpack_cast = tensor.cast %A_unpack : tensor<7x12xi32> to tensor<*xi32> | ||
// This ought to match the input matrix | ||
// CHECK-LABEL: (after unpacking) | ||
// CHECK: [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], | ||
// CHECK: [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], | ||
// CHECK: [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], | ||
// CHECK: [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], | ||
// CHECK: [4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6], | ||
// CHECK: [4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6], | ||
// CHECK: [4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6] | ||
call @printMemrefI32(%A_unpack_cast) : (tensor<*xi32>) -> () | ||
|
||
return | ||
} | ||
|
||
/// Takes the unpacked matrix + inner tile size to use and returns the packed matrix. | ||
func.func private @pack_main(%A: tensor<7x12xi32>, %inner_tile_size: index) -> (tensor<2x?x4x?xi32>) { | ||
// Get the size of dim 1 (we could skip tensor.dim, but this way we can keep it generic) | ||
%c1 = arith.constant 1 : index | ||
%dim_1 = tensor.dim %A, %c1 : tensor<7x12xi32> | ||
|
||
// Compute the outer-tile size corresponding to the dynamic inner tile size. | ||
// NOTE: This step is important. While as a user we would only tweak the | ||
// inner tile sizes, we need to make sure that the outer sizes are updated | ||
// accordingly. | ||
%outer_tile_size = arith.ceildivui %dim_1, %inner_tile_size : index | ||
|
||
// NOTE: This is deliberately much larger than the input values in %A_before | ||
// so that it's easy to spot it in the output. | ||
%pad_val = arith.constant 123 : i32 | ||
|
||
%A_pack_empty = tensor.empty(%outer_tile_size, %inner_tile_size) : tensor<2x?x4x?xi32> | ||
|
||
%A_pack = linalg.pack %A | ||
padding_value(%pad_val : i32) | ||
inner_dims_pos = [0, 1] | ||
inner_tiles = [4, %inner_tile_size] | ||
into %A_pack_empty : tensor<7x12xi32> -> tensor<2x?x4x?xi32> | ||
|
||
return %A_pack : tensor<2x?x4x?xi32> | ||
} | ||
|
||
/// Takes the packed matrix, unpacks it and returns the result. | ||
func.func private @unpack_main(%A_pack : tensor<2x?x4x?xi32>, %inner_tile_size: index) -> tensor<7x12xi32> { | ||
%A_unpack_empty = tensor.empty() : tensor<7x12xi32> | ||
|
||
%A_unpack = linalg.unpack %A_pack | ||
inner_dims_pos = [0, 1] | ||
inner_tiles = [4, %inner_tile_size] | ||
into %A_unpack_empty : tensor<2x?x4x?xi32> -> tensor<7x12xi32> | ||
|
||
return %A_unpack : tensor<7x12xi32> | ||
} | ||
|
||
module @transforms attributes { transform.with_named_sequence } { | ||
transform.named_sequence @__transform_main(%module: !transform.any_op {transform.consume}) { | ||
%pack = transform.structured.match ops{["linalg.pack"]} in %module : (!transform.any_op) -> !transform.any_op | ||
%unpack = transform.structured.match ops{["linalg.unpack"]} in %module : (!transform.any_op) -> !transform.any_op | ||
|
||
// 1.1 Tile the linalg.pack Op so that we can decompose it into e.g. tensor.pad | ||
// and other lower-level Ops (see step 2.1) | ||
%tiled_pack_op_p, %loops_pack:2 = transform.structured.tile_using_for %pack tile_sizes [1, 1] | ||
: (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) | ||
|
||
// 1.2 Tile the linalg.unpack Op so that we can decompose it into e.g. tensor.pad | ||
// and other lower-level Ops (see the UnpackOp decomposition step below) | ||
%tiled_unpack_op_p, %loops_unpack:2 = transform.structured.tile_using_for %unpack tile_sizes [4, 1] | ||
: (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) | ||
|
||
// 2.1. Decompose tiled PackOp into lower-level Ops | ||
%func_op_pack = transform.get_parent_op %tiled_pack_op_p {isolated_from_above} : (!transform.any_op) -> !transform.op<"func.func"> | ||
transform.apply_patterns to %func_op_pack { | ||
transform.apply_patterns.linalg.decompose_pack_unpack | ||
transform.apply_patterns.linalg.decompose_pad | ||
} : !transform.op<"func.func"> | ||
|
||
transform.apply_patterns to %func_op_pack { | ||
rengolin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
transform.apply_patterns.tensor.fold_tensor_subset_ops | ||
transform.apply_patterns.canonicalization | ||
} : !transform.op<"func.func"> | ||
|
||
// 2.2. Decompose tiled UnpackOp into lower-level Ops | ||
%func_op_unpack = transform.get_parent_op %tiled_unpack_op_p {isolated_from_above} : (!transform.any_op) -> !transform.op<"func.func"> | ||
transform.apply_patterns to %func_op_unpack { | ||
transform.apply_patterns.linalg.decompose_pack_unpack | ||
} : !transform.op<"func.func"> | ||
|
||
transform.apply_patterns to %func_op_unpack { | ||
transform.apply_patterns.tensor.fold_tensor_subset_ops | ||
transform.apply_patterns.canonicalization | ||
} : !transform.op<"func.func"> | ||
|
||
// 3. Bufferize before lowering to LLVM | ||
%bufferize = transform.bufferization.one_shot_bufferize %module | ||
{bufferize_function_boundaries=true} : (!transform.any_op) -> !transform.any_op | ||
|
||
// 4. Canonicalize | ||
%func_op_bufferized = transform.structured.match ops{["func.func"]} in %bufferize : (!transform.any_op) -> !transform.op<"func.func"> | ||
transform.apply_patterns to %func_op_bufferized { | ||
transform.apply_patterns.canonicalization | ||
} : !transform.op<"func.func"> | ||
|
||
transform.yield | ||
} | ||
} | ||
|
||
func.func private @printMemrefI32(%ptr : tensor<*xi32>) | ||
func.func private @setArmVLBits(%bits : i32) |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.