Skip to content

Commit 09b0fcd

Browse files
committed
[mlir][vector] Add mask elimination transform
This adds a new transform, `eliminateVectorMasks()`, which aims to remove scalable `vector.create_mask` ops that will be all-true at runtime. It first attempts to do this by simply pattern-matching the mask operands (similar to some canonicalizations). If that does not give a definite answer (is the mask all-true: yes/no), value bounds analysis is used to find the lower bound of the unknown operands. If the lower bound is >= the corresponding dim size of the mask vector type, then that dimension of the mask is all-true. Note: Eliminating create_masks here means replacing them with all-true constants (which will then lead to the masks folding away).
1 parent b7b0071 commit 09b0fcd

File tree

5 files changed

+307
-0
lines changed

5 files changed

+307
-0
lines changed

mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
1313
#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
14+
#include "mlir/Interfaces/FunctionInterfaces.h"
1415

1516
namespace mlir {
1617
class MLIRContext;
@@ -115,6 +116,22 @@ castAwayContractionLeadingOneDim(vector::ContractionOp contractOp,
115116
MaskingOpInterface maskingOp,
116117
RewriterBase &rewriter);
117118

119+
/// Holds the range `[vscaleMin, vscaleMax]` of values that `vector.vscale` can
/// take.
struct VscaleRange {
  /// Lower end of the range.
  unsigned vscaleMin;
  /// Upper end of the range.
  unsigned vscaleMax;
};
124+
125+
/// Attempts to eliminate redundant vector masks by replacing them with all-true
126+
/// constants at the top of the function (which results in the masks folding
127+
/// away). Note: Currently, this only runs for vector.create_mask ops and
128+
/// requires `vscaleRange`. If `vscaleRange` is not provided this transform does
129+
/// nothing. This is because these redundant masks are much more likely for
130+
/// scalable code which requires memref/tensor dynamic sizes, whereas fixed-size
131+
/// code has static sizes, so simpler folds remove the masks.
132+
void eliminateVectorMasks(IRRewriter &rewriter, FunctionOpInterface function,
133+
std::optional<VscaleRange> vscaleRange = {});
134+
118135
} // namespace vector
119136
} // namespace mlir
120137

mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ add_mlir_dialect_library(MLIRVectorTransforms
2222
VectorTransferSplitRewritePatterns.cpp
2323
VectorTransforms.cpp
2424
VectorUnroll.cpp
25+
VectorMaskElimination.cpp
2526

2627
ADDITIONAL_HEADER_DIRS
2728
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Vector/Transforms
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
#include "mlir/Dialect/Arith/IR/Arith.h"
2+
#include "mlir/Dialect/Utils/StaticValueUtils.h"
3+
#include "mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h"
4+
#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
5+
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
6+
#include "mlir/Interfaces/FunctionInterfaces.h"
7+
8+
using namespace mlir;
9+
using namespace mlir::vector;
10+
namespace {
11+
12+
/// If `value` is a constant multiple of `vector.vscale` return the multiplier.
13+
std::optional<int64_t> getConstantVscaleMultiplier(Value value) {
14+
if (value.getDefiningOp<vector::VectorScaleOp>())
15+
return 1;
16+
auto mul = value.getDefiningOp<arith::MulIOp>();
17+
if (!mul)
18+
return {};
19+
auto lhs = mul.getLhs();
20+
auto rhs = mul.getRhs();
21+
if (lhs.getDefiningOp<vector::VectorScaleOp>())
22+
return getConstantIntValue(rhs);
23+
if (rhs.getDefiningOp<vector::VectorScaleOp>())
24+
return getConstantIntValue(lhs);
25+
return {};
26+
}
27+
28+
/// Attempts to resolve a (scalable) CreateMaskOp to an all-true constant mask.
29+
/// All-true masks can then be eliminated by simple folds.
30+
LogicalResult resolveAllTrueCreateMaskOp(IRRewriter &rewriter,
31+
vector::CreateMaskOp createMaskOp,
32+
VscaleRange vscaleRange) {
33+
auto maskType = createMaskOp.getVectorType();
34+
auto maskTypeDimScalableFlags = maskType.getScalableDims();
35+
auto maskTypeDimSizes = maskType.getShape();
36+
37+
struct UnknownMaskDim {
38+
size_t position;
39+
Value dimSize;
40+
};
41+
42+
// Check for any dims that could be (partially) false before doing the more
43+
// expensive value bounds computations.
44+
SmallVector<UnknownMaskDim> unknownDims;
45+
for (auto [i, dimSize] : llvm::enumerate(createMaskOp.getOperands())) {
46+
if (auto intSize = getConstantIntValue(dimSize)) {
47+
// Mask not all-true for this dim.
48+
if (maskTypeDimScalableFlags[i] || intSize < maskTypeDimSizes[i])
49+
return failure();
50+
} else if (auto vscaleMultiplier = getConstantVscaleMultiplier(dimSize)) {
51+
// Mask not all-true for this dim.
52+
if (vscaleMultiplier < maskTypeDimSizes[i])
53+
return failure();
54+
} else {
55+
// Unknown (without further analysis).
56+
unknownDims.push_back(UnknownMaskDim{i, dimSize});
57+
}
58+
}
59+
60+
for (auto [i, dimSize] : unknownDims) {
61+
// Compute the lower bound for the unknown dimension (i.e. the smallest
62+
// value it could be).
63+
auto lowerBound =
64+
vector::ScalableValueBoundsConstraintSet::computeScalableBound(
65+
dimSize, {}, vscaleRange.vscaleMin, vscaleRange.vscaleMax,
66+
presburger::BoundType::LB);
67+
if (failed(lowerBound))
68+
return failure();
69+
auto boundSize = lowerBound->getSize();
70+
if (failed(boundSize))
71+
return failure();
72+
if (boundSize->scalable) {
73+
// If the lower bound is scalable and >= to the mask dim size then this
74+
// dim is all-true.
75+
if (boundSize->baseSize < maskTypeDimSizes[i])
76+
return failure();
77+
} else {
78+
// If the lower bound is a constant and >= to the _fixed-size_ mask dim
79+
// size then this dim is all-true.
80+
if (maskTypeDimScalableFlags[i])
81+
return failure();
82+
if (boundSize->baseSize < maskTypeDimSizes[i])
83+
return failure();
84+
}
85+
}
86+
87+
// Replace createMaskOp with an all-true constant. This should result in the
88+
// mask being removed in most cases (as xfer ops + vector.mask have folds to
89+
// remove all-true masks).
90+
auto allTrue = rewriter.create<arith::ConstantOp>(
91+
createMaskOp.getLoc(), maskType, DenseElementsAttr::get(maskType, true));
92+
rewriter.replaceAllUsesWith(createMaskOp, allTrue);
93+
return success();
94+
}
95+
96+
} // namespace
97+
98+
namespace mlir::vector {
99+
100+
/// Attempts to replace every `vector.create_mask` in `function` that can be
/// shown to be all-true with an all-true constant created at the start of the
/// function's first block.
///
/// \param rewriter    Rewriter used for all IR changes; its insertion point is
///                    restored before returning.
/// \param function    Function to process. Functions that contain no
///                    `vector.create_mask` (including functions without a
///                    body) are left untouched.
/// \param vscaleRange Range `vector.vscale` can take. If not provided this
///                    transform does nothing.
void eliminateVectorMasks(IRRewriter &rewriter, FunctionOpInterface function,
                          std::optional<VscaleRange> vscaleRange) {
  // TODO: Support fixed-size case. This is less likely to be useful as for
  // fixed-size code dimensions are all static so masks tend to fold away.
  if (!vscaleRange)
    return;

  // Collect the masks up front so that new ops can be created while resolving
  // them without interfering with the walk.
  SmallVector<vector::CreateMaskOp> worklist;
  function.walk([&](vector::CreateMaskOp createMaskOp) {
    worklist.push_back(createMaskOp);
  });

  // Nothing to resolve. This must be checked before touching
  // `function.front()`: a function without a body has no blocks, so asking for
  // its first block would be invalid.
  if (worklist.empty())
    return;

  OpBuilder::InsertionGuard g(rewriter);
  rewriter.setInsertionPointToStart(&function.front());
  for (vector::CreateMaskOp mask : worklist)
    (void)resolveAllTrueCreateMaskOp(rewriter, mask, *vscaleRange);
}
116+
117+
} // namespace mlir::vector
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
// RUN: mlir-opt %s -split-input-file -test-eliminate-vector-masks | FileCheck %s
2+
3+
// This tests a general pattern the vectorizer tends to emit.
4+
5+
// Positive test: each slice taken inside the loop has size `4 x vscale`, and
// the mask is built from `tensor.dim` of that slice for a `vector<[4]xi1>`, so
// the mask is expected to be replaced by an all-true constant.
// CHECK-LABEL: @eliminate_redundant_masks_through_insert_and_extracts
6+
// CHECK: %[[ALL_TRUE_MASK:.*]] = arith.constant dense<true> : vector<[4]xi1>
7+
// CHECK: vector.transfer_read {{.*}} %[[ALL_TRUE_MASK]]
8+
// CHECK: vector.transfer_write {{.*}} %[[ALL_TRUE_MASK]]
9+
func.func @eliminate_redundant_masks_through_insert_and_extracts(%tensor: tensor<1x1000xf32>) {
10+
%c0 = arith.constant 0 : index
11+
%c4 = arith.constant 4 : index
12+
%c1000 = arith.constant 1000 : index
13+
%c0_f32 = arith.constant 0.0 : f32
14+
%vscale = vector.vscale
15+
%c4_vscale = arith.muli %vscale, %c4 : index
16+
%extracted_slice_0 = tensor.extract_slice %tensor[0, 0] [1, %c4_vscale] [1, 1] : tensor<1x1000xf32> to tensor<1x?xf32>
17+
%output_tensor = scf.for %i = %c0 to %c1000 step %c4_vscale iter_args(%arg = %extracted_slice_0) -> tensor<1x?xf32> {
18+
// 1. Extract a slice.
19+
%extracted_slice_1 = tensor.extract_slice %arg[0, 0] [1, %c4_vscale] [1, 1] : tensor<1x?xf32> to tensor<?xf32>
20+
21+
// 2. Create a mask for the slice.
22+
%dim_1 = tensor.dim %extracted_slice_1, %c0 : tensor<?xf32>
23+
%mask = vector.create_mask %dim_1 : vector<[4]xi1>
24+
25+
// 3. Read the slice and do some computation.
26+
%vec = vector.transfer_read %extracted_slice_1[%c0], %c0_f32, %mask {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32>
27+
%new_vec = "test.some_computation"(%vec) : (vector<[4]xf32>) -> (vector<[4]xf32>)
28+
29+
// 4. Write the new value.
30+
%write = vector.transfer_write %new_vec, %extracted_slice_1[%c0], %mask {in_bounds = [true]} : vector<[4]xf32>, tensor<?xf32>
31+
32+
// 5. Insert and yield the new tensor value.
33+
%result = tensor.insert_slice %write into %arg[0, 0] [1, %c4_vscale] [1, 1] : tensor<?xf32> into tensor<1x?xf32>
34+
scf.yield %result : tensor<1x?xf32>
35+
}
36+
"test.some_use"(%output_tensor) : (tensor<1x?xf32>) -> ()
37+
return
38+
}
39+
40+
// -----
41+
42+
// Negative test: the loop-carried slice is re-extracted with size
// `min(1000 - %i, 4 x vscale)`, so the size read by `tensor.dim` is not
// guaranteed to stay `4 x vscale` and the mask must be kept.
// CHECK-LABEL: @negative_extract_slice_size_shrink
43+
// CHECK-NOT: arith.constant dense<true> : vector<[4]xi1>
44+
// CHECK: %[[MASK:.*]] = vector.create_mask
45+
// CHECK: "test.some_use"(%[[MASK]]) : (vector<[4]xi1>) -> ()
46+
func.func @negative_extract_slice_size_shrink(%tensor: tensor<1000xf32>) {
47+
%c0 = arith.constant 0 : index
48+
%c4 = arith.constant 4 : index
49+
%c1000 = arith.constant 1000 : index
50+
%vscale = vector.vscale
51+
%c4_vscale = arith.muli %vscale, %c4 : index
52+
%extracted_slice = tensor.extract_slice %tensor[0] [%c4_vscale] [1] : tensor<1000xf32> to tensor<?xf32>
53+
%slice = scf.for %i = %c0 to %c1000 step %c4_vscale iter_args(%arg = %extracted_slice) -> tensor<?xf32> {
54+
// This mask cannot be eliminated even though looking at the above operations
55+
// it appears `tensor.dim` will always be c4_vscale (so the mask all-true).
56+
%dim = tensor.dim %arg, %c0 : tensor<?xf32>
57+
%mask = vector.create_mask %dim : vector<[4]xi1>
58+
"test.some_use"(%mask) : (vector<[4]xi1>) -> ()
59+
// !!! Here the size of the mask could shrink in the next iteration.
60+
%next_num_els = affine.min affine_map<(d0)[s0] -> (-d0 + 1000, s0)>(%i)[%c4_vscale]
61+
%new_extracted_slice = tensor.extract_slice %tensor[%c4_vscale] [%next_num_els] [1] : tensor<1000xf32> to tensor<?xf32>
62+
scf.yield %new_extracted_slice : tensor<?xf32>
63+
}
64+
"test.some_use"(%slice) : (tensor<?xf32>) -> ()
65+
return
66+
}
67+
68+
// -----
69+
70+
// Negative test: the first mask operand is the constant 1, which is smaller
// than the fixed dim size 2, so the mask is not all-true and must be kept.
// CHECK-LABEL: @negative_constant_dim_not_all_true
71+
// CHECK-NOT: arith.constant dense<true> : vector<2x[4]xi1>
72+
// CHECK: %[[MASK:.*]] = vector.create_mask
73+
// CHECK: "test.some_use"(%[[MASK]]) : (vector<2x[4]xi1>) -> ()
74+
func.func @negative_constant_dim_not_all_true()
75+
{
76+
%c1 = arith.constant 1 : index
77+
%c4 = arith.constant 4 : index
78+
%vscale = vector.vscale
79+
%c4_vscale = arith.muli %vscale, %c4 : index
80+
%mask = vector.create_mask %c1, %c4_vscale : vector<2x[4]xi1>
81+
"test.some_use"(%mask) : (vector<2x[4]xi1>) -> ()
82+
return
83+
}
84+
85+
// -----
86+
87+
// Negative test: the second mask operand is `3 x vscale`, which is smaller
// than the scalable dim size `[4]`, so the mask is not all-true and must be
// kept.
// CHECK-LABEL: @negative_constant_vscale_multiple_not_all_true
88+
// CHECK-NOT: arith.constant dense<true> : vector<2x[4]xi1>
89+
// CHECK: %[[MASK:.*]] = vector.create_mask
90+
// CHECK: "test.some_use"(%[[MASK]]) : (vector<2x[4]xi1>) -> ()
91+
func.func @negative_constant_vscale_multiple_not_all_true() {
92+
%c2 = arith.constant 2 : index
93+
%c3 = arith.constant 3 : index
94+
%vscale = vector.vscale
95+
%c3_vscale = arith.muli %vscale, %c3 : index
96+
%mask = vector.create_mask %c2, %c3_vscale : vector<2x[4]xi1>
97+
"test.some_use"(%mask) : (vector<2x[4]xi1>) -> ()
98+
return
99+
}
100+
101+
// -----
102+
103+
// Negative test: the first mask operand is `tensor.dim` of a static dimension
// of size 2, which is smaller than the fixed mask dim size 3, so the mask must
// be kept.
// CHECK-LABEL: @negative_value_bounds_fixed_dim_not_all_true
104+
// CHECK-NOT: arith.constant dense<true> : vector<3x[4]xi1>
105+
// CHECK: %[[MASK:.*]] = vector.create_mask
106+
// CHECK: "test.some_use"(%[[MASK]]) : (vector<3x[4]xi1>) -> ()
107+
func.func @negative_value_bounds_fixed_dim_not_all_true(%tensor: tensor<2x?xf32>)
108+
{
109+
%c0 = arith.constant 0 : index
110+
%c4 = arith.constant 4 : index
111+
%vscale = vector.vscale
112+
%c4_vscale = arith.muli %vscale, %c4 : index
113+
// This is _very_ simple but since tensor.dim is not a constant, value bounds
114+
// will be used to resolve it.
115+
%dim = tensor.dim %tensor, %c0 : tensor<2x?xf32>
116+
%mask = vector.create_mask %dim, %c4_vscale : vector<3x[4]xi1>
117+
"test.some_use"(%mask) : (vector<3x[4]xi1>) -> ()
118+
return
119+
}
120+
121+
// -----
122+
123+
// Negative test: the second mask operand is `tensor.dim` of a slice whose
// size is `3 x vscale`, which is smaller than the scalable mask dim size
// `[4]`, so the mask must be kept.
// CHECK-LABEL: @negative_value_bounds_scalable_dim_not_all_true
124+
// CHECK-NOT: arith.constant dense<true> : vector<3x[4]xi1>
125+
// CHECK: %[[MASK:.*]] = vector.create_mask
126+
// CHECK: "test.some_use"(%[[MASK]]) : (vector<3x[4]xi1>) -> ()
127+
func.func @negative_value_bounds_scalable_dim_not_all_true(%tensor: tensor<2x100xf32>) {
128+
%c1 = arith.constant 1 : index
129+
%c3 = arith.constant 3 : index
130+
%vscale = vector.vscale
131+
%c3_vscale = arith.muli %vscale, %c3 : index
132+
%slice = tensor.extract_slice %tensor[0, 0] [2, %c3_vscale] [1, 1] : tensor<2x100xf32> to tensor<2x?xf32>
133+
// Another simple example, but value bounds will be used to resolve the tensor.dim.
134+
%dim = tensor.dim %slice, %c1 : tensor<2x?xf32>
135+
%mask = vector.create_mask %c3, %dim : vector<3x[4]xi1>
136+
"test.some_use"(%mask) : (vector<3x[4]xi1>) -> ()
137+
return
138+
}

mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -874,6 +874,38 @@ struct TestVectorLinearize final
874874
return signalPassFailure();
875875
}
876876
};
877+
878+
struct TestEliminateVectorMasks
879+
: public PassWrapper<TestEliminateVectorMasks,
880+
OperationPass<func::FuncOp>> {
881+
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestEliminateVectorMasks)
882+
883+
TestEliminateVectorMasks() = default;
884+
TestEliminateVectorMasks(const TestEliminateVectorMasks &pass)
885+
: PassWrapper(pass) {}
886+
887+
Option<unsigned> vscaleMin{
888+
*this, "vscale-min",
889+
llvm::cl::desc(
890+
"Minimum value `vector.vscale` can possibly be at runtime."),
891+
llvm::cl::init(1)};
892+
893+
Option<unsigned> vscaleMax{
894+
*this, "vscale-max",
895+
llvm::cl::desc(
896+
"Maximum value `vector.vscale` can possibly be at runtime."),
897+
llvm::cl::init(16)};
898+
899+
StringRef getArgument() const final { return "test-eliminate-vector-masks"; }
900+
StringRef getDescription() const final {
901+
return "Test eliminating vector masks";
902+
}
903+
void runOnOperation() override {
904+
IRRewriter rewriter(&getContext());
905+
eliminateVectorMasks(rewriter, getOperation(),
906+
VscaleRange{vscaleMin, vscaleMax});
907+
}
908+
};
877909
} // namespace
878910

879911
namespace mlir {
@@ -920,6 +952,8 @@ void registerTestVectorLowerings() {
920952
PassRegistration<TestVectorEmulateMaskedLoadStore>();
921953

922954
PassRegistration<TestVectorLinearize>();
955+
956+
PassRegistration<TestEliminateVectorMasks>();
923957
}
924958
} // namespace test
925959
} // namespace mlir

0 commit comments

Comments
 (0)