@@ -124,9 +124,10 @@ def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
124
124
def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
125
125
let summary = "GPU tiling and fusion path.";
126
126
let description = [{
127
- This pass tiles linalg operations and creates an inner loop that is mapped to the block sizes, when converting
128
- to gpu.launch. The tiles calculation is based on the GPU device properties, retrieved from the DLTI attributes.
129
- If the DLTI attributes are not specified, defaults to the pass options.
127
+ This pass tiles linalg operations and creates two nested scf.forall loops. When converting to gpu.launch,
128
+ the inner loop is mapped to the block sizes and the outer loop to the grid sizes. The tiles calculation is based
129
+ on the GPU device properties, retrieved from the DLTI attributes. If the DLTI attributes are not specified,
130
+ defaults to the pass options.
130
131
}];
131
132
let options = [
132
133
Option<"numEus", "num-eus", "size_t",
@@ -143,18 +144,7 @@ def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
143
144
"Execution Unit cache size.">,
144
145
Option<"vectorWidth", "vector-width", "size_t",
145
146
/*default=*/"512",
146
- "The maximum width of EU's vector registers.">
147
- ];
148
- }
149
-
150
- def GpuLoopTiling : Pass<"gpu-loop-tiling", "func::FuncOp"> {
151
- let summary = "Create nested parallel loops to be mapped to GPU.";
152
- let description = [{
153
- This pass tiles the loops created by the GpuTilingAndFusion pass and converted to parallel loops. The tiles
154
- calculation is based on the max_work_group_size DLTI attribute. If the attribute is not specified,
155
- defaults to the pass options.
156
- }];
157
- let options = [
147
+ "The maximum width of EU's vector registers.">,
158
148
Option<"workGroupSize", "work-group-size", "size_t",
159
149
/*default=*/"64",
160
150
"The maximum workgroup size.">
0 commit comments