Skip to content

Commit ed0ef95

Browse files
Implemented tiling and fusion path for GPU
1 parent 9978725 commit ed0ef95

File tree

10 files changed

+996
-15
lines changed

10 files changed

+996
-15
lines changed

include/gc/Transforms/Passes.td

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,36 @@ def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
120120
"Call finish() after each kernel launch.">
121121
];
122122
}
123+
124+
def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
125+
let summary = "GPU tiling and fusion path.";
126+
let description = [{
127+
This pass tiles linalg operations and creates two nested scf.forall loops. When converting to gpu.launch,
128+
the inner loop is mapped to the block sizes and the outer loop to the grid sizes. The tile calculation is based
129+
on the GPU device properties, retrieved from the DLTI attributes. If the DLTI attributes are not specified,
130+
the calculation falls back to the pass options.
131+
}];
132+
let options = [
133+
Option<"numEus", "num-eus", "size_t",
134+
/*default=*/"448",
135+
"Number of Execution Units.">,
136+
Option<"numEusPerSlice", "num-eus-per-slice", "size_t",
137+
/*default=*/"8",
138+
"Number of Execution Units per slice.">,
139+
Option<"numThreadsPerEu", "num-threads-per-eu", "size_t",
140+
/*default=*/"8",
141+
"Number of threads per Execution Unit.">,
142+
Option<"localMemSize", "local-mem-size", "size_t",
143+
/*default=*/"131072",
144+
"The size of the local memory, shared across a work-group.">,
145+
Option<"vectorWidth", "vector-width", "size_t",
146+
/*default=*/"512",
147+
"The maximum width of EU's vector registers.">,
148+
Option<"workGroupSize", "work-group-size", "size_t",
149+
/*default=*/"64",
150+
"The maximum workgroup size.">
151+
];
152+
}
123153
#endif // GC_USE_IMEX
124154

125155
def IterativeTilingAndFusion : Pass<"iterative-tiling-and-fusion",

lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -876,8 +876,7 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
876876
{CL_DEVICE_MAX_COMPUTE_UNITS, "num_exec_units"},
877877
{CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL, "num_exec_units_per_slice"},
878878
{CL_DEVICE_NUM_THREADS_PER_EU_INTEL, "num_threads_per_eu"},
879-
// Assuming the cache size is equal to the local mem
880-
{CL_DEVICE_LOCAL_MEM_SIZE, "L1_cache_size_in_bytes"},
879+
{CL_DEVICE_LOCAL_MEM_SIZE, "local_mem_size"},
881880
};
882881

883882
unsigned i = 0;

lib/gc/Transforms/GPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ set_property(GLOBAL APPEND PROPERTY IMEX_LIBS ${IMEX_LIBS})
1313
gc_add_mlir_library(GcGpuPasses
1414
AddContextArg.cpp
1515
AllocsToSLM.cpp
16+
GpuTilingAndFusion.cpp
1617
GpuToGpuOcl.cpp
1718
LinalgToXeGPU.cpp
1819
Pipeline.cpp

0 commit comments

Comments
 (0)