intel · kurapov-peter · Nov 26, 2024 · Oct 16, 2024
diff --git a/include/gc/Transforms/Passes.td b/include/gc/Transforms/Passes.td
@@ -120,6 +120,36 @@ def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
            "Call finish() after each kernel launch.">
     ];
 }
+
+def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> { // Declares the pass: command-line argument "gpu-tiling", anchored on func::FuncOp.
+  let summary = "GPU tiling and fusion path.";
+  let description = [{
+    This pass tiles linalg operations and creates two nested scf.forall loops. When converting to gpu.launch,
+    the inner loop is mapped to the block sizes and the outer - to grid sizes. The tiles calculation is based
+    on the GPU device properties, retrieved from the DLTI attributes. If the DLTI attributes are not specified,
+    defaults to the pass options.
+  }];
+  let options = [ // Each entry: Option<C++ member, CLI flag, C++ type, default, help text>. Per the description, these are the fallbacks used when DLTI attributes are not specified.
+    Option<"numEus", "num-eus", "size_t",
+           /*default=*/"448",
+           "Number of Execution Units.">,
+    Option<"numEusPerSlice", "num-eus-per-slice", "size_t",
+           /*default=*/"8",
+           "Number of Execution Units per slice.">,
+    Option<"numThreadsPerEu", "num-threads-per-eu", "size_t",
+           /*default=*/"8",
+           "Number of threads per Execution Unit.">,
+    Option<"localMemSize", "local-mem-size", "size_t",
+           /*default=*/"131072", // 131072 = 128 * 1024; presumably bytes - TODO confirm the unit.
+           "The size of the local memory, shared across a work-group.">,
+    Option<"vectorWidth", "vector-width", "size_t",
+           /*default=*/"512", // NOTE(review): presumably bits - TODO confirm the unit.
+           "The maximum width of EU's vector registers.">,
+    Option<"workGroupSize", "work-group-size", "size_t",
+           /*default=*/"64",
+           "The maximum workgroup size.">
+    ];
+} // end of GpuTilingAndFusion
 #endif // GC_USE_IMEX
 
 def IterativeTilingAndFusion : Pass<"iterative-tiling-and-fusion",

diff --git a/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp b/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp
@@ -876,8 +876,7 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
         {CL_DEVICE_MAX_COMPUTE_UNITS, "num_exec_units"},
         {CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL, "num_exec_units_per_slice"},
         {CL_DEVICE_NUM_THREADS_PER_EU_INTEL, "num_threads_per_eu"},
-        // Assuming the cache size is equal to the local mem
-        {CL_DEVICE_LOCAL_MEM_SIZE, "L1_cache_size_in_bytes"},
+        {CL_DEVICE_LOCAL_MEM_SIZE, "local_mem_size"},
     };
 
     unsigned i = 0;

diff --git a/lib/gc/Transforms/GPU/CMakeLists.txt b/lib/gc/Transforms/GPU/CMakeLists.txt
@@ -13,6 +13,7 @@ set_property(GLOBAL APPEND PROPERTY IMEX_LIBS ${IMEX_LIBS})
 gc_add_mlir_library(GcGpuPasses
   AddContextArg.cpp
   AllocsToSLM.cpp
+  GpuTilingAndFusion.cpp
   GpuToGpuOcl.cpp
   LinalgToXeGPU.cpp
   Pipeline.cpp