Tune warp-tiling kernel parameters so the warp-tiled matmul runs fast

makslevental · makslevental · commit 8d63ca4042c2 · 2024-04-23T13:33:16.000-05:00
diff --git a/examples/cuda_matmul_opt.py b/examples/cuda_matmul_opt.py
@@ -775,15 +775,16 @@ def prepare_warp_tiled_kernel(ctx: MLIRContext, kernel, M, K, N):
 
     gpu.set_container_module(ctx.module)
 
+    # Settings for A100 (looks like it works for 3070 too?)
     NUM_THREADS = 128
     BN = 128
-    BM = 128
+    BM = 64
     BK = 16
     WN = 64
-    WM = 64
-    WNITER = 4
+    WM = 32
+    WNITER = 1
     TN = 4
-    TM = 8
+    TM = 4
 
     @gpu.module("matmul", ["#nvvm.target"])
     def matmul_mod():
@@ -869,11 +870,11 @@ def run_eval(
 repeats = None
 
 for k in [
-    # sgemm_naive,
-    # sgemm_naive_row_order,
-    # sgemm_coalesce,
-    # sgemm_coalesce_transpose_B,
-    # sgemm_shared_mem_block,
+    sgemm_naive,
+    sgemm_naive_row_order,
+    sgemm_coalesce,
+    sgemm_coalesce_transpose_B,
+    sgemm_shared_mem_block,
 ]:
     print(f"\n{k.__name__}")
     for s in sizes:
@@ -899,9 +900,9 @@ def run_eval(
 
 
 for k in [
-    # sgemm_shared_mem_1d_block_tiling,
-    # sgemm_shared_mem_2d_block_tiling,
-    # sgemm_shared_mem_2d_block_tiling_vectorize,
+    sgemm_shared_mem_1d_block_tiling,
+    sgemm_shared_mem_2d_block_tiling,
+    sgemm_shared_mem_2d_block_tiling_vectorize,
 ]:
     print(f"\n{k.__name__}")
     for s in sizes:
@@ -925,6 +926,7 @@ def run_eval(
                 transpose_B,
             )
 
+print(f"\n{sgemm_warp_tiling.__name__}")
 for s in sizes:
     with (
         mlir_mod_ctx() as ctx,
diff --git a/mlir/extras/ast/canonicalize.py b/mlir/extras/ast/canonicalize.py
@@ -116,7 +116,7 @@ def transform_ast(
         max([l for _, l in line_starts]) - min([l for _, l in line_starts]) + 1
         > n_lines
     ) or (f.__code__.co_firstlineno != min([l for _, l in line_starts])):
-        warnings.warn(
+        logger.debug(
             "something went wrong with the line numbers for the rewritten/canonicalized function"
         )
     f.__code__ = new_f_code_o

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -116,7 +116,7 @@ def transform_ast(` |
| `116` | `116` | `max([l for _, l in line_starts]) - min([l for _, l in line_starts]) + 1` |
| `117` | `117` | `> n_lines` |
| `118` | `118` | `) or (f.__code__.co_firstlineno != min([l for _, l in line_starts])):` |
| `119` | | `- warnings.warn(` |
| | `119` | `+ logger.debug(` |
| `120` | `120` | `"something went wrong with the line numbers for the rewritten/canonicalized function"` |
| `121` | `121` | `)` |
| `122` | `122` | `f.__code__ = new_f_code_o` |