support generics (and do closures right)

makslevental · makslevental · commit 2d4dba215f86 · 2024-04-18T13:08:52.000-05:00
diff --git a/examples/cuda_matmul_opt.py b/examples/cuda_matmul_opt.py
@@ -1,25 +1,27 @@
+from __future__ import annotations
 import ast
 import math
 import re
+import time
 
 import cupy as cp
 import mlir.extras.types as T
 import numpy as np
 from cupy.cuda import Module
-from mlir.dialects import math as math_dialect
 
+from mlir.extras.ast.canonicalize import canonicalize
 from mlir.extras.context import (
     mlir_mod_ctx,
     MLIRContext,
 )
-from mlir.extras.dialects.ext import arith, memref, gpu
+from mlir.extras.dialects.ext import arith, memref, gpu, scf
 from mlir.extras.dialects.ext.gpu import (
     block_id,
     thread_id,
     block_dim,
 )
 from mlir.extras.dialects.ext.nvgpu import get_ptx, print_ptx
-from mlir.extras.dialects.ext.scf import range_, yield_
+from mlir.extras.dialects.ext.scf import range_
 from mlir.extras.runtime.passes import Pipeline, run_pipeline
 
 # noinspection PyUnresolvedReferences
@@ -39,26 +41,24 @@ def build_cuda_func(compiled_module, kernel_name="mat_product_kernel"):
 
 
 @gpu.func
-# @canonicalize(using=(arith.canonicalizer, scf.canonicalizer))
+@canonicalize(using=(arith.canonicalizer, scf.canonicalizer))
 def mat_product_kernel[
     M, K, N, dtype
-](A: "T.memref(M, K, dtype)", B: "T.memref(K, N, dtype)", C: "T.memref(M, N, dtype)"):
+](A: T.memref(M, K, dtype), B: T.memref(K, N, dtype), C: T.memref(M, N, dtype)):
     M, K, N, dtype
     x = block_dim.x * block_id.x + thread_id.x
     y = block_dim.y * block_id.y + thread_id.y
 
     one = arith.constant(1.0, type=dtype)
     tmp = arith.constant(0, type=dtype)
     for k, tmp in range_(K, iter_args=[tmp]):
-        # tmp += A[x, k] * B[k, y]
-        # tmp = yield tmp
-        tmp = math_dialect.fma(A[x, k], B[k, y], tmp)
-        tmp = yield_(tmp)
+        tmp += A[x, k] * B[k, y]
+        tmp = yield tmp
     C[x, y] = tmp + one
 
 
 def main(ctx: MLIRContext):
-    M, K, N = 256, 256, 256
+    M, K, N = 2048, 2048, 2048
     BLOCK_SIZE = 32
     dtype = T.f32()
     npy_dtype = np.float32
@@ -69,8 +69,8 @@ def main(ctx: MLIRContext):
     def _():
         mat_product_kernel[M, K, N, dtype].emit()
 
-    print(ctx.module)
-    print(ctx.module.operation.verify())
+    # print(ctx.module)
+    ctx.module.operation.verify()
 
     compiled_module = run_pipeline(
         ctx.module,
@@ -87,8 +87,8 @@ def _():
             },
         ),
     )
-    print(compiled_module)
-    print_ptx(compiled_module)
+    # print(compiled_module)
+    # print_ptx(compiled_module)
 
     A = np.random.randint(0, 10, (M, K)).astype(npy_dtype)
     B = np.random.randint(0, 10, (K, N)).astype(npy_dtype)
@@ -98,12 +98,26 @@ def _():
     dB = cp.asarray(B)
     dC = cp.asarray(C)
 
+    start_gpu = cp.cuda.Event()
+    end_gpu = cp.cuda.Event()
+
     cuda_func = build_cuda_func(compiled_module)
+    start_gpu.record()
+    start_cpu = time.perf_counter()
     cuda_func(
         (math.ceil(M / BLOCK_SIZE), math.ceil(N / BLOCK_SIZE), 1),
         (BLOCK_SIZE, BLOCK_SIZE, 1),
         (dA.data.ptr, dB.data.ptr, dC.data.ptr),
     )
+    end_cpu = time.perf_counter()
+    end_gpu.record()
+    end_gpu.synchronize()
+
+    t_gpu = cp.cuda.get_elapsed_time(start_gpu, end_gpu)
+    t_cpu = end_cpu - start_cpu
+
+    print(f"{t_gpu=}ms")
+    print(f"t_cpu={t_cpu * 1000}ms")  # perf_counter() deltas are seconds; scale to ms
 
     if not cp.array_equal(dC, dA @ dB + 1):
         print(dA @ dB + 1)
diff --git a/mlir/extras/ast/canonicalize.py b/mlir/extras/ast/canonicalize.py
@@ -4,6 +4,7 @@
 import inspect
 import logging
 import types
+import warnings
 from abc import ABC, abstractmethod
 from dis import findlinestarts
 from opcode import opmap
@@ -13,7 +14,7 @@
 import astunparse
 from bytecode import ConcreteBytecode
 
-from ..ast.util import get_module_cst, copy_func
+from ..ast.util import get_module_cst
 
 logger = logging.getLogger(__name__)
 
@@ -59,28 +60,62 @@ def transform_func(f, *transformer_ctors: type(Transformer)):
     return module
 
 
+def insert_closed_vars(f, module):
+    """Nest module's body in a function that first binds f's free variables to None."""
+    # Assigning the names inside the wrapper lets the nested def compile them as free variables.
+    placeholders = [
+        ast.Assign(
+            targets=[ast.Name(name, ctx=ast.Store())],
+            value=ast.Constant(None, kind="None"),
+        )
+        for name in f.__code__.co_freevars
+    ]
+    no_args = ast.arguments(
+        posonlyargs=[], args=[], kwonlyargs=[], kw_defaults=[], defaults=[]
+    )
+    wrapper = ast.FunctionDef(
+        name="enclosing_mod", args=no_args, body=placeholders, decorator_list=[]
+    )
+    wrapper.body.extend(module.body)
+    module.body = [wrapper]
+    return module
+
+
+def find_func_in_code_object(co, func_name):
+    """Search co's constants depth-first for a code object named func_name; None if absent."""
+    for const in co.co_consts:
+        if not isinstance(const, CodeType):
+            continue
+        if const.co_name == func_name:
+            return const
+        if (nested := find_func_in_code_object(const, func_name)) is not None:
+            return nested
+
+
 def transform_ast(
     f, transformers: List[Union[type(Transformer), type(StrictTransformer)]] = None
 ):
     if transformers is None:
         return f
 
     module = transform_func(f, *transformers)
+    if f.__closure__:
+        module = insert_closed_vars(f, module)
     module = ast.fix_missing_locations(module)
     module = ast.increment_lineno(module, f.__code__.co_firstlineno - 1)
     module_code_o = compile(module, f.__code__.co_filename, "exec")
-    new_f_code_o = next(
-        c
-        for c in module_code_o.co_consts
-        if type(c) is CodeType and c.co_name == f.__name__
-    )
+    new_f_code_o = find_func_in_code_object(module_code_o, f.__name__)
     n_lines = len(inspect.getsource(f).splitlines())
     line_starts = list(findlinestarts(new_f_code_o))
-    assert (
+    if (
         max([l for _, l in line_starts]) - min([l for _, l in line_starts]) + 1
-        <= n_lines
-    ), f"something went wrong with the line numbers for the rewritten/canonicalized function"
-    return copy_func(f, new_f_code_o)
+        > n_lines
+    ) or (f.__code__.co_firstlineno != min([l for _, l in line_starts])):
+        warnings.warn(
+            "something went wrong with the line numbers for the rewritten/canonicalized function"
+        )
+    f.__code__ = new_f_code_o
+    return f
 
 
 # this is like this because i couldn't figure out how to subclass
@@ -117,7 +152,8 @@ def patch_bytecode(f, patchers: List[type(BytecodePatcher)] = None):
     for patcher in patchers:
         code = patcher(context).patch_bytecode(code, f)
 
-    return copy_func(f, code.to_code())
+    f.__code__ = code.to_code()
+    return f
 
 
 class Canonicalizer(ABC):
diff --git a/mlir/extras/ast/util.py b/mlir/extras/ast/util.py
@@ -1,7 +1,6 @@
 import ast
-import functools
 import inspect
-import types
+from itertools import dropwhile
 from textwrap import dedent
 
 
@@ -26,8 +25,8 @@ def ast_call(name, args=None, keywords=None):
 
 
 def get_module_cst(f):
-    f_src = dedent(inspect.getsource(f))
-    # tree = cst.parse_module(f_src)
+    lines, _lnum = inspect.getsourcelines(f)
+    f_src = dedent("".join(list(dropwhile(lambda l: l.startswith("@"), lines))))
     tree = ast.parse(f_src)
     assert isinstance(
         tree.body[0], ast.FunctionDef
@@ -43,31 +42,6 @@ def bind(func, instance, as_name=None):
     return bound_method
 
 
-def copy_func(f, new_code):
-    """Based on http://stackoverflow.com/a/6528148/190597 (Glenn Maynard)"""
-    g = types.FunctionType(
-        code=new_code,
-        globals={
-            **f.__globals__,
-            **{
-                fr: f.__closure__[i].cell_contents
-                for i, fr in enumerate(f.__code__.co_freevars)
-            },
-        },
-        name=f.__name__,
-        argdefs=f.__defaults__,
-        # TODO(max): ValueError: foo requires closure of length 0, not 1
-        # closure=f.__closure__ if f.__closure__ is not None else None,
-    )
-    g.__kwdefaults__ = f.__kwdefaults__
-    g.__dict__.update(f.__dict__)
-    g = functools.update_wrapper(g, f)
-
-    if inspect.ismethod(f):
-        g = bind(g, f.__self__)
-    return g
-
-
 def append_hidden_node(node_body, new_node):
     last_statement = node_body[-1]
     new_node = ast.fix_missing_locations(
diff --git a/mlir/extras/dialects/ext/func.py b/mlir/extras/dialects/ext/func.py
@@ -256,20 +256,22 @@ def __call__(self, *call_args):
         return call(self.emit(*call_args), call_args)
 
     def __getitem__(self, item):
-        closure = {
-            k: v
-            for k, v in zip(
-                self.body_builder.__code__.co_freevars, self.body_builder.__closure__
-            )
-            if v.cell_contents in self.body_builder.__type_params__
-        }
-
-        for i, t in enumerate(self.body_builder.__type_params__):
-            if t.__bound__ is not None:
-                v = t.__bound__
-            else:
-                v = item[i]
-            closure[t.__name__].cell_contents = v
+        if self.body_builder.__code__.co_freevars and self.body_builder.__closure__:
+            closure = {
+                k: v
+                for k, v in zip(
+                    self.body_builder.__code__.co_freevars,
+                    self.body_builder.__closure__,
+                )
+                if v.cell_contents in self.body_builder.__type_params__
+            }
+
+            for i, t in enumerate(self.body_builder.__type_params__):
+                if t.__bound__ is not None:
+                    v = t.__bound__
+                else:
+                    v = item[i]
+                closure[t.__name__].cell_contents = v
 
         return self