
Commit 0a25374

Migrate Compiler Passes (mlc-ai#1150)

1 parent 2193767 · commit 0a25374
14 files changed: +808 −96 lines

python/mlc_chat/compiler/__init__.py
Lines changed: 1 addition & 0 deletions

@@ -2,6 +2,7 @@
 A compiler for MLC Chat. By default, it is not imported to MLC Chat to avoid unnecessary dependency,
 but users could optionally import it if they want to use the compiler.
 """
+from . import compiler_pass
 from .compile import (  # pylint: disable=redefined-builtin
     CompileArgs,
     OptimizationFlags,
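The new import exists for its side effect: pulling in compiler_pass imports its pipeline module (see the new compiler_pass/__init__.py below), which is presumably where the "mlc_llm" pipeline used by compile.py gets registered. A minimal sketch of the resulting usage contract, under that assumption:

# Importing the package is assumed to be enough to register the passes;
# nothing from compiler_pass is referenced by name afterwards.
import mlc_chat.compiler  # noqa: F401

from tvm import relax

pipeline = relax.get_pipeline("mlc_llm")  # resolvable after the import above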
python/mlc_chat/compiler/compile.py
Lines changed: 18 additions & 88 deletions

@@ -1,63 +1,15 @@
 """Python entrypoint of compilation."""
-import argparse
 import dataclasses
-import logging
 from io import StringIO
 from pathlib import Path
 from typing import Callable
 
-from mlc_chat.compiler.model import Model
-from tvm import IRModule  # pylint: disable=wrong-import-order
-from tvm.target import Target  # pylint: disable=wrong-import-order
+from tvm import IRModule, relax
+from tvm.target import Target
 
+from ..compiler.model import Model
 from ..support.style import bold
-
-logger = logging.getLogger(__name__)
-
-
-@dataclasses.dataclass
-class OptimizationFlags:
-    """Optiization flags"""
-
-    cutlass_attn: bool = True
-    cutlass_norm: bool = True
-    cublas_gemm: bool = False
-    cudagraph: bool = False
-
-    def __repr__(self) -> str:
-        out = StringIO()
-        print(f"cutlass_attn={int(self.cutlass_attn)}", file=out, end="")
-        print(f";cutlass_norm={int(self.cutlass_norm)}", file=out, end="")
-        print(f";cublas_gemm={int(self.cublas_gemm)}", file=out, end="")
-        print(f";cudagraph={int(self.cudagraph)}", file=out, end="")
-        return out.getvalue().rstrip()
-
-    @staticmethod
-    def from_str(source: str) -> "OptimizationFlags":
-        """Parse optimization flags from a string."""
-
-        if source in OPT_FLAG_PRESET:
-            return OPT_FLAG_PRESET[source]
-
-        def boolean(value: str) -> bool:
-            if value == "0":
-                return False
-            if value == "1":
-                return True
-            raise ValueError(f"Invalid boolean value: {value}")
-
-        parser = argparse.ArgumentParser(description="optimization flags")
-        parser.add_argument("--cutlass_attn", type=boolean, default=True)
-        parser.add_argument("--cutlass_norm", type=boolean, default=True)
-        parser.add_argument("--cublas_gemm", type=boolean, default=False)
-        parser.add_argument("--cudagraph", type=boolean, default=False)
-        results = parser.parse_args([f"--{i}" for i in source.split(";") if i])
-        return OptimizationFlags(
-            cutlass_attn=results.cutlass_attn,
-            cutlass_norm=results.cutlass_norm,
-            cublas_gemm=results.cublas_gemm,
-            cudagraph=results.cudagraph,
-        )
+from .flags_optimization import OptimizationFlags
 
 
 @dataclasses.dataclass
@@ -86,6 +38,19 @@ def _echo_args(args: CompileArgs) -> None:
     print(out.getvalue().rstrip())
 
 
+def _compile(args: CompileArgs):
+    model_config = args.model.config.from_file(args.config)
+    model = args.model.model(model_config)
+    mod, named_params = model.export_tvm(
+        spec=model.get_default_spec(),  # type: ignore
+    )
+    with args.target:
+        mod = relax.get_pipeline("mlc_llm")(mod)
+    mod.show(black_format=False)
+    for name, param in named_params:
+        print(f"{name}: {param.shape} {param.dtype}")
+
+
 def compile(  # pylint: disable=too-many-arguments,redefined-builtin
     config: Path,
     quantization,
@@ -101,39 +66,4 @@ def compile(  # pylint: disable=too-many-arguments,redefined-builtin
         config, quantization, model_type, target, opt, build_func, prefix_symbols, output
     )
     _echo_args(args)
-    model_config = args.model.config.from_file(args.config)
-    model = args.model.model(model_config)
-    mod, named_params = model.export_tvm(
-        spec=model.get_default_spec(),  # type: ignore
-    )
-    mod.show(black_format=False)
-    for name, param in named_params:
-        print(f"{name}: {param.shape} {param.dtype}")
-
-
-OPT_FLAG_PRESET = {
-    "O0": OptimizationFlags(
-        cutlass_attn=False,
-        cutlass_norm=False,
-        cublas_gemm=False,
-        cudagraph=False,
-    ),
-    "O1": OptimizationFlags(
-        cutlass_attn=False,
-        cutlass_norm=True,
-        cublas_gemm=False,
-        cudagraph=False,
-    ),
-    "O2": OptimizationFlags(
-        cutlass_attn=True,
-        cutlass_norm=True,
-        cublas_gemm=False,
-        cudagraph=False,
-    ),
-    "O3": OptimizationFlags(
-        cutlass_attn=True,
-        cutlass_norm=True,
-        cublas_gemm=False,
-        cudagraph=True,
-    ),
-}
+    _compile(args)
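Two things change in this file: the exported module is now run through the registered "mlc_llm" Relax pipeline under the build target, and the OptimizationFlags parser with its O0–O3 presets moves out of compile.py into flags_optimization.py. A minimal usage sketch of the relocated flags, assuming the new module keeps the behavior shown in the removed lines (the expected output follows the O2 preset and the __repr__ above):

from mlc_chat.compiler import OptimizationFlags

flags = OptimizationFlags.from_str("O2")
print(flags)  # cutlass_attn=1;cutlass_norm=1;cublas_gemm=0;cudagraph=0

# Flags can also be given "name=0/1" style, ";"-separated; unspecified
# flags keep their defaults.
custom = OptimizationFlags.from_str("cublas_gemm=1;cudagraph=1")
print(custom.cublas_gemm, custom.cudagraph)  # True True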
python/mlc_chat/compiler/compiler_pass/__init__.py
Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+"""Compiler passes used in MLC LLM."""
+from . import pipeline as _pipeline
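pipeline.py itself is one of the 14 changed files but is not captured on this page. A minimal sketch of what the registration plausibly looks like, assuming TVM's standard relax.register_pipeline mechanism so that relax.get_pipeline("mlc_llm") in compile.py resolves; the pass order, the attribute list, and the module file names below are illustrative assumptions, not the commit's code:

import tvm
from tvm import relax

# File names inferred from the class docstrings below; they are assumptions.
from .clean_up_tir_attrs import CleanUpTIRAttrs
from .fuse_decode_matmul_ewise import FuseDecodeMatmulEwise
from .fuse_decode_take import FuseDecodeTake


@relax.register_pipeline("mlc_llm")  # makes relax.get_pipeline("mlc_llm") resolve
def _pipeline():
    return tvm.transform.Sequential(
        [
            FuseDecodeTake(),
            FuseDecodeMatmulEwise(),
            CleanUpTIRAttrs(["some_undesired_attr"]),  # hypothetical attribute list
        ],
        name="mlc_llm",
    )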
python/mlc_chat/compiler/compiler_pass/clean_up_tir_attrs.py (file name inferred from the docstring)
Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
+"""A compiler pass that cleans up undesired TIR attrs."""
+from typing import List
+
+import tvm
+from tvm.ir.module import IRModule
+
+
+@tvm.transform.module_pass(opt_level=0, name="CleanUpTIRAttrs")
+class CleanUpTIRAttrs:  # pylint: disable=too-few-public-methods
+    """A compiler pass that cleans up undesired TIR attrs."""
+
+    def __init__(self, attrs: List[str]):
+        self.attrs = attrs
+
+    def transform_module(
+        self,
+        mod: IRModule,
+        _ctx: tvm.transform.PassContext,
+    ) -> IRModule:
+        """IRModule-level transformation"""
+        for g_var in list(mod.functions):
+            func = mod[g_var]
+            changed = False
+            for attr in self.attrs:
+                if func.attrs is not None and attr in func.attrs:
+                    func = func.without_attr(attr)
+                    changed = True
+                    break
+            if changed:
+                mod[g_var] = func
+        return mod
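Because the class is decorated with @tvm.transform.module_pass, its instances are ordinary TVM passes that can be called on an IRModule directly. A self-contained usage sketch with a hypothetical attribute name (note that, as written, the pass strips at most one listed attribute per function before breaking out of the loop):

import tvm
from tvm.script import tir as T


@tvm.script.ir_module
class Example:
    @T.prim_func
    def main(A: T.Buffer((8,), "float32")):
        T.func_attr({"some_attr": 1})  # hypothetical undesired attribute
        for i in range(8):
            A[i] = T.float32(0)


mod = CleanUpTIRAttrs(attrs=["some_attr"])(Example)
assert "some_attr" not in mod["main"].attrs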
python/mlc_chat/compiler/compiler_pass/fuse_decode_matmul_ewise.py (file name inferred from the docstring)
Lines changed: 81 additions & 0 deletions

@@ -0,0 +1,81 @@
+"""A compiler pass that fuses decode + matmul + elementwise."""
+import tvm
+from tvm import IRModule, relax
+from tvm.relax.dpl.pattern import GlobalVarPattern, TuplePattern, is_op, wildcard
+
+
+@tvm.transform.module_pass(opt_level=0, name="FuseDecodeMatmulEwise")
+class FuseDecodeMatmulEwise:  # pylint: disable=too-few-public-methods
+    """A compiler pass that fuses decode + matmul + elementwise."""
+
+    def transform_module(
+        self,
+        mod: IRModule,
+        _ctx: tvm.transform.PassContext,
+    ) -> IRModule:
+        """IRModule-level transformation"""
+        for n_aux_tensor in [1, 2, 3, 4]:
+            for match_ewise in [0, 1, 2, 6]:
+                if match_ewise == 6 and n_aux_tensor != 4:
+                    continue
+                mod = relax.transform.FuseOpsByPattern(
+                    [
+                        (
+                            "decode_matmul",
+                            *_pattern(match_ewise, n_aux_tensor),
+                        )
+                    ]
+                )(mod)
+        mod = relax.transform.FuseTIR()(mod)
+        return mod
+
+
+def _pattern(match_ewise: int, n_aux_tensor: int):
+    # pylint: disable=invalid-name
+    w_scaled = wildcard()
+    x = wildcard()
+    w = is_op("relax.call_tir")(
+        GlobalVarPattern(),
+        TuplePattern([w_scaled] + [wildcard() for _ in range(n_aux_tensor)]),
+        add_constraint=False,
+    )
+    matmul = is_op("relax.call_tir")(
+        GlobalVarPattern(),
+        TuplePattern([x, w] + [wildcard() for _ in range(match_ewise)]),
+        add_constraint=False,
+    )
+    # pylint: enable=invalid-name
+    annotations = {
+        "w_scaled": w_scaled,
+        "x": x,
+        "w": w,
+        "matmul": matmul,
+    }
+
+    def _check_decoding(ctx: relax.transform.PatternCheckContext) -> bool:
+        call = ctx.annotated_expr["w"]
+        if not isinstance(call, relax.Call):
+            return False
+        g_var = call.args[0]
+        if not isinstance(g_var, relax.GlobalVar):
+            return False
+        return g_var.name_hint.startswith("decode") or g_var.name_hint.startswith("fused_decode")
+
+    def _check_matmul(ctx: relax.transform.PatternCheckContext) -> bool:
+        call = ctx.annotated_expr["matmul"]
+        if not isinstance(call, relax.Call):
+            return False
+        g_var = call.args[0]
+        if not isinstance(g_var, relax.GlobalVar):
+            return False
+        return (
+            g_var.name_hint.startswith("matmul")
+            or g_var.name_hint.startswith("fused_matmul")
+            or g_var.name_hint.startswith("NT_matmul")
+            or g_var.name_hint.startswith("fused_NT_matmul")
+        )
+
+    def _check(ctx: relax.transform.PatternCheckContext) -> bool:
+        return _check_decoding(ctx) and _check_matmul(ctx)
+
+    return matmul, annotations, _check
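The nested loops enumerate the supported kernel shapes: n_aux_tensor in 1–4 covers quantization formats whose decode kernel takes different numbers of auxiliary tensors (scales, zero points, and the like), and match_ewise in {0, 1, 2, 6} covers how many extra operands the fused elementwise epilogue consumes, with the 6-operand case only paired with 4 auxiliary tensors. A hedged application sketch; `mod` is an assumed input, not constructed here:

# `mod` is assumed to be a quantized Relax IRModule whose TIR kernels follow
# the decode*/fused_decode* and matmul*/NT_matmul* naming checked above.
mod = FuseDecodeMatmulEwise()(mod)
print([gv.name_hint for gv in mod.functions])  # expect fused decode_matmul kernels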
python/mlc_chat/compiler/compiler_pass/fuse_decode_take.py (file name inferred from the docstring)
Lines changed: 83 additions & 0 deletions

@@ -0,0 +1,83 @@
+"""A compiler pass that fuses decode + take."""
+import tvm
+from tvm import IRModule, relax, tir
+from tvm.relax.dpl.pattern import (
+    GlobalVarPattern,
+    TuplePattern,
+    is_const,
+    is_op,
+    wildcard,
+)
+
+
+@tvm.transform.module_pass(opt_level=0, name="FuseDecodeTake")
+class FuseDecodeTake:  # pylint: disable=too-few-public-methods
+    """A compiler pass that fuses decode + take."""
+
+    def transform_module(
+        self,
+        mod: IRModule,
+        _ctx: tvm.transform.PassContext,
+    ) -> IRModule:
+        """IRModule-level transformation"""
+        for n_aux_tensor in [2, 3]:
+            for match_tir_vars in [False, True]:
+                mod = relax.transform.FuseOpsByPattern(
+                    [
+                        (
+                            "decode_take",
+                            *_pattern(n_aux_tensor, match_tir_vars),
+                        )
+                    ]
+                )(mod)
+        mod = relax.transform.FuseTIR()(mod)
+        for g_var, func in mod.functions.items():
+            name = g_var.name_hint
+            if isinstance(func, tir.PrimFunc) and (("fused_decode" in name) and ("take" in name)):
+                sch_mod = tvm.IRModule({"main": func})  # do not shadow `mod` here
+                sch = tir.Schedule(sch_mod)
+                sch.compute_inline("decode")
+                mod[g_var] = sch.mod["main"]
+        return mod
+
+
+def _pattern(n_aux_tensor: int, match_tir_vars: bool):
+    decode = is_op("relax.call_tir")(
+        GlobalVarPattern(),
+        TuplePattern([wildcard() for _ in range(n_aux_tensor)]),
+        add_constraint=False,
+    )
+    indices = ~is_const()
+    if match_tir_vars:
+        call_tir_args_take = [
+            GlobalVarPattern(),
+            TuplePattern([decode, indices]),
+            wildcard(),
+        ]
+    else:
+        call_tir_args_take = [
+            GlobalVarPattern(),
+            TuplePattern([decode, indices]),
+        ]
+    take = is_op("relax.call_tir")(
+        *call_tir_args_take,
+        add_constraint=False,
+    )
+    annotations = {
+        "take": take,
+        "decode": decode,
+        "indices": indices,
+    }
+
+    def _check(ctx: relax.transform.PatternCheckContext) -> bool:
+        take = ctx.annotated_expr["take"]
+        decode = ctx.annotated_expr["decode"]
+        if not isinstance(decode, relax.expr.Call):
+            return False
+        if not isinstance(take.args[0], relax.GlobalVar) or not isinstance(
+            decode.args[0], relax.GlobalVar
+        ):
+            return False
+        return "take" in take.args[0].name_hint and "decode" in decode.args[0].name_hint
+
+    return take, annotations, _check
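The pass works in two stages: FuseOpsByPattern and FuseTIR collapse the decode and take call_tir kernels into one PrimFunc, then the tir.Schedule step compute-inlines the "decode" block into its consumer, so dequantization runs only for the rows that take actually gathers (the embedding-lookup case). A hedged application sketch; `mod` is an assumed input, not constructed here:

# `mod` is assumed to contain take(decode(w_q, scale, ...), token_ids)
# expressed as call_tir kernels named decode*/take*.
mod = FuseDecodeTake()(mod)
mod.show()  # expect a fused kernel that decodes only the gathered rows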
