Skip to content

Commit f8cfb5a

Browse files
authored
Refactor autotune error handling (#595)
1 parent 3d8af25 commit f8cfb5a

File tree

4 files changed

+83
-37
lines changed

4 files changed

+83
-37
lines changed

helion/autotuner/base_search.py

Lines changed: 10 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from math import inf
1212
from multiprocessing import connection
1313
import os
14-
import re
1514
import sys
1615
import time
1716
from typing import TYPE_CHECKING
@@ -21,8 +20,6 @@
2120
if TYPE_CHECKING:
2221
from triton.runtime.jit import JITFunction
2322

24-
from torch._inductor.runtime.triton_compat import OutOfResources
25-
from torch._inductor.runtime.triton_compat import PTXASError
2623
import torch.multiprocessing as mp
2724
from triton.testing import do_bench
2825

@@ -32,6 +29,8 @@
3229
from .config_generation import ConfigGeneration
3330
from .config_generation import FlatConfig
3431
from .logger import LambdaLogger
32+
from .logger import classify_triton_exception
33+
from .logger import format_triton_compile_failure
3534

3635
if TYPE_CHECKING:
3736
from collections.abc import Sequence
@@ -44,20 +43,6 @@
4443
from ..runtime.settings import Settings
4544
from . import ConfigSpec
4645

47-
_expected_errors_regexp: re.Pattern[str] = re.compile(
48-
r"|".join(
49-
map(
50-
re.escape,
51-
[
52-
"[CUDA]: invalid argument", # CUDA Error
53-
"misaligned address", # CUDA Error
54-
"PassManager::run failed", # Triton Error
55-
"illegal memory access", # CUDA Error
56-
],
57-
)
58-
)
59-
)
60-
6146

6247
class BaseAutotuner(abc.ABC):
6348
"""
@@ -143,22 +128,15 @@ def benchmark_function(self, config: Config, fn: CompiledConfig) -> float:
143128
lambda: f"result: {res:.4f}ms (took {t1 - t0:.1f}s + {t2 - t1:.1f}s)",
144129
)
145130
return res # pyright: ignore[reportReturnType]
146-
except OutOfResources:
147-
self.log.debug("Benchmarking failed: OutOfResources")
148-
except PTXASError:
149-
self.log.warning(f"PTXASError compiling config: {config}")
150131
except Exception as e:
151-
msg = str(e)
152-
if not _expected_errors_regexp.search(msg):
132+
action = classify_triton_exception(e)
133+
if action == "raise":
153134
raise exc.TritonError(f"{type(e).__qualname__}: {e}", config) from e
154-
# Surface Triton IR pass failures more prominently for easier bug reports.
155-
if "PassManager::run failed" in msg:
156-
self.log.warning(
157-
f"Triton PassManager::run failed while compiling config: {config}. Error: {e}"
158-
)
135+
if action == "warn":
136+
self.log.warning(format_triton_compile_failure(config, e))
159137
else:
160138
self.log.debug(f"Benchmarking failed: {type(e).__name__}: {e}")
161-
return inf
139+
return inf
162140

163141
def start_precompile_and_check_for_hangs(
164142
self, config: Config, fn: CompiledConfig
@@ -195,7 +173,7 @@ def extract_launcher(
195173
# Should not reach here
196174
raise RuntimeError("Expected _ExtractedLaunchArgs exception")
197175
except _ExtractedLaunchArgs as e:
198-
precompiler = make_precompiler(e.kernel)(*e.args, **e.kwargs)
176+
precompiler = make_precompiler(e.kernel, config)(*e.args, **e.kwargs)
199177
if precompiler is already_compiled:
200178
return PrecompileFuture.skip(self, config, True)
201179
process: mp.Process = ctx.Process(target=precompiler) # pyright: ignore[reportAssignmentType]
@@ -575,8 +553,8 @@ def _mark_complete(self) -> bool:
575553
if not self.started:
576554
self.start()
577555
if not process.is_alive():
578-
self.ok = True
579-
return True
556+
self.ok = process.exitcode == 0
557+
return self.ok
580558
process.terminate()
581559
process.join(10)
582560
msg = f"Timeout after {self.elapsed:.0f}s compiling {self.config}"

helion/autotuner/logger.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,18 @@
22

33
import itertools
44
import logging
5+
import re
56
import sys
67
import time
8+
from typing import TYPE_CHECKING
79
from typing import Callable
10+
from typing import Literal
11+
12+
from torch._inductor.runtime.triton_compat import OutOfResources
13+
from torch._inductor.runtime.triton_compat import PTXASError
14+
15+
if TYPE_CHECKING:
16+
from ..runtime.config import Config
817

918

1019
class LambdaLogger:
@@ -81,3 +90,52 @@ def _maybe_call(fn: Callable[[], str] | str) -> str:
8190
if callable(fn):
8291
return fn()
8392
return fn
93+
94+
95+
def format_triton_compile_failure(config: Config, err: BaseException) -> str:
96+
return (
97+
"Triton compile failed. This likely indicates a bug in Triton. "
98+
"Skipping failing config.\n"
99+
f"Config: {config!r}\n"
100+
f"Error: {type(err).__name__}: {err}"
101+
)
102+
103+
104+
# Common logic to decide how to surface Triton errors
105+
_EXPECTED_TRITON_ERRORS_RE: re.Pattern[str] = re.compile(
106+
"|".join(
107+
map(
108+
re.escape,
109+
[
110+
"[CUDA]: invalid argument", # CUDA Error
111+
"misaligned address", # CUDA Error
112+
"illegal memory access", # CUDA Error
113+
"PassManager::run failed", # Triton Error
114+
],
115+
)
116+
)
117+
)
118+
119+
120+
def classify_triton_exception(err: BaseException) -> Literal["raise", "warn", "debug"]:
121+
"""
122+
Classify a Triton compile/runtime exception during autotuning.
123+
124+
Returns one of:
125+
- "raise": unexpected error, caller should raise
126+
- "warn": notable expected error (e.g., PassManager pipeline failure)
127+
- "debug": benign/expected error; caller can log at debug level
128+
"""
129+
# Known exception types first
130+
if isinstance(err, OutOfResources):
131+
return "debug"
132+
# Different PTXASError classes may be raised from different modules; match by name as well
133+
if isinstance(err, PTXASError) or err.__class__.__name__ == "PTXASError":
134+
return "warn"
135+
136+
msg = str(err)
137+
if "PassManager::run failed" in msg:
138+
return "warn"
139+
if _EXPECTED_TRITON_ERRORS_RE.search(msg):
140+
return "debug"
141+
return "raise"

helion/runtime/precompile_shim.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,23 @@
11
from __future__ import annotations
22

33
import os
4+
import sys
45
from typing import TYPE_CHECKING
56

7+
from ..autotuner.logger import classify_triton_exception
8+
from ..autotuner.logger import format_triton_compile_failure
9+
610
if TYPE_CHECKING:
711
from collections.abc import Callable
812

913
from triton.runtime.jit import JITFunction
1014

15+
from .config import Config
16+
1117

12-
def make_precompiler(fn: JITFunction[object]) -> Callable[..., Callable[[], None]]:
18+
def make_precompiler(
19+
fn: JITFunction[object], config: Config
20+
) -> Callable[..., Callable[[], None]]:
1321
from triton.runtime.jit import find_paths_if
1422
from triton.runtime.jit import get_iterable_path
1523

@@ -48,14 +56,16 @@ def _make_precompiler(*args: object, **kwargs: object) -> Callable[[], None]:
4856
def finish_it() -> None:
4957
src = fn.ASTSource(fn, signature, constexprs, attrs)
5058
# here we update the cache so if this is called in the parent we skip an extra compile
51-
from triton.runtime.errors import PTXASError
5259

5360
try:
5461
kernel_cache[key] = fn.compile(
5562
src, target=target, options=options.__dict__
5663
)
57-
except PTXASError:
58-
return
64+
except Exception as e:
65+
action = classify_triton_exception(e)
66+
if action != "debug":
67+
print(format_triton_compile_failure(config, e), file=sys.stderr)
68+
sys.exit(1)
5969

6070
return finish_it
6171

test/test_autotuner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ def test_random_search(self):
131131
torch.randn([512, 512], device=DEVICE),
132132
)
133133
bound_kernel = examples_matmul.bind(args)
134-
best = RandomSearch(bound_kernel, args, 5).autotune()
134+
best = RandomSearch(bound_kernel, args, 10).autotune()
135135
fn = bound_kernel.compile_config(best)
136136
torch.testing.assert_close(fn(*args), args[0] @ args[1], rtol=1e-2, atol=1e-1)
137137

0 commit comments

Comments (0)