
Commit 76100f9

pianpwk authored and Silv3S committed
[DebugMode] record triton kernels, run-to-run determinism checks (pytorch#167028)
Following up on pytorch#166348, this extends DebugMode to capture inductor triton kernels at runtime, and adds an API for checking run-to-run determinism based on tensor hashes. The workflow looks something like:

```python
# do 1st run with hashes, get logs
with DebugMode() as debug_mode, DebugMode.log_tensor_hashes():
    compiled_model(*inputs)
logs1 = debug_mode.logs

# do 2nd run
with DebugMode() as debug_mode, DebugMode.log_tensor_hashes():
    compiled_model(*inputs)
logs2 = debug_mode.logs

# returns list of calls w/ mismatched outputs
mismatches = DebugMode.check_hash_mismatches(logs1, logs2)
```

Example dump from a smaller version of @drisspg's FlexAttention fwd+bwd determinism tests [script](https://gist.github.com/pianpwk/f65cc63811d12853709dcc77d7eb69f1) (without forced reduction order):

```
cfg: TestConfig(name='Standard', B=2, Hq=32, Hkv=32, Q=2048, KV=2048, Dqk=128, Dv=128)
DETERMINISM: fwd: True, bwd_q: False, bwd_k: False, bwd_v: True

$$$ DEBUG MODE DUMP $$$  (this is what the logs look like)

[triton] triton_tem_fused_0(arg_Q=t: bf16[2, 32, 2048, 128], arg_K=t: bf16[2, 32, 2048, 128], arg_V=t: bf16[2, 32, 2048, 128], arg_LSE=t: f32[2, 32, 2048], arg_MAX=t: f32[2, 32, 2048], arg_KV_NUM_BLKS=t: i32[2, 32, 16], arg_KV_IDX=t: i32[2, 32, 16, 16], arg_FULL_KV_NUM_BLKS=t: i32[2, 32, 16], arg_FULL_KV_IDX=t: i32[2, 32, 16, 16], out_ptr0=t: bf16[2, 32, 2048, 128])
  # post-kernel hashes: {arg_Q: 13385916.068706088, arg_K: 13389356.409105342, arg_V: 13384993.48412523, arg_LSE: 1347168.9026973695, arg_MAX: 81775.3811062593, arg_KV_NUM_BLKS: 1024.0, arg_KV_IDX: 122880.0, arg_FULL_KV_NUM_BLKS: 7680.0, arg_FULL_KV_IDX: 122880.0, out_ptr0: 924917.7918248245}

[triton] triton_per_fused_zeros_0(in_ptr0=t: bf16[2, 32, 2048, 128], in_ptr1=t: bf16[2, 32, 2048, 128], out_ptr1=t: f32[2, 32, 2048], xnumel=131072, r0_numel=128)
  # post-kernel hashes: {in_ptr0: 924917.7918248245, in_ptr1: 13389213.797377996, out_ptr1: 81775.38106592931}

[triton] triton_tem_fused_zeros_1(arg_Q=t: bf16[2, 32, 2048, 128], arg_K=t: bf16[2, 32, 2048, 128], arg_V=t: bf16[2, 32, 2048, 128], arg_LSE=t: f32[2, 32, 2048], arg_DELTA=t: f32[2, 32, 2048], arg_DO=t: bf16[2, 32, 2048, 128], arg_DQ=t: bf16[2, 32, 2048, 128], arg_DV=t: bf16[2, 32, 2048, 128], arg_KV_NUM_BLKS=t: i32[2, 32, 16], arg_KV_IDX=t: i32[2, 32, 16, 16], arg_Q_NUM_BLKS=t: i32[2, 32, 16], arg_Q_IDX=t: i32[2, 32, 16, 16], arg_FULL_KV_NUM_BLKS=t: i32[2, 32, 16], arg_FULL_KV_IDX=t: i32[2, 32, 16, 16], arg_FULL_Q_NUM_BLKS=t: i32[2, 32, 16], arg_FULL_Q_IDX=t: i32[2, 32, 16, 16], out_ptr0=t: bf16[2, 32, 2048, 128])
  # post-kernel hashes: {arg_Q: 13385916.068706088, arg_K: 13389356.409105342, arg_V: 13384993.48412523, arg_LSE: 1347168.9026973695, arg_DELTA: 81775.38106592931, arg_DO: 13389213.797377996, arg_DQ: 874474.8084187683, arg_DV: 727742.3138379117, arg_KV_NUM_BLKS: 1024.0, arg_KV_IDX: 122880.0, arg_Q_NUM_BLKS: 1024.0, arg_Q_IDX: 122880.0, arg_FULL_KV_NUM_BLKS: 7680.0, arg_FULL_KV_IDX: 122880.0, arg_FULL_Q_NUM_BLKS: 7680.0, arg_FULL_Q_IDX: 122880.0, out_ptr0: 700542.3431890717}

$$$ MISMATCHES $$$

mismatch: {'call_type': 'triton kernel', 'call': 'triton_tem_fused_0', 'arg_name': 'arg_MAX', 'pytree_path': None, 'hash1': 0.0, 'hash2': 81775.3811062593, 'rel_diff': 1.0, 'is_input_hash': False}  # I guess this one is misleading? not sure if I'm doing something wrong with waiting for kernel results
mismatch: {'call_type': 'triton kernel', 'call': 'triton_per_fused_zeros_0', 'arg_name': 'out_ptr1', 'pytree_path': None, 'hash1': 81775.3811062593, 'hash2': 81775.38106592931, 'rel_diff': 4.931801261646669e-10, 'is_input_hash': False}
mismatch: {'call_type': 'triton kernel', 'call': 'triton_tem_fused_zeros_1', 'arg_name': 'arg_DELTA', 'pytree_path': None, 'hash1': 81775.3811062593, 'hash2': 81775.38106592931, 'rel_diff': 4.931801261646669e-10, 'is_input_hash': False}
mismatch: {'call_type': 'triton kernel', 'call': 'triton_tem_fused_zeros_1', 'arg_name': 'arg_DQ', 'pytree_path': None, 'hash1': 874474.8097136207, 'hash2': 874474.8084187683, 'rel_diff': 1.480720012120795e-09, 'is_input_hash': False}
mismatch: {'call_type': 'triton kernel', 'call': 'triton_tem_fused_zeros_1', 'arg_name': 'out_ptr0', 'pytree_path': None, 'hash1': 700542.3488049245, 'hash2': 700542.3431890717, 'rel_diff': 8.016435812581196e-09, 'is_input_hash': False}
```

note: the current hash implementation is basically a tensor norm, so tensor closeness implies hash closeness. This is likely to change soon, e.g. maybe to `torch.hash_tensor` (pytorch#154149) by default.

Sample paste diff between log dumps from 2 runs:

<img width="1665" height="445" alt="Screenshot 2025-11-05 at 11 27 24 PM" src="https://github.com/user-attachments/assets/41402e37-f50b-4a9e-a17c-bb98b5917076" />

In another case, running this for FSDP2 on Llama3-8B helped narrow down a divergence between aot_eager and inductor to inductor's forward RMSNorm kernels: P2027003180

Pull Request resolved: pytorch#167028
Approved by: https://github.com/v0i0
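For intuition on the "tensor closeness implies hash closeness" point, here is a minimal sketch of a norm-style hash; the helper `norm_hash` is hypothetical, not DebugMode's actual implementation:

```python
import torch

def norm_hash(t: torch.Tensor) -> float:
    # Norm-style "hash": numerically close tensors map to close values,
    # so a small rel_diff between runs indicates small numeric drift
    # rather than a structural mismatch.
    return t.to(torch.float64).abs().sum().item()

x = torch.randn(64, 64)
y = x + 1e-9 * torch.randn(64, 64)  # tiny run-to-run perturbation
h1, h2 = norm_hash(x), norm_hash(y)
print(abs(h1 - h2) / max(abs(h1), abs(h2)))  # tiny rel_diff, like the dumps above
```

A true hash (e.g. `torch.hash_tensor`) would instead change completely on any bitwise difference, which is stricter for determinism checks but loses the "how far apart" signal.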
1 parent 4c5e920 · commit 76100f9

File tree: 4 files changed, +574 −32 lines changed


test/distributed/tensor/debug/test_debug_mode.py

Lines changed: 113 additions & 1 deletion
```diff
@@ -1,6 +1,7 @@
 # Owner(s): ["oncall: distributed"]

 import contextlib
+import unittest

 import torch
 import torch.distributed as dist
@@ -23,8 +24,15 @@
     TestCase,
 )
 from torch.testing._internal.distributed.fake_pg import FakeStore
-from torch.utils._debug_mode import _OpCall, _RedistributeCall, DebugMode
+from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
+from torch.utils._debug_mode import (
+    _OpCall,
+    _RedistributeCall,
+    _TritonKernelCall,
+    DebugMode,
+)
 from torch.utils._python_dispatch import TorchDispatchMode
+from torch.utils._triton import has_triton_package


 @requires_cuda
@@ -434,6 +442,110 @@ def forward(self, x):
         ][-1]
         self.assertTrue("self.l2(self.l1(x))" in sum_op.fwd_stack_trace)

+    @unittest.skipIf(not HAS_GPU, "requires GPU")
+    @unittest.skipIf(not has_triton_package(), "requires triton")
+    def test_triton_kernel_logs(self):
+        import triton
+
+        from torch.testing._internal.triton_utils import add_kernel_autotuned
+
+        def call_triton(x, y):
+            output = torch.zeros_like(x)
+            n_elements = output.numel()
+            grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)  # noqa: E731
+            add_kernel_autotuned[grid](x, y, output, n_elements)
+            return output
+
+        x = torch.randn(128, device=GPU_TYPE)
+        y = torch.randn(128, device=GPU_TYPE)
+
+        with DebugMode() as debug_mode:
+            torch.compile(call_triton)(x, y)
+
+        triton_calls = [
+            op for op in debug_mode.operators if isinstance(op, _TritonKernelCall)
+        ]
+        self.assertGreater(len(triton_calls), 0)
+        self.assertIn("[triton]", triton_calls[0].render([]))
+
+    def test_check_hash_mismatches(self):
+        x = torch.randn(64, 64, device=GPU_TYPE)
+        x_different = torch.randn(64, 64, device=GPU_TYPE)
+
+        # Identical runs should have no mismatches
+        with DebugMode() as dm1, DebugMode.log_tensor_hashes():
+            x.sin().sum()
+        with DebugMode() as dm2, DebugMode.log_tensor_hashes():
+            x.sin().sum()
+        mismatches = DebugMode.check_hash_mismatches(dm1.logs, dm2.logs)
+        self.assertEqual(len(mismatches), 0)
+
+        # Different inputs should produce hash mismatches
+        with DebugMode() as dm3, DebugMode.log_tensor_hashes():
+            x_different.sin().sum()
+
+        # Check that mismatches are detected
+        mismatches = DebugMode.check_hash_mismatches(dm1.logs, dm3.logs)
+        self.assertEqual(len(mismatches), 2)
+        self.assertEqual(
+            [call["call"] for call in mismatches], ["aten::sin", "aten::sum"]
+        )
+
+    @unittest.skipIf(not HAS_GPU, "requires GPU")
+    @unittest.skipIf(not has_triton_package(), "requires triton")
+    def test_check_triton_hash_mismatches(self):
+        import triton
+
+        from torch.testing._internal.triton_utils import add_kernel_autotuned
+
+        def call_triton(x, y):
+            output = torch.zeros_like(x)
+            n_elements = output.numel()
+            grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)  # noqa: E731
+            add_kernel_autotuned[grid](x, y, output, n_elements)
+            return output
+
+        a = torch.randn(128, device=GPU_TYPE)
+        b = torch.randn(128, device=GPU_TYPE)
+        c = torch.randn(128, device=GPU_TYPE)
+
+        # Run with hash logging to verify triton kernels can be hashed
+        with DebugMode() as dm_t1, DebugMode.log_tensor_hashes(hash_inputs=True):
+            torch.compile(call_triton)(a, b)
+
+        # Different inputs should have different hashes in triton kernels
+        with DebugMode() as dm_t2, DebugMode.log_tensor_hashes(hash_inputs=True):
+            torch.compile(call_triton)(a, c)
+
+        # Compare triton kernel hashes
+        mismatches = DebugMode.check_hash_mismatches(
+            dm_t1.logs, dm_t2.logs, compare_inputs=True
+        )
+        triton_mismatches = [m for m in mismatches if m["call_type"] == "triton kernel"]
+        self.assertGreater(len(triton_mismatches), 0)
+
+        # check both input & output hash mismatches are detected
+        self.assertGreater(len([m for m in triton_mismatches if m["is_input_hash"]]), 0)
+        self.assertGreater(
+            len([m for m in triton_mismatches if not m["is_input_hash"]]), 0
+        )
+
+    def test_check_structure_mismatches(self):
+        x = torch.randn(32, 32, device=self.device_type)
+
+        with DebugMode() as dm1, DebugMode.log_tensor_hashes():
+            x.sin()
+        with DebugMode() as dm2, DebugMode.log_tensor_hashes():
+            x.cos()
+        with DebugMode() as dm3, DebugMode.log_tensor_hashes():
+            x.sin().cos()
+
+        with self.assertRaisesRegex(ValueError, "Operators don't match"):
+            DebugMode.check_hash_mismatches(dm1.logs, dm2.logs)
+
+        with self.assertRaisesRegex(ValueError, "Log lengths don't match"):
+            DebugMode.check_hash_mismatches(dm1.logs, dm3.logs)
+
     def test_pretty_print_dtensor_make_fx(self):
         mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
```

torch/_inductor/runtime/benchmarking.py

Lines changed: 9 additions & 6 deletions
```diff
@@ -12,6 +12,7 @@
 import torch.utils._pytree as pytree
 from torch._dynamo.utils import counters, dynamo_timed
 from torch._inductor.config import use_experimental_benchmarker
+from torch.utils._debug_mode import DebugMode


 logger = torch._logging.getArtifactLogger(__name__, "benchmarking")
@@ -189,12 +190,14 @@ def benchmark(
         else:
             _callable = lambda: fn(*fn_args, **fn_kwargs)  # noqa: E731

-        if inferred_device == torch.device("cpu"):
-            return self.benchmark_cpu(_callable, **kwargs)
-        # TODO(nmacchioni): For non-CPU functions we default to using the GPU-specific benchmarking
-        # implementation which was written specifically with CUDA devices in mind, we may want to
-        # explore alternate implementations for other device types.
-        return self.benchmark_gpu(_callable, **kwargs)
+        # Surfacing all kernels during autotuning is super noisy; filtering these out.
+        with DebugMode._benchmarking_inductor():
+            if inferred_device == torch.device("cpu"):
+                return self.benchmark_cpu(_callable, **kwargs)
+            # TODO(nmacchioni): For non-CPU functions we default to using the GPU-specific benchmarking
+            # implementation which was written specifically with CUDA devices in mind, we may want to
+            # explore alternate implementations for other device types.
+            return self.benchmark_gpu(_callable, **kwargs)

     @time_and_count
     def benchmark_cpu(
```

torch/_inductor/runtime/triton_heuristics.py

Lines changed: 18 additions & 2 deletions
```diff
@@ -25,6 +25,7 @@
 from torch._environment import is_fbcode
 from torch._inductor import metrics
 from torch._prims_common import compute_required_storage_length
+from torch.utils._debug_mode import get_active_debug_mode
 from torch.utils._ordered_set import OrderedSet

 from ..triton_bundler import TritonBundler
@@ -1337,6 +1338,17 @@ def run(
         benchmark_run=False,
         **kwargs,
     ):  # type:ignore[override]
+        """Launch triton kernel call and return result."""
+        debug_mode = get_active_debug_mode()
+        debug_call = None
+        if debug_mode:
+            arg_names = list(self.triton_meta.get("signature", {}).keys())
+            kernel_kwargs = dict(zip(arg_names, args))
+            kernel_kwargs.update(kwargs)
+            debug_call = debug_mode.record_triton_kernel(
+                kernel_name=self.fn.__name__, kwargs=kernel_kwargs
+            )
+
         if hasattr(triton, "set_allocator"):

             def alloc_fn(size: int, align: int, stream: int | None):
@@ -1392,18 +1404,22 @@ def alloc_fn(size: int, align: int, stream: int | None):
                 args_without_constexprs,
                 profiler_kwargs,
             ):
-                return launcher(
+                result = launcher(
                     *args,
                     **kwargs,
                     stream=stream,
                 )
         else:
-            return launcher(
+            result = launcher(
                 *args,
                 **kwargs,
                 stream=stream,
             )

+        if debug_call:
+            debug_call.finalize(self.get_device_interface())
+        return result
+
     def _interpret_args_grid(
         self, args: tuple[Any, ...], cfg: Config
     ) -> tuple[tuple[Any, ...], tuple[int, int, int]]:
```
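The record-then-finalize split in `run()` matters because triton launches are asynchronous: output buffers only hold their final values after the kernel completes, so hashing is deferred until after a device sync. A rough sketch of that pattern; `KernelCall` and its fields are illustrative, not the actual `_TritonKernelCall`:

```python
import torch

class KernelCall:
    def __init__(self, name: str, kwargs: dict):
        self.name = name
        self.kwargs = kwargs  # tensor args captured by reference at launch time
        self.post_hashes = None

    def finalize(self, synchronize):
        # Launches are async: synchronize before reading output buffers,
        # otherwise we might hash memory the kernel hasn't finished writing.
        synchronize()
        self.post_hashes = {
            k: v.to(torch.float64).abs().sum().item()
            for k, v in self.kwargs.items()
            if isinstance(v, torch.Tensor)
        }

# usage: record before launch, finalize after
# call = KernelCall("triton_tem_fused_0", kernel_kwargs)
# launcher(*args, **kwargs, stream=stream)
# call.finalize(torch.cuda.synchronize)
```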
