Rebase and rerun tests with EXPECTTEST_ACCEPT=1 to regenerate expected outputs

tianrengao · tianrengao · commit ee4f1d9579f5 · 2025-10-09T15:10:29.000-07:00
diff --git a/examples/matmul.py b/examples/matmul.py
@@ -9,15 +9,16 @@
 # %%
 from __future__ import annotations
 
-from typing import Any, TYPE_CHECKING
-
-import helion
-import helion.language as hl
+from typing import TYPE_CHECKING
+from typing import Any
 
 import torch
-from helion._testing import run_example
 from torch import Tensor
 
+import helion
+from helion._testing import run_example
+import helion.language as hl
+
 if TYPE_CHECKING:
     from collections.abc import Callable
 
diff --git a/test/test_examples.expected b/test/test_examples.expected
@@ -146,7 +146,7 @@ def addmm_bwd(grad_out: Tensor, bias: Tensor, mat1: Tensor, mat2: Tensor, alpha:
     _BLOCK_SIZE_5 = 16
     _BLOCK_SIZE_6 = 16
     _BLOCK_SIZE_7 = 16
-    _launcher(_helion_addmm_bwd, (triton.cdiv(m, _BLOCK_SIZE_0) * triton.cdiv(n, _BLOCK_SIZE_1) + triton.cdiv(m, _BLOCK_SIZE_2) * triton.cdiv(k, _BLOCK_SIZE_3) + triton.cdiv(k, _BLOCK_SIZE_5) * triton.cdiv(n, _BLOCK_SIZE_6),), grad_out, grad_input, mat2, grad_mat1, mat1, grad_mat2, grad_input.stride(0), grad_input.stride(1), grad_mat1.stride(0), grad_mat1.stride(1), grad_mat2.stride(0), grad_mat2.stride(1), grad_out.stride(0), grad_out.stride(1), mat1.stride(0), mat1.stride(1), mat2.stride(0), mat2.stride(1), m, n, beta, k, alpha, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, _BLOCK_SIZE_3, _BLOCK_SIZE_4, _BLOCK_SIZE_5, _BLOCK_SIZE_6, _BLOCK_SIZE_7, num_warps=4, num_stages=3)
+    _launcher(_helion_addmm_bwd, (triton.cdiv(m, _BLOCK_SIZE_0) * triton.cdiv(n, _BLOCK_SIZE_1) + triton.cdiv(m, _BLOCK_SIZE_2) * triton.cdiv(k, _BLOCK_SIZE_3) + triton.cdiv(k, _BLOCK_SIZE_5) * triton.cdiv(n, _BLOCK_SIZE_6),), grad_out, grad_input, mat2, grad_mat1, mat1, grad_mat2, grad_input.stride(0), grad_input.stride(1), grad_mat1.stride(0), grad_mat1.stride(1), grad_mat2.stride(0), grad_mat2.stride(1), grad_out.stride(0), grad_out.stride(1), mat1.stride(0), mat1.stride(1), mat2.stride(0), mat2.stride(1), m, n, beta, k, alpha, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, _BLOCK_SIZE_3, _BLOCK_SIZE_4, _BLOCK_SIZE_5, _BLOCK_SIZE_6, _BLOCK_SIZE_7, num_warps=4, num_stages=2)
     return (grad_input, grad_mat1, grad_mat2)
 
 --- assertExpectedJournal(TestExamples.test_attention_block_pointer)
@@ -3050,7 +3050,7 @@ def matmul_bwd(grad_out: Tensor, mat1: Tensor, mat2: Tensor, *, _launcher=_defau
     _BLOCK_SIZE_3 = 16
     _BLOCK_SIZE_4 = 16
     _BLOCK_SIZE_5 = 16
-    _launcher(_helion_matmul_bwd, (triton.cdiv(m, _BLOCK_SIZE_0) * triton.cdiv(k, _BLOCK_SIZE_1) + triton.cdiv(k, _BLOCK_SIZE_3) * triton.cdiv(n, _BLOCK_SIZE_4),), grad_out, mat2, grad_mat1, mat1, grad_mat2, grad_mat1.stride(0), grad_mat1.stride(1), grad_mat2.stride(0), grad_mat2.stride(1), grad_out.stride(0), grad_out.stride(1), mat1.stride(0), mat1.stride(1), mat2.stride(0), mat2.stride(1), m, k, n, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, _BLOCK_SIZE_3, _BLOCK_SIZE_4, _BLOCK_SIZE_5, num_warps=4, num_stages=3)
+    _launcher(_helion_matmul_bwd, (triton.cdiv(m, _BLOCK_SIZE_0) * triton.cdiv(k, _BLOCK_SIZE_1) + triton.cdiv(k, _BLOCK_SIZE_3) * triton.cdiv(n, _BLOCK_SIZE_4),), grad_out, mat2, grad_mat1, mat1, grad_mat2, grad_mat1.stride(0), grad_mat1.stride(1), grad_mat2.stride(0), grad_mat2.stride(1), grad_out.stride(0), grad_out.stride(1), mat1.stride(0), mat1.stride(1), mat2.stride(0), mat2.stride(1), m, k, n, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, _BLOCK_SIZE_3, _BLOCK_SIZE_4, _BLOCK_SIZE_5, num_warps=4, num_stages=2)
     return (grad_mat1, grad_mat2)
 
 --- assertExpectedJournal(TestExamples.test_matmul_layernorm_dynamic_shapes)
diff --git a/test/test_examples.py b/test/test_examples.py
@@ -2,21 +2,19 @@
 
 import unittest
 
-import helion
+from packaging import version
 import torch
-from helion._testing import (
-    check_example,
-    DEVICE,
-    EXAMPLES_DIR,
-    import_path,
-    RefEagerTestBase,
-    skipIfRefEager,
-    skipIfRocm,
-    skipIfXPU,
-    TestCase,
-)
 
-from packaging import version
+import helion
+from helion._testing import DEVICE
+from helion._testing import EXAMPLES_DIR
+from helion._testing import RefEagerTestBase
+from helion._testing import TestCase
+from helion._testing import check_example
+from helion._testing import import_path
+from helion._testing import skipIfRefEager
+from helion._testing import skipIfRocm
+from helion._testing import skipIfXPU
 
 torch.backends.cuda.matmul.fp32_precision = "tf32"
 torch.backends.cudnn.conv.fp32_precision = "tf32"