per_block_cast_to_fp8
1 parent 42337d2 commit fd9423a
vllm/utils/deep_gemm.py
@@ -135,7 +135,7 @@ def _align(x: int, y: int) -> int:
 
 
 # Taken from https://github.com/deepseek-ai/DeepGEMM/blob/dd6ed14acbc7445dcef224248a77ab4d22b5f240/deep_gemm/utils/math.py#L38
-# TODO(wentao): optimize this function, using triton or cuda kernel
+@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
 def per_block_cast_to_fp8(
     x: torch.Tensor,
     block_size: list[int] = DEFAULT_BLOCK_SIZE,
@@ -187,4 +187,4 @@ def should_use_deepgemm_for_fp8_linear(output_dtype: torch.dtype,
     "is_deep_gemm_e8m0_used",
     "is_deep_gemm_supported",
     "should_use_deepgemm_for_fp8_linear",
-]
+]
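
The patched function is vLLM's copy of DeepGEMM's reference per-block FP8 quantizer; the commit replaces the TODO asking for a Triton/CUDA kernel with a `torch.compile` decorator so Inductor fuses the quantization ops instead. Below is a minimal, hedged sketch of the pattern, not the repository's implementation: the 128×128 block, the 448.0 `float8_e4m3fn` ceiling, and the 1e-4 amax clamp are assumptions based on the linked DeepGEMM helper; `current_platform.simple_compile_backend` is vLLM-internal, so the sketch falls back to the default backend, and `per_block_cast_to_fp8_sketch` is an illustrative name.

```python
import torch

FP8_E4M3_MAX = 448.0  # largest finite value in torch.float8_e4m3fn (assumption from DeepGEMM)


@torch.compile(dynamic=True)  # the commit passes vLLM's simple_compile_backend here
def per_block_cast_to_fp8_sketch(
    x: torch.Tensor,
    block_size: tuple[int, int] = (128, 128),  # assumed default, per the linked helper
) -> tuple[torch.Tensor, torch.Tensor]:
    """Quantize a 2D tensor to FP8 with one scale per block."""
    bm, bn = block_size
    m, n = x.shape
    # Zero-pad both dims up to a block multiple so the 4D view below is exact.
    padded = x.new_zeros((m + bm - 1) // bm * bm, (n + bn - 1) // bn * bn)
    padded[:m, :n] = x
    # Tile into (blocks_m, bm, blocks_n, bn) and take the abs-max per block.
    tiles = padded.view(padded.shape[0] // bm, bm, padded.shape[1] // bn, bn)
    amax = tiles.abs().float().amax(dim=(1, 3), keepdim=True).clamp_(min=1e-4)
    # Stretch each block to the FP8 range, cast, then undo the tiling/padding.
    q = (tiles * (FP8_E4M3_MAX / amax)).to(torch.float8_e4m3fn)
    scales = (amax / FP8_E4M3_MAX).squeeze(3).squeeze(1)  # (blocks_m, blocks_n)
    return q.view_as(padded)[:m, :n].contiguous(), scales


# Example: q, s = per_block_cast_to_fp8_sketch(torch.randn(2048, 4096, device="cuda"))
```

Passing `dynamic=True` asks the compiler to trace with symbolic shapes up front, so tensors of different sizes reuse one compiled artifact rather than triggering a recompile per shape, which fits a helper that is called across many weight shapes.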