2 changes: 1 addition & 1 deletion .github/workflows/pre-commit.yml
@@ -16,4 +16,4 @@ jobs:
     - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
     - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
       with:
-        extra_args: --hook-stage manual
+        extra_args: --all-files --hook-stage manual
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -17,7 +17,7 @@ repos:
   rev: v2.3.0
   hooks:
   - id: codespell
-    exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
+    exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*|csrc/rocm/.*|csrc/gradlib/.*'
 - repo: https://github.com/PyCQA/isort
   rev: 5.13.2
   hooks:
5 changes: 5 additions & 0 deletions tests/test_utils.py
@@ -326,6 +326,11 @@ def measure_current_non_torch():
     # Add some extra non-torch memory 256 MiB (simulate NCCL)
     handle2 = lib.cudaMalloc(256 * 1024 * 1024)

+    # this is an analytic value, it is exact,
+    # we only have 256 MiB non-torch memory increase
+    measured_diff = monitored_values.values[-1] - monitored_values.values[0]
+    assert measured_diff == 256 * 1024 * 1024
+
     # Check that the memory usage is within 5% of the expected values
     # 5% tolerance is caused by cuda runtime.
     # we cannot control cuda runtime in the granularity of bytes,
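The new assertion works because the increase is analytic: exactly 256 MiB (268,435,456 bytes), so an exact equality check is safe there, unlike the 5%-tolerance checks that follow in the test. A minimal, self-contained sketch of that pattern (not the vLLM test itself; MonitoredValues and non_torch_bytes are illustrative stand-ins for the real monitor and measurement):

# Sketch of the "analytic delta" pattern used by the added assertion: sample a
# counter before and after an allocation of known size, then require the
# difference to match exactly. Names below are illustrative stand-ins.
class MonitoredValues:
    def __init__(self):
        self.values = []

    def sample(self, current_bytes: int):
        self.values.append(current_bytes)


non_torch_bytes = 0                     # stand-in for the measured quantity
monitored = MonitoredValues()
monitored.sample(non_torch_bytes)       # baseline sample

non_torch_bytes += 256 * 1024 * 1024    # simulate the 256 MiB cudaMalloc
monitored.sample(non_torch_bytes)       # post-allocation sample

# 256 MiB == 268,435,456 bytes; the increase is known exactly, so equality
# (rather than a tolerance) is the right check here.
assert monitored.values[-1] - monitored.values[0] == 256 * 1024 * 1024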
8 changes: 5 additions & 3 deletions vllm/attention/backends/rocm_flash_attn.py
@@ -681,10 +681,12 @@ def forward(
                         seq_lens,
                         make_attn_mask=False)  # type: ignore
                 full_scales = (
-                    1.0 / layer._q_scale.item(), 1.0 / layer._k_scale.item(),
-                    1.0 / layer._v_scale.item(), 1.0 / layer._prob_scale.item(),
+                    1.0 / layer._q_scale.item(),
+                    1.0 / layer._k_scale.item(), 1.0 /
+                    layer._v_scale.item(), 1.0 / layer._prob_scale.item(),
                     fp8_out_scale.item()) if (
-                    fp8_out_scale and layer._q_scale and layer._prob_scale
+                    fp8_out_scale and layer._q_scale
+                    and layer._prob_scale
                     and envs.VLLM_USE_ROCM_FP8_FLASH_ATTN) else None
                 out, _ = self.attn_func(
                     query,
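The hunk above only rewraps lines; the expression it touches builds a tuple of reciprocal quantization scales for the ROCm FP8 flash-attention path, or None when that path is disabled. A rough, standalone illustration of how the conditional evaluates, with made-up scale values standing in for layer._q_scale and friends and a plain boolean standing in for envs.VLLM_USE_ROCM_FP8_FLASH_ATTN:

import torch

# Illustrative values only -- the real scales live on the attention layer.
q_scale = torch.tensor(0.5)
k_scale = torch.tensor(0.25)
v_scale = torch.tensor(0.25)
prob_scale = torch.tensor(1.0)
fp8_out_scale = torch.tensor(2.0)
use_rocm_fp8_flash_attn = True

# Same shape as the rewrapped expression: reciprocals of the q/k/v/prob
# scales plus the output scale, or None when any gate is off.
full_scales = (
    1.0 / q_scale.item(), 1.0 / k_scale.item(),
    1.0 / v_scale.item(), 1.0 / prob_scale.item(),
    fp8_out_scale.item()) if (fp8_out_scale is not None and q_scale
                              and prob_scale
                              and use_rocm_fp8_flash_attn) else None

print(full_scales)  # (2.0, 4.0, 4.0, 1.0, 2.0)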
4 changes: 2 additions & 2 deletions vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -36,8 +36,8 @@ def apply_w8a8_block_fp8_linear(


 def input_to_float8(
-    x: torch.Tensor,
-    dtype: Optional[torch.dtype] = None
+        x: torch.Tensor,
+        dtype: Optional[torch.dtype] = None
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """This function quantizes input values to float8 values "
     "with tensor-wise quantization."""
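For context on what input_to_float8 does (the hunk above only re-indents its signature), here is a rough sketch of tensor-wise float8 quantization under the same signature. It is an approximation of the idea, not vLLM's exact implementation, and assumes a float8-capable PyTorch build:

from typing import Optional, Tuple

import torch


def input_to_float8_sketch(
        x: torch.Tensor,
        dtype: Optional[torch.dtype] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Tensor-wise float8 quantization sketch: one scale for the whole tensor."""
    dtype = torch.float8_e4m3fn if dtype is None else dtype
    finfo = torch.finfo(dtype)
    # Scale so the largest magnitude in the tensor maps to the float8 max.
    amax = x.abs().max().clamp(min=1e-12)
    scale = finfo.max / amax
    x_q = (x.float() * scale).clamp(min=finfo.min, max=finfo.max).to(dtype)
    # Return the quantized tensor plus the inverse scale used to dequantize.
    return x_q, scale.float().reciprocal()


x = torch.randn(4, 8)
x_q, inv_scale = input_to_float8_sketch(x)
x_approx = x_q.float() * inv_scale  # roughly reconstructs x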