From 8351696468df7939768205df3088f66e3963c3c1 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu"
Date: Tue, 21 Jan 2025 11:40:23 -0800
Subject: [PATCH 1/3] Update pre-commit.yml

---
 .github/workflows/pre-commit.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index bf9460151ec1..06564969dc77 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -16,4 +16,4 @@ jobs:
     - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
     - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
       with:
-        extra_args: --hook-stage manual
+        extra_args: --all-files --hook-stage manual

From 23b7f11b2ca70a855125feb6064f1722f9310a75 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg
Date: Tue, 21 Jan 2025 19:51:48 +0000
Subject: [PATCH 2/3] Reapplying missing format

---
 tests/test_utils.py                                        | 5 +++++
 vllm/attention/backends/rocm_flash_attn.py                 | 8 +++++---
 .../model_executor/layers/quantization/utils/fp8_utils.py  | 4 ++--
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 5f78d1352963..7ebc2c0b857e 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -326,6 +326,11 @@ def measure_current_non_torch():
         # Add some extra non-torch memory 256 MiB (simulate NCCL)
         handle2 = lib.cudaMalloc(256 * 1024 * 1024)

+        # this is an analytic value, it is exact,
+        # we only have 256 MiB non-torch memory increase
+        measured_diff = monitored_values.values[-1] - monitored_values.values[0]
+        assert measured_diff == 256 * 1024 * 1024
+
         # Check that the memory usage is within 5% of the expected values
         # 5% tolerance is caused by cuda runtime.
         # we cannot control cuda runtime in the granularity of bytes,
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index c55f5c02f6d8..45102c655d0b 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -681,10 +681,12 @@ def forward(
                     seq_lens,
                     make_attn_mask=False)  # type: ignore
             full_scales = (
-                1.0 / layer._q_scale.item(), 1.0 / layer._k_scale.item(),
-                1.0 / layer._v_scale.item(), 1.0 / layer._prob_scale.item(),
+                1.0 / layer._q_scale.item(),
+                1.0 / layer._k_scale.item(), 1.0 /
+                layer._v_scale.item(), 1.0 / layer._prob_scale.item(),
                 fp8_out_scale.item()) if (
-                    fp8_out_scale and layer._q_scale and layer._prob_scale
+                    fp8_out_scale and layer._q_scale
+                    and layer._prob_scale
                     and envs.VLLM_USE_ROCM_FP8_FLASH_ATTN) else None
             out, _ = self.attn_func(
                 query,
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index 4b30ffecd3ec..b2ce1f9e2965 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -36,8 +36,8 @@ def apply_w8a8_block_fp8_linear(


 def input_to_float8(
-    x: torch.Tensor,
-    dtype: Optional[torch.dtype] = None
+        x: torch.Tensor,
+        dtype: Optional[torch.dtype] = None
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """This function quantizes input values to float8 values "
     "with tensor-wise quantization."""

From 78c5b11ee10ecf520b18f8403c289bf054bb25f4 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg
Date: Tue, 21 Jan 2025 20:06:43 +0000
Subject: [PATCH 3/3] New codespell exclude location

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 432bf5ed18db..ede092746c94 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,7 +17,7 @@ repos:
   rev: v2.3.0
   hooks:
   - id: codespell
-    exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
+    exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*|csrc/rocm/.*|csrc/gradlib/.*'
 - repo: https://github.com/PyCQA/isort
   rev: 5.13.2
   hooks: