diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index bf9460151ec1..06564969dc77 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -16,4 +16,4 @@ jobs:
     - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
     - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
       with:
-        extra_args: --hook-stage manual
+        extra_args: --all-files --hook-stage manual
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 432bf5ed18db..ede092746c94 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,7 +17,7 @@ repos:
   rev: v2.3.0
   hooks:
   - id: codespell
-    exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
+    exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*|csrc/rocm/.*|csrc/gradlib/.*'
 - repo: https://github.com/PyCQA/isort
   rev: 5.13.2
   hooks:
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 5f78d1352963..7ebc2c0b857e 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -326,6 +326,11 @@ def measure_current_non_torch():
     # Add some extra non-torch memory 256 MiB (simulate NCCL)
     handle2 = lib.cudaMalloc(256 * 1024 * 1024)
 
+    # This is an analytic value and should be exact: the only non-torch
+    # memory increase between the two samples is the 256 MiB allocated above.
+    measured_diff = monitored_values.values[-1] - monitored_values.values[0]
+    assert measured_diff == 256 * 1024 * 1024
+
     # Check that the memory usage is within 5% of the expected values
     # 5% tolerance is caused by cuda runtime.
     # we cannot control cuda runtime in the granularity of bytes,
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index c55f5c02f6d8..45102c655d0b 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -681,10 +681,12 @@ def forward(
                     seq_lens,
                     make_attn_mask=False)  # type: ignore
                 full_scales = (
-                    1.0 / layer._q_scale.item(), 1.0 / layer._k_scale.item(),
-                    1.0 / layer._v_scale.item(), 1.0 / layer._prob_scale.item(),
+                    1.0 / layer._q_scale.item(),
+                    1.0 / layer._k_scale.item(), 1.0 /
+                    layer._v_scale.item(), 1.0 / layer._prob_scale.item(),
                     fp8_out_scale.item()) if (
-                        fp8_out_scale and layer._q_scale and layer._prob_scale
+                        fp8_out_scale and layer._q_scale
+                        and layer._prob_scale
                         and envs.VLLM_USE_ROCM_FP8_FLASH_ATTN) else None
                 out, _ = self.attn_func(
                     query,
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index 4b30ffecd3ec..b2ce1f9e2965 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -36,8 +36,8 @@ def apply_w8a8_block_fp8_linear(
 
 
 def input_to_float8(
-    x: torch.Tensor,
-    dtype: Optional[torch.dtype] = None
+        x: torch.Tensor,
+        dtype: Optional[torch.dtype] = None
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """This function quantizes input values to float8 values "
     "with tensor-wise quantization."""
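
Note on the tests/test_utils.py hunk: the new assertion relies on the non-torch memory delta being exactly the 256 MiB handed to cudaMalloc, since that allocation is the only non-torch change between the two samples. The following is a minimal standalone sketch of the same idea; it is not vLLM's memory_profiling utility, and the current_non_torch_bytes helper and the raw libcudart binding are illustrative assumptions.

import ctypes

import torch


def current_non_torch_bytes() -> int:
    # Device memory in use according to the CUDA driver, minus the bytes
    # held by PyTorch's caching allocator; everything left over is
    # "non-torch" memory (NCCL buffers, raw cudaMalloc scratch, etc.).
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    return (total_bytes - free_bytes) - torch.cuda.memory_reserved()


torch.cuda.init()
# The library name is an assumption; a versioned name such as
# libcudart.so.12 may be required depending on the installation.
cudart = ctypes.CDLL("libcudart.so")

before = current_non_torch_bytes()

# Allocate 256 MiB outside of PyTorch (simulating e.g. NCCL buffers).
ptr = ctypes.c_void_p()
ret = cudart.cudaMalloc(ctypes.byref(ptr), ctypes.c_size_t(256 * 1024 * 1024))
assert ret == 0, "cudaMalloc failed"

after = current_non_torch_bytes()

# The allocation bypassed the torch allocator, so the non-torch delta
# should be exactly the requested 256 MiB (assuming nothing else
# allocates or frees device memory in between).
assert after - before == 256 * 1024 * 1024

cudart.cudaFree(ptr)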