2 changes: 1 addition & 1 deletion .github/workflows/pre-commit.yml
@@ -16,4 +16,4 @@ jobs:
     - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
     - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
       with:
-        extra_args: --hook-stage manual
+        extra_args: --all-files --hook-stage manual
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -17,7 +17,7 @@ repos:
   rev: v2.3.0
   hooks:
   - id: codespell
-    exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
+    exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*|csrc/rocm/.*|csrc/gradlib/.*'
 - repo: https://github.com/PyCQA/isort
   rev: 5.13.2
   hooks:
5 changes: 5 additions & 0 deletions tests/test_utils.py
@@ -326,6 +326,11 @@ def measure_current_non_torch():
     # Add some extra non-torch memory 256 MiB (simulate NCCL)
     handle2 = lib.cudaMalloc(256 * 1024 * 1024)

+    # this is an analytic value, it is exact,
+    # we only have 256 MiB non-torch memory increase
+    measured_diff = monitored_values.values[-1] - monitored_values.values[0]
+    assert measured_diff == 256 * 1024 * 1024
+
     # Check that the memory usage is within 5% of the expected values
     # 5% tolerance is caused by cuda runtime.
     # we cannot control cuda runtime in the granularity of bytes,
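The new assertion works because the increase is analytic: exactly 256 MiB (268,435,456 bytes), so an exact equality check is safe there, unlike the 5%-tolerance checks that follow in the test. A minimal, self-contained sketch of that pattern (not the vLLM test itself; MonitoredValues and non_torch_bytes are illustrative stand-ins for the real monitor and measurement):

# Sketch of the "analytic delta" pattern used by the added assertion: sample a
# counter before and after an allocation of known size, then require the
# difference to match exactly. Names below are illustrative stand-ins.
class MonitoredValues:
    def __init__(self):
        self.values = []

    def sample(self, current_bytes: int):
        self.values.append(current_bytes)


non_torch_bytes = 0                     # stand-in for the measured quantity
monitored = MonitoredValues()
monitored.sample(non_torch_bytes)       # baseline sample

non_torch_bytes += 256 * 1024 * 1024    # simulate the 256 MiB cudaMalloc
monitored.sample(non_torch_bytes)       # post-allocation sample

# 256 MiB == 268,435,456 bytes; the increase is known exactly, so equality
# (rather than a tolerance) is the right check here.
assert monitored.values[-1] - monitored.values[0] == 256 * 1024 * 1024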
8 changes: 5 additions & 3 deletions vllm/attention/backends/rocm_flash_attn.py
@@ -681,10 +681,12 @@ def forward(
                         seq_lens,
                         make_attn_mask=False)  # type: ignore
                 full_scales = (
-                    1.0 / layer._q_scale.item(), 1.0 / layer._k_scale.item(),
-                    1.0 / layer._v_scale.item(), 1.0 / layer._prob_scale.item(),
+                    1.0 / layer._q_scale.item(),
+                    1.0 / layer._k_scale.item(), 1.0 /
+                    layer._v_scale.item(), 1.0 / layer._prob_scale.item(),
                     fp8_out_scale.item()) if (
-                    fp8_out_scale and layer._q_scale and layer._prob_scale
+                    fp8_out_scale and layer._q_scale
+                    and layer._prob_scale
                     and envs.VLLM_USE_ROCM_FP8_FLASH_ATTN) else None
                 out, _ = self.attn_func(
                     query,
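The hunk above only rewraps lines; the expression it touches builds a tuple of reciprocal quantization scales for the ROCm FP8 flash-attention path, or None when that path is disabled. A rough, standalone illustration of how the conditional evaluates, with made-up scale values standing in for layer._q_scale and friends and a plain boolean standing in for envs.VLLM_USE_ROCM_FP8_FLASH_ATTN:

import torch

# Illustrative values only -- the real scales live on the attention layer.
q_scale = torch.tensor(0.5)
k_scale = torch.tensor(0.25)
v_scale = torch.tensor(0.25)
prob_scale = torch.tensor(1.0)
fp8_out_scale = torch.tensor(2.0)
use_rocm_fp8_flash_attn = True

# Same shape as the rewrapped expression: reciprocals of the q/k/v/prob
# scales plus the output scale, or None when any gate is off.
full_scales = (
    1.0 / q_scale.item(), 1.0 / k_scale.item(),
    1.0 / v_scale.item(), 1.0 / prob_scale.item(),
    fp8_out_scale.item()) if (fp8_out_scale is not None and q_scale
                              and prob_scale
                              and use_rocm_fp8_flash_attn) else None

print(full_scales)  # (2.0, 4.0, 4.0, 1.0, 2.0)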
4 changes: 2 additions & 2 deletions vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -36,8 +36,8 @@ def apply_w8a8_block_fp8_linear(


 def input_to_float8(
-    x: torch.Tensor,
-    dtype: Optional[torch.dtype] = None
+        x: torch.Tensor,
+        dtype: Optional[torch.dtype] = None
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """This function quantizes input values to float8 values "
     "with tensor-wise quantization."""
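For context on what input_to_float8 does (the hunk above only re-indents its signature), here is a rough sketch of tensor-wise float8 quantization under the same signature. It is an approximation of the idea, not vLLM's exact implementation, and assumes a float8-capable PyTorch build:

from typing import Optional, Tuple

import torch


def input_to_float8_sketch(
        x: torch.Tensor,
        dtype: Optional[torch.dtype] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Tensor-wise float8 quantization sketch: one scale for the whole tensor."""
    dtype = torch.float8_e4m3fn if dtype is None else dtype
    finfo = torch.finfo(dtype)
    # Scale so the largest magnitude in the tensor maps to the float8 max.
    amax = x.abs().max().clamp(min=1e-12)
    scale = finfo.max / amax
    x_q = (x.float() * scale).clamp(min=finfo.min, max=finfo.max).to(dtype)
    # Return the quantized tensor plus the inverse scale used to dequantize.
    return x_q, scale.float().reciprocal()


x = torch.randn(4, 8)
x_q, inv_scale = input_to_float8_sketch(x)
x_approx = x_q.float() * inv_scale  # roughly reconstructs x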