|
6 | 6 |
|
7 | 7 | from vllm import _custom_ops as ops |
8 | 8 | from vllm._custom_C import paged_attention_custom |
9 | | -from vllm.utils import get_max_shared_memory_bytes, is_hip |
| 9 | +from vllm.utils import is_hip |
10 | 10 |
|
11 | 11 | from .allclose_default import get_default_atol, get_default_rtol |
12 | 12 |
|
13 | | -MAX_SEQ_LEN = 32*1024 |
| 13 | +MAX_SEQ_LEN = 32 * 1024 |
14 | 14 | # There may not be enough gpu memory due to large NUM_BLOCKS. |
15 | 15 | # Reduce NUM_BLOCKS when it happens. |
16 | | -NUM_BLOCKS = 128*1024+4321 # Arbitrary values for testing |
| 16 | +NUM_BLOCKS = 128 * 1024 + 4321 # Arbitrary values for testing |
17 | 17 | PARTITION_SIZE = 256 |
18 | | -DTYPES = [torch.bfloat16,torch.half] |
19 | | -NUM_GEN_SEQS = [1,17] # Arbitrary values for testing |
| 18 | +DTYPES = [torch.bfloat16, torch.half] |
| 19 | +NUM_GEN_SEQS = [1, 17] # Arbitrary values for testing |
20 | 20 | NUM_HEADS = [(8 * x, 8) for x in range(1, 17)] # Arbitrary values for testing |
21 | 21 |
|
22 | | -HEAD_SIZES = [64,128] |
23 | | -BLOCK_SIZES = [16,32] |
24 | | -USE_ALIBI = [True,False] |
| 22 | +HEAD_SIZES = [64, 128] |
| 23 | +BLOCK_SIZES = [16, 32] |
| 24 | +USE_ALIBI = [True, False] |
25 | 25 | KV_CACHE_DTYPE = ["auto"] |
26 | 26 | SEEDS = [37] |
27 | 27 | CUDA_DEVICES = [ |
28 | 28 | f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 1) |
29 | 29 | ] |
30 | 30 |
|
| 31 | + |
31 | 32 | def ref_masked_attention( |
32 | 33 | query: torch.Tensor, |
33 | 34 | key: torch.Tensor, |
@@ -279,10 +280,13 @@ def test_paged_attention( |
279 | 280 | # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error, |
280 | 281 | # so we use a relaxed tolerance for the test. |
281 | 282 | atol, rtol = 1e-4, 1e-5 |
282 | | - if dtype == torch.bfloat16: atol, rtol = 2e-4, 1e-5 |
| 283 | + if dtype == torch.bfloat16: |
| 284 | + atol, rtol = 2e-4, 1e-5 |
283 | 285 | if use_alibi: |
284 | | - if dtype == torch.half: atol, rtol = 5e-4, 1e-5 |
285 | | - if dtype == torch.bfloat16: atol, rtol = 1e-3, 1e-5 |
| 286 | + if dtype == torch.half: |
| 287 | + atol, rtol = 5e-4, 1e-5 |
| 288 | + if dtype == torch.bfloat16: |
| 289 | + atol, rtol = 1e-3, 1e-5 |
286 | 290 | if kv_cache_dtype == "fp8": |
287 | 291 | atol, rtol = 1e-2, 1e-5 |
288 | 292 | assert torch.allclose(output, ref_output, atol=atol, rtol=rtol) |
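For reference, the tolerance ladder above can be read as a single selection step. The sketch below is a hypothetical pick_tolerances helper, not part of this test file; it only restates the thresholds from the hunk so the widening from fp16 to bf16, then ALiBi, then fp8 KV cache is easy to follow.

import torch


def pick_tolerances(dtype: torch.dtype, use_alibi: bool,
                    kv_cache_dtype: str) -> tuple[float, float]:
    # Mirrors the tolerance selection in test_paged_attention above.
    atol, rtol = 1e-4, 1e-5
    if dtype == torch.bfloat16:
        atol = 2e-4
    if use_alibi:
        if dtype == torch.half:
            atol = 5e-4
        if dtype == torch.bfloat16:
            atol = 1e-3
    if kv_cache_dtype == "fp8":
        # FP8 KV cache introduces quantization error, so the comparison is relaxed further.
        atol = 1e-2
    return atol, rtol


# Usage: atol, rtol = pick_tolerances(dtype, use_alibi, kv_cache_dtype)
#        assert torch.allclose(output, ref_output, atol=atol, rtol=rtol)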