Commit 1137a05 (parent: 0578e5a)

Use the optimized block sizes after tuning the kernel

Signed-off-by: Xiongfei Wei <[email protected]>

File tree: 1 file changed, +2 −2 lines


vllm/v1/attention/backends/pallas.py

Lines changed: 2 additions & 2 deletions
@@ -12,8 +12,8 @@
 from vllm.attention.backends.utils import CommonAttentionState

 # These are the 2 tunable parameters of the paged attention Pallas kernel.
-NUM_QUERIES_PER_BLOCK = 32
-NUM_KV_PAGES_PER_BLOCK = 128
+NUM_QUERIES_PER_BLOCK = 16
+NUM_KV_PAGES_PER_BLOCK = 256


 class PallasAttentionBackend(AttentionBackend):
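For context, these two constants control how the paged attention Pallas kernel tiles its work: queries are processed in blocks of NUM_QUERIES_PER_BLOCK, and KV-cache pages in groups of NUM_KV_PAGES_PER_BLOCK, so the tuned values trade a smaller query tile for a larger KV tile. The sketch below shows how block sizes like these typically determine a blocked kernel's launch grid; the `grid_shape` helper and the example sizes are illustrative assumptions, not vLLM's actual code.

```python
import math

# Tuned values from this commit (illustrative use only).
NUM_QUERIES_PER_BLOCK = 16
NUM_KV_PAGES_PER_BLOCK = 256


def grid_shape(num_queries: int, num_kv_pages: int) -> tuple[int, int]:
    """Hypothetical helper: how many (query-block, kv-page-block) tiles
    a blocked attention kernel would launch for the given problem size."""
    return (
        math.ceil(num_queries / NUM_QUERIES_PER_BLOCK),
        math.ceil(num_kv_pages / NUM_KV_PAGES_PER_BLOCK),
    )


# e.g. 1024 queries attending over 2048 KV pages
print(grid_shape(1024, 2048))  # -> (64, 8)
```

Halving the query block while doubling the KV-page block keeps the per-tile work comparable but shifts it toward reading more of the KV cache per grid step, which is the kind of trade-off kernel autotuning typically settles.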

0 commit comments