From ad38ffac975dacb41281e87af9e9b3a3a56b347a Mon Sep 17 00:00:00 2001
From: Xiongfei Wei
Date: Sat, 8 Mar 2025 01:48:44 +0000
Subject: [PATCH 1/3] Reduce the size of block_table by getting rid of padding.

Signed-off-by: Xiongfei Wei
---
 vllm/v1/attention/backends/pallas.py |  4 ++--
 vllm/v1/worker/tpu_model_runner.py   | 10 +---------
 2 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py
index bf3992281a73..bbbdf50ac0cc 100644
--- a/vllm/v1/attention/backends/pallas.py
+++ b/vllm/v1/attention/backends/pallas.py
@@ -12,8 +12,8 @@
 from vllm.attention.backends.utils import CommonAttentionState
 
 # These are the 2 tunable parameters of the paged attention Pallas kernel.
-NUM_QUERIES_PER_BLOCK = 16
-NUM_KV_PAGES_PER_BLOCK = 256
+NUM_QUERIES_PER_BLOCK = 32
+NUM_KV_PAGES_PER_BLOCK = 128
 
 
 class PallasAttentionBackend(AttentionBackend):
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index d4ebb3adcf8d..933c80846733 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -141,16 +141,8 @@ def __init__(
                                             device="cpu")
         self.slot_mapping_np = self.slot_mapping_cpu.numpy()
 
-        # self.input_batch.block_table has a shape of [max_num_reqs,
-        # max_num_blocks_per_req]. To reduce the number of recompilation,
-        # we want the block_table.shape[0] to be num_tokens.
-        # To make the block_table to be compatible with the paged attention
-        # kernel, we want the block_table[1] to be multiple of
-        # NUM_KV_PAGES_PER_BLOCK.
-        padded_max_num_blocks_per_req = _get_padded_number(
-            self.max_num_blocks_per_req, NUM_KV_PAGES_PER_BLOCK)
         self.block_table_cpu = torch.zeros(
-            (self.max_num_tokens, padded_max_num_blocks_per_req),
+            (self.max_num_tokens, self.max_num_blocks_per_req),
             dtype=self.input_batch.block_table.get_cpu_tensor().dtype,
             device="cpu")
 

From c881174bd926cbeeb444968e1d945cdb6685d4bd Mon Sep 17 00:00:00 2001
From: Xiongfei Wei
Date: Sat, 8 Mar 2025 02:18:08 +0000
Subject: [PATCH 2/3] remove unused import

Signed-off-by: Xiongfei Wei
---
 vllm/v1/worker/tpu_model_runner.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 933c80846733..b0e62352cd28 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -22,8 +22,7 @@
 from vllm.multimodal.utils import group_mm_inputs_by_modality
 from vllm.sampling_params import SamplingType
 from vllm.utils import LayerBlockType, cdiv, is_pin_memory_available
-from vllm.v1.attention.backends.pallas import (NUM_KV_PAGES_PER_BLOCK,
-                                               NUM_QUERIES_PER_BLOCK,
+from vllm.v1.attention.backends.pallas import (NUM_QUERIES_PER_BLOCK,
                                                PallasAttentionBackend,
                                                PallasMetadata)
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget

From 541afd664d7f100e8dd69f46df929a95ef98208a Mon Sep 17 00:00:00 2001
From: Xiongfei Wei
Date: Sat, 8 Mar 2025 05:13:07 +0000
Subject: [PATCH 3/3] remove other padding

Signed-off-by: Xiongfei Wei
---
 vllm/v1/worker/tpu_model_runner.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index b0e62352cd28..d564a964a37f 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -22,8 +22,7 @@
 from vllm.multimodal.utils import group_mm_inputs_by_modality
 from vllm.sampling_params import SamplingType
 from vllm.utils import LayerBlockType, cdiv, is_pin_memory_available
-from vllm.v1.attention.backends.pallas import (NUM_QUERIES_PER_BLOCK,
-                                               PallasAttentionBackend,
+from vllm.v1.attention.backends.pallas import (PallasAttentionBackend,
                                                PallasMetadata)
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
@@ -76,10 +75,8 @@ def __init__(
         self.block_size = cache_config.block_size
         self.max_model_len = model_config.max_model_len
         self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size)
-        self.max_num_tokens = _get_padded_number(
-            scheduler_config.max_num_batched_tokens, NUM_QUERIES_PER_BLOCK)
-        self.max_num_reqs = _get_padded_number(scheduler_config.max_num_seqs,
-                                               NUM_QUERIES_PER_BLOCK)
+        self.max_num_tokens = scheduler_config.max_num_batched_tokens
+        self.max_num_reqs = scheduler_config.max_num_seqs
 
         # Model-related.
         self.num_attn_layers = model_config.get_num_layers_by_block_type(
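
Taken together, the three patches drop every block_table-related padding. Below is a minimal sketch (not part of the patch) of the resulting shape change for the CPU block table; the configuration values and the rounding helper _get_padded_number are assumptions inferred from the removed code, not vLLM defaults.

def cdiv(a: int, b: int) -> int:
    # Ceiling division, as used for max_num_blocks_per_req.
    return -(-a // b)

def _get_padded_number(x: int, multiple: int) -> int:
    # Round x up to the next multiple (inferred behavior of the removed helper).
    return ((x + multiple - 1) // multiple) * multiple

# Example (assumed) configuration values.
max_model_len = 2048
block_size = 16
max_num_batched_tokens = 512

max_num_blocks_per_req = cdiv(max_model_len, block_size)  # 128

# Before this series: both dimensions were padded
# (old NUM_QUERIES_PER_BLOCK = 16, old NUM_KV_PAGES_PER_BLOCK = 256).
old_shape = (_get_padded_number(max_num_batched_tokens, 16),
             _get_padded_number(max_num_blocks_per_req, 256))

# After this series: scheduler limits and the block count are used directly.
new_shape = (max_num_batched_tokens, max_num_blocks_per_req)

print(old_shape)  # (512, 256)
print(new_shape)  # (512, 128)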