From 62f92cce1332f2d2878fc58b74ccf261ee12ffdb Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Mon, 12 May 2025 08:37:43 -0700
Subject: [PATCH] Count prefix cache queries and hits in tokens instead of blocks

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 vllm/v1/core/kv_cache_manager.py | 12 ++++++------
 vllm/v1/metrics/loggers.py       |  4 ++--
 vllm/v1/metrics/stats.py         |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index ad8468a89dc5..aa0449eea8f1 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -149,11 +149,15 @@ def get_computed_blocks(self,
 
         computed_blocks = (
             self.single_type_manager.find_longest_cache_hit(block_hashes))
+        # NOTE(woosuk): Since incomplete blocks are not eligible for
+        # sharing, `num_computed_tokens` is always a multiple of
+        # `block_size`.
+        num_computed_tokens = len(computed_blocks) * self.block_size
 
         if self.log_stats:
             assert self.prefix_cache_stats is not None
-            self.prefix_cache_stats.queries += len(block_hashes)
-            self.prefix_cache_stats.hits += len(computed_blocks)
+            self.prefix_cache_stats.queries += request.num_tokens
+            self.prefix_cache_stats.hits += num_computed_tokens
 
         if last_block_hash is not None:
             # Add back the last block hash if it was removed.
@@ -161,10 +165,6 @@ def get_computed_blocks(self,
             # we shouldn't modify it directly.
             block_hashes.append(last_block_hash)
 
-        # NOTE(woosuk): Since incomplete blocks are not eligible for
-        # sharing, `num_computed_tokens` is always a multiple of
-        # `block_size`.
-        num_computed_tokens = len(computed_blocks) * self.block_size
         return KVCacheBlocks(computed_blocks), num_computed_tokens
 
     def allocate_slots(
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 7455f1813cd7..6ee40850beb1 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -183,13 +183,13 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
         self.counter_gpu_prefix_cache_queries = prometheus_client.Counter(
             name="vllm:gpu_prefix_cache_queries",
             documentation=
-            "GPU prefix cache queries, in terms of number of queried blocks.",
+            "GPU prefix cache queries, in terms of number of queried tokens.",
             labelnames=labelnames).labels(*labelvalues)
 
         self.counter_gpu_prefix_cache_hits = prometheus_client.Counter(
             name="vllm:gpu_prefix_cache_hits",
             documentation=
-            "GPU prefix cache hits, in terms of number of cached blocks.",
+            "GPU prefix cache hits, in terms of number of cached tokens.",
             labelnames=labelnames).labels(*labelvalues)
 
         #
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index fd949264885b..8fe1630616a4 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -19,7 +19,7 @@ class PrefixCacheStats:
     # The number of requests in this update.
     requests: int = 0
     # The number of queries in these requests. Note that "queries" here
-    # means the number of blocks that were queried from the cache.
+    # means the number of tokens that were queried from the cache.
     queries: int = 0
     # The number of hits in these requests.
     hits: int = 0