From 62f92cce1332f2d2878fc58b74ccf261ee12ffdb Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Mon, 12 May 2025 08:37:43 -0700 Subject: [PATCH] change metric from block to token Signed-off-by: Chen Zhang --- vllm/v1/core/kv_cache_manager.py | 12 ++++++------ vllm/v1/metrics/loggers.py | 4 ++-- vllm/v1/metrics/stats.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index ad8468a89dc5..aa0449eea8f1 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -149,11 +149,15 @@ def get_computed_blocks(self, computed_blocks = ( self.single_type_manager.find_longest_cache_hit(block_hashes)) + # NOTE(woosuk): Since incomplete blocks are not eligible for + # sharing, `num_computed_tokens` is always a multiple of + # `block_size`. + num_computed_tokens = len(computed_blocks) * self.block_size if self.log_stats: assert self.prefix_cache_stats is not None - self.prefix_cache_stats.queries += len(block_hashes) - self.prefix_cache_stats.hits += len(computed_blocks) + self.prefix_cache_stats.queries += request.num_tokens + self.prefix_cache_stats.hits += num_computed_tokens if last_block_hash is not None: # Add back the last block hash if it was removed. @@ -161,10 +165,6 @@ def get_computed_blocks(self, # we shouldn't modify it directly. block_hashes.append(last_block_hash) - # NOTE(woosuk): Since incomplete blocks are not eligible for - # sharing, `num_computed_tokens` is always a multiple of - # `block_size`. - num_computed_tokens = len(computed_blocks) * self.block_size return KVCacheBlocks(computed_blocks), num_computed_tokens def allocate_slots( diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 7455f1813cd7..6ee40850beb1 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -183,13 +183,13 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): self.counter_gpu_prefix_cache_queries = prometheus_client.Counter( name="vllm:gpu_prefix_cache_queries", documentation= - "GPU prefix cache queries, in terms of number of queried blocks.", + "GPU prefix cache queries, in terms of number of queried tokens.", labelnames=labelnames).labels(*labelvalues) self.counter_gpu_prefix_cache_hits = prometheus_client.Counter( name="vllm:gpu_prefix_cache_hits", documentation= - "GPU prefix cache hits, in terms of number of cached blocks.", + "GPU prefix cache hits, in terms of number of cached tokens.", labelnames=labelnames).labels(*labelvalues) # diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index fd949264885b..8fe1630616a4 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -19,7 +19,7 @@ class PrefixCacheStats: # The number of requests in this update. requests: int = 0 # The number of queries in these requests. Note that "queries" here - # means the number of blocks that were queried from the cache. + # means the number of tokens that were queried from the cache. queries: int = 0 # The number of hits in these requests. hits: int = 0