diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 1cbff1e2d767..25bfe18c7bce 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -4,7 +4,7 @@
 from vllm.logger import init_logger
 from vllm.utils import cdiv
 from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
-                                         KVCacheBlock,
+                                         KVCacheBlock, PrefixCachingMetrics,
                                          generate_block_hash_extra_keys,
                                          hash_block_tokens,
                                          hash_request_tokens)
@@ -69,6 +69,12 @@ def __init__(
         # is finished.
         self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {}
 
+        # Prefix cache metrics.
+        self.prefix_caching_metrics: PrefixCachingMetrics = {
+            "query_total": 0,
+            "query_hit": 0,
+        }
+
     def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]:
         """Get the computed (cached) blocks for the request.
         Note that the computed blocks must be full.
@@ -101,6 +107,8 @@ def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]:
             else:
                 break
 
+        self.prefix_caching_metrics["query_total"] += len(block_hashes)
+        self.prefix_caching_metrics["query_hit"] += len(computed_blocks)
         return computed_blocks
 
     def append_slots(
@@ -328,6 +336,17 @@ def get_num_common_prefix_blocks(
                 break
         return num_common_blocks
 
+    def get_prefix_caching_hit_rate(self) -> float:
+        """Get the hit rate of prefix caching.
+
+        Returns:
+            The hit rate of prefix caching.
+        """
+        if self.prefix_caching_metrics["query_total"] == 0:
+            return 0.0
+        return self.prefix_caching_metrics[
+            "query_hit"] / self.prefix_caching_metrics["query_total"]
+
     def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]:
         """Get new blocks from the free block pool.
 
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 22a5d2fb08a4..bb41b466c6bb 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -1,7 +1,7 @@
 """KV-Cache Utilities."""
 from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Any, List, NamedTuple, Optional, Tuple
+from typing import Any, List, NamedTuple, Optional, Tuple, TypedDict
 
 from vllm.logger import init_logger
 from vllm.v1.request import Request
@@ -24,6 +24,16 @@ class BlockHashType(NamedTuple):
     extra_keys: Optional[Any] = None
 
 
+class PrefixCachingMetrics(TypedDict):
+    """Metrics for prefix caching."""
+
+    query_total: int
+    """The total number of queries."""
+
+    query_hit: int
+    """The number of queries that hit the prefix cache."""
+
+
 @dataclass
 class KVCacheBlock:
     """KV-cache block metadata."""
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 975ce11fe8af..af1dc7974b7f 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -244,10 +244,17 @@ def _log_stats(self):
         now = time.time()
 
         if now - self._last_logging_time > LOGGING_TIME_S:
+            prefix_caching_hit_rate = ""
+            if (hit_rate := self.scheduler.kv_cache_manager.
+                    get_prefix_caching_hit_rate()) > 0:
+                prefix_caching_hit_rate = (
+                    f" | PrefixCachingHitRate: {hit_rate:.2f}")
+
             logger.info(
-                "RUNNING: %s | WAITING: %s",
+                "RUNNING: %s | WAITING: %s%s",
                 len(self.scheduler.running),
                 len(self.scheduler.waiting),
+                prefix_caching_hit_rate,
             )
 
             self._last_logging_time = now
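For context, the metric added by this patch is just a running ratio: every `get_computed_blocks` call adds the number of queried block hashes to `query_total` and the number of cache hits to `query_hit`, and the hit rate is reported as their quotient. Below is a minimal standalone sketch of that accumulate-and-report pattern, outside of vLLM; the `PrefixCacheTracker` class, its method names, and the simulated lookup numbers are illustrative assumptions, not part of this PR.

```python
from typing import TypedDict


class PrefixCachingMetrics(TypedDict):
    """Counters mirroring the TypedDict added in kv_cache_utils.py."""
    query_total: int
    query_hit: int


class PrefixCacheTracker:
    """Hypothetical tracker: accumulate per-request lookups, report a ratio."""

    def __init__(self) -> None:
        self.metrics: PrefixCachingMetrics = {"query_total": 0, "query_hit": 0}

    def record(self, num_queried_blocks: int, num_hit_blocks: int) -> None:
        # One call per prefix-cache lookup (one per request in the PR).
        self.metrics["query_total"] += num_queried_blocks
        self.metrics["query_hit"] += num_hit_blocks

    def hit_rate(self) -> float:
        # Guard against division by zero before any lookup has happened.
        if self.metrics["query_total"] == 0:
            return 0.0
        return self.metrics["query_hit"] / self.metrics["query_total"]


if __name__ == "__main__":
    tracker = PrefixCacheTracker()
    tracker.record(num_queried_blocks=8, num_hit_blocks=6)  # warm request
    tracker.record(num_queried_blocks=4, num_hit_blocks=0)  # cold request
    # Same ".2f" formatting as the "PrefixCachingHitRate" field logged in core.py.
    print(f"PrefixCachingHitRate: {tracker.hit_rate():.2f}")  # -> 0.50
```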