From a2ec1f545a468cc154f21243ecb7b01716c4149f Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Mon, 3 Mar 2025 07:35:34 -0500 Subject: [PATCH] [V0][Metrics] Deprecate some KV/prefix cache metrics vllm:num_requests_swapped, vllm:cpu_cache_usage_perc and vllm:cpu_prefix_cache_hit_rate will no longer be relevant in V1 since we no longer implement KV cache offloading. So these metrics should be considered deprecated. And as agreed in #12592, we have added prefix_cache_queries and prefix_cache_hits counters to replace the prefix_cache_hit_rate gauge as it allows the interval over which the hit rate is calculated to be controlled in a Prometheus query like: ``` rate(prefix_cache_hits[5m]) / rate(prefix_cache_queries[5m]) ``` In theory, we could ease the transition by implementing the old hit rate metric in V1 and the new queries/hits metrics in V0, but it's probably not worthwhile unless we learn the hit rate metric is heavily used by V0 users. Signed-off-by: Mark McLoughlin --- vllm/engine/metrics.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index cb3ca7a11881..2d72252076b5 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -74,31 +74,51 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig): ], multiprocess_mode="livemostrecent", ) + + # Deprecated in 0.8 - KV cache offloading is not used in V1 + # TODO: in 0.9, only enable if show_hidden_metrics=True self.gauge_scheduler_swapped = self._gauge_cls( name="vllm:num_requests_swapped", - documentation="Number of requests swapped to CPU.", + documentation=( + "Number of requests swapped to CPU. " + "DEPRECATED: KV cache offloading is not used in V1"), labelnames=labelnames, multiprocess_mode="sum") + # KV Cache Usage in % self.gauge_gpu_cache_usage = self._gauge_cls( name="vllm:gpu_cache_usage_perc", documentation="GPU KV-cache usage. 
1 means 100 percent usage.", labelnames=labelnames, multiprocess_mode="sum") + + # Deprecated in 0.8 - KV cache offloading is not used in V1 + # TODO: in 0.9, only enable if show_hidden_metrics=True self.gauge_cpu_cache_usage = self._gauge_cls( name="vllm:cpu_cache_usage_perc", - documentation="CPU KV-cache usage. 1 means 100 percent usage.", + documentation=( + "CPU KV-cache usage. 1 means 100 percent usage. " + "DEPRECATED: KV cache offloading is not used in V1"), labelnames=labelnames, multiprocess_mode="sum") - # Prefix caching block hit rate + + # Deprecated in 0.8 - KV cache offloading is not used in V1 + # TODO: in 0.9, only enable if show_hidden_metrics=True self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls( name="vllm:cpu_prefix_cache_hit_rate", - documentation="CPU prefix cache block hit rate.", + documentation=( + "CPU prefix cache block hit rate. " + "DEPRECATED: KV cache offloading is not used in V1"), labelnames=labelnames, multiprocess_mode="sum") + + # Deprecated in 0.8 - replaced by queries+hits counters in V1 + # TODO: in 0.9, only enable if show_hidden_metrics=True self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls( name="vllm:gpu_prefix_cache_hit_rate", - documentation="GPU prefix cache block hit rate.", + documentation=("GPU prefix cache block hit rate. " + "DEPRECATED: use vllm:gpu_prefix_cache_queries and " + "vllm:gpu_prefix_cache_hits in V1"), labelnames=labelnames, multiprocess_mode="sum")