From a2ec1f545a468cc154f21243ecb7b01716c4149f Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Mon, 3 Mar 2025 07:35:34 -0500
Subject: [PATCH] [V0][Metrics] Deprecate some KV/prefix cache metrics

vllm:num_requests_swapped, vllm:cpu_cache_usage_perc and
vllm:cpu_prefix_cache_hit_rate will no longer be relevant in
V1 since we no longer implement KV cache offloading. So
these metrics should be considered deprecated.

And as agreed in #12592, we have added prefix_cache_queries and
prefix_cache_hits counters to replace the prefix_cache_hit_rate
gauge as it allows the interval over which the hit rate is
calculated to be controlled in a Prometheus query like:

```
rate(prefix_cache_queries[5m]) / rate(prefix_cache_hits[5m])
```

In theory, we could ease the transition be implementing the
old hit rate metric in V1 and the new queries/hits metrics
in V0, but it's probably not worthwhile unless we learn the
hit rate metric is heavily used by V0 users.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 vllm/engine/metrics.py | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index cb3ca7a11881..2d72252076b5 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -74,31 +74,51 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
             ],
             multiprocess_mode="livemostrecent",
         )
+
+        # Deprecated in 0.8 - KV cache offloading is not used in V1
+        # TODO: in 0.9, only enable if show_hidden_metrics=True
         self.gauge_scheduler_swapped = self._gauge_cls(
             name="vllm:num_requests_swapped",
-            documentation="Number of requests swapped to CPU.",
+            documentation=(
+                "Number of requests swapped to CPU. "
+                "DEPRECATED: KV cache offloading is not used in V1"),
             labelnames=labelnames,
             multiprocess_mode="sum")
+
         #   KV Cache Usage in %
         self.gauge_gpu_cache_usage = self._gauge_cls(
             name="vllm:gpu_cache_usage_perc",
             documentation="GPU KV-cache usage. 1 means 100 percent usage.",
             labelnames=labelnames,
             multiprocess_mode="sum")
+
+        # Deprecated in 0.8 - KV cache offloading is not used in V1
+        # TODO: in 0.9, only enable if show_hidden_metrics=True
         self.gauge_cpu_cache_usage = self._gauge_cls(
             name="vllm:cpu_cache_usage_perc",
-            documentation="CPU KV-cache usage. 1 means 100 percent usage.",
+            documentation=(
+                "CPU KV-cache usage. 1 means 100 percent usage. "
+                "DEPRECATED: KV cache offloading is not used in V1"),
             labelnames=labelnames,
             multiprocess_mode="sum")
-        #   Prefix caching block hit rate
+
+        # Deprecated in 0.8 - KV cache offloading is not used in V1
+        # TODO: in 0.9, only enable if show_hidden_metrics=True
         self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
             name="vllm:cpu_prefix_cache_hit_rate",
-            documentation="CPU prefix cache block hit rate.",
+            documentation=(
+                "CPU prefix cache block hit rate. "
+                "DEPRECATED: KV cache offloading is not used in V1"),
             labelnames=labelnames,
             multiprocess_mode="sum")
+
+        # Deprecated in 0.8 - replaced by queries+hits counters in V1
+        # TODO: in 0.9, only enable if show_hidden_metrics=True
         self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
             name="vllm:gpu_prefix_cache_hit_rate",
-            documentation="GPU prefix cache block hit rate.",
+            documentation=("GPU prefix cache block hit rate. "
+                           "DEPRECATED: use vllm:gpu_prefix_cache_queries and "
+                           "vllm:gpu_prefix_cache_queries in V1"),
             labelnames=labelnames,
             multiprocess_mode="sum")