 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
 from vllm.logger import init_logger
+from vllm.v1.attention.backends.mla.common import MLACommonMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
 
 if TYPE_CHECKING:
@@ -98,6 +99,7 @@ def start_load_kv(self, forward_context: "ForwardContext",
             The number of elements in kv_caches and layer_names should be
             the same.
         """
+        attn_metadata = forward_context.attn_metadata
 
         def inject_kv_into_layer(
             dst_kv_cache_layer: torch.Tensor,
@@ -108,19 +110,29 @@ def inject_kv_into_layer(
 
             Args:
                 dst_kv_cache_layer (torch.Tensor): the destination KV cache
-                    layer. In shape [2, num_pages, page_size, xxx].
+                    layer. In shape [2, num_pages, page_size, xxx] if not
+                    using MLA, [num_pages, page_size, xxx] otherwise.
                 src_kv_cache (torch.Tensor): the source KV cache. In shape
-                    [2, num_tokens, xxx].
+                    [2, num_tokens, xxx] if not using MLA, [num_tokens, xxx]
+                    otherwise.
                 slot_mapping (torch.Tensor): the slot mapping. In shape
                     [num_tokens].
             """
             dst_kv_cache_layer_shape = dst_kv_cache_layer.shape
-            num_pages = dst_kv_cache_layer_shape[1]
-            page_size = dst_kv_cache_layer_shape[2]
-            dst_kv_cache_layer = dst_kv_cache_layer.reshape(
-                2, num_pages * page_size, -1)
-            dst_kv_cache_layer[:, slot_mapping, ...] = src_kv_cache
-            dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape)
+            if isinstance(attn_metadata, MLACommonMetadata):
+                num_pages = dst_kv_cache_layer_shape[0]
+                page_size = dst_kv_cache_layer_shape[1]
+                dst_kv_cache_layer = dst_kv_cache_layer.reshape(
+                    num_pages * page_size, -1)
+                dst_kv_cache_layer[slot_mapping, ...] = src_kv_cache
+                dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape)
+            else:
+                num_pages = dst_kv_cache_layer_shape[1]
+                page_size = dst_kv_cache_layer_shape[2]
+                dst_kv_cache_layer = dst_kv_cache_layer.reshape(
+                    2, num_pages * page_size, -1)
+                dst_kv_cache_layer[:, slot_mapping, ...] = src_kv_cache
+                dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape)
 
         # Get the metadata
         metadata: KVConnectorMetadata = \
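
For reference, here is a minimal standalone sketch of the scatter that inject_kv_into_layer now performs for both layouts. All shapes, slot values, and variable names below are made up for illustration; only the reshape-plus-slot_mapping indexing pattern mirrors the hunk above.

import torch

# Toy dimensions (hypothetical, for illustration only).
num_pages, page_size, head_dim, num_tokens = 4, 8, 16, 5
slot_mapping = torch.tensor([3, 7, 12, 20, 31])  # flat slot = page * page_size + offset

# Non-MLA layout: [2, num_pages, page_size, ...] holds K and V stacked on dim 0.
paged_kv = torch.zeros(2, num_pages, page_size, head_dim)
src_kv = torch.randn(2, num_tokens, head_dim)
flat = paged_kv.reshape(2, num_pages * page_size, -1)  # view over the same storage
flat[:, slot_mapping, ...] = src_kv                    # scatter tokens into their slots
assert torch.equal(paged_kv.reshape(2, -1, head_dim)[:, slot_mapping], src_kv)

# MLA layout: no separate K/V dimension, so the layer is [num_pages, page_size, ...]
# and the leading index is dropped.
paged_mla = torch.zeros(num_pages, page_size, head_dim)
src_mla = torch.randn(num_tokens, head_dim)
flat_mla = paged_mla.reshape(num_pages * page_size, -1)
flat_mla[slot_mapping, ...] = src_mla
assert torch.equal(paged_mla.reshape(-1, head_dim)[slot_mapping], src_mla)
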
@@ -170,7 +182,7 @@ def wait_for_layer_load(self, layer_name: str) -> None:
 
     def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
                       attn_metadata: "AttentionMetadata", **kwargs) -> None:
-        """Start saving the a layer of KV cache from vLLM's paged buffer
+        """Start saving the KV cache of the layer from vLLM's paged buffer
         to the connector.
 
         Args:
@@ -187,10 +199,13 @@ def extract_kv_from_layer(
         ) -> torch.Tensor:
             """Extract the KV cache from the layer.
 
-            Assume the shape of the layer is (2, num_pages, page_size, xxx).
+            Assume the shape of the layer is (2, num_pages, page_size, xxx)
+            if MLA is not used, and (num_pages, page_size, xxx) otherwise.
             """
-            # TODO: make this compatible with MLA.
-            assert layer.shape[0] == 2
+            if isinstance(attn_metadata, MLACommonMetadata):
+                num_pages, page_size = layer.shape[0], layer.shape[1]
+                return layer.reshape(num_pages * page_size, -1)[slot_mapping,
+                                                                ...]
             num_pages, page_size = layer.shape[1], layer.shape[2]
             return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping,
                                                                ...]
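
And the corresponding gather in extract_kv_from_layer, again sketched with hypothetical toy shapes rather than real vLLM tensors:

import torch

# MLA-style cache layer: [num_pages, page_size, ...] with no K/V dimension.
num_pages, page_size, head_dim = 4, 8, 16
layer = torch.randn(num_pages, page_size, head_dim)
slot_mapping = torch.tensor([0, 9, 17, 30])

# Flatten pages and gather the requested slots -> [num_tokens, head_dim].
extracted = layer.reshape(num_pages * page_size, -1)[slot_mapping, ...]
assert extracted.shape == (len(slot_mapping), head_dim)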