
Commit 72a5101

wenscarl authored and tlrmchlsmth committed
Support mnnvl all2allv from Flashinfer (#21003)
Signed-off-by: Shu Wang <[email protected]>
Signed-off-by: Shu Wang. <[email protected]>
Signed-off-by: Tyler Michael Smith <[email protected]>
Signed-off-by: Tyler Michael Smith <[email protected]>
Co-authored-by: Tyler Michael Smith <[email protected]>
Co-authored-by: Tyler Michael Smith <[email protected]>
Signed-off-by: yewentao256 <[email protected]>
1 parent 7d9f44a commit 72a5101

File tree

10 files changed: +410 -40 lines changed

tests/kernels/moe/modular_kernel_tools/mk_objects.py

Lines changed: 3 additions & 2 deletions
@@ -222,7 +222,8 @@ def expert_info(kind) -> ExpertInfo:
     from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (  # noqa: E501
         FlashInferExperts)
     from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import (  # noqa: E501
-        FlashInferCutlassMoEPrepareAndFinalize)
+        FlashInferCutlassMoEPrepareAndFinalize,
+        create_flashinfer_prepare_finalize)

     register_prepare_and_finalize(
         FlashInferCutlassMoEPrepareAndFinalize,
@@ -373,7 +374,7 @@ def make_prepare_finalize(
         assert prepare_finalize is not None
         return prepare_finalize
     elif prepare_finalize_type == FlashInferCutlassMoEPrepareAndFinalize:
-        return FlashInferCutlassMoEPrepareAndFinalize(
+        return create_flashinfer_prepare_finalize(
             use_dp=moe.moe_parallel_config.dp_size > 1)
     else:
         return MoEPrepareAndFinalizeNoEP()
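The call sites now go through a factory rather than the constructor. The factory's body is not part of this diff; below is a minimal sketch of the shape the call sites imply (the fallback logic is an assumption, not the commit's actual code):

def create_flashinfer_prepare_finalize(
        use_dp: bool) -> FlashInferCutlassMoEPrepareAndFinalize:
    # Assumed behavior: centralize construction so the factory can select
    # an all2allv-capable prepare/finalize when that backend is enabled,
    # falling back to the plain CUTLASS variant otherwise.
    return FlashInferCutlassMoEPrepareAndFinalize(use_dp=use_dp)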

vllm/distributed/device_communicators/all2all.py

Lines changed: 110 additions & 15 deletions
@@ -10,9 +10,15 @@
 from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger
 from vllm.utils import has_deep_ep, has_pplx
+from vllm.utils.flashinfer import has_flashinfer_all2all

 from .base_device_communicator import All2AllManagerBase, Cache

+if has_flashinfer_all2all():
+    from flashinfer.comm import Mapping
+    from flashinfer.comm.mnnvl import MnnvlConfig
+    from flashinfer.comm.trtllm_alltoall import MnnvlMoe
+
 logger = init_logger(__name__)

@@ -47,24 +53,22 @@ def naive_multicast(self, x: torch.Tensor,

     def dispatch(self, hidden_states: torch.Tensor,
                  router_logits: torch.Tensor):
-        cu_tokens_across_dp_cpu = get_forward_context(
-        ).dp_metadata.cu_tokens_across_dp_cpu
+        sizes = get_forward_context(
+        ).dp_metadata.get_chunk_sizes_across_dp_rank()
+        hidden_states, router_logits = get_dp_group().all_gatherv(
+            [hidden_states, router_logits],
+            dim=0,
+            sizes=sizes,
+        )

-        hidden_states = self.naive_multicast(hidden_states,
-                                             cu_tokens_across_dp_cpu)
-        router_logits = self.naive_multicast(router_logits,
-                                             cu_tokens_across_dp_cpu)
         return hidden_states, router_logits

     def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        cu_tokens_across_dp_cpu = get_forward_context(
-        ).dp_metadata.cu_tokens_across_dp_cpu
-        start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[
-            self.dp_rank - 1]
-        end = cu_tokens_across_dp_cpu[self.dp_rank]
-
-        all_hidden_states = self.dp_group.all_reduce(hidden_states)
-        hidden_states = all_hidden_states[start:end, :]
+        sizes = get_forward_context(
+        ).dp_metadata.get_chunk_sizes_across_dp_rank()
+        hidden_states = get_dp_group().reduce_scatterv(hidden_states,
+                                                       dim=0,
+                                                       sizes=sizes)
         return hidden_states

     def destroy(self):
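For intuition, here is a standalone sketch of the variable-size semantics these collectives rely on, using hypothetical per-rank token counts (plain torch math, not vLLM's actual collectives):

import torch

sizes = [3, 1, 4]        # hypothetical token counts on DP ranks 0, 1, 2
hidden = 8

# all_gatherv: every rank ends up with all chunks concatenated in rank
# order along dim 0, sum(sizes) rows in total.
chunks = [torch.randn(s, hidden) for s in sizes]
gathered = torch.cat(chunks, dim=0)                # shape: (8, hidden)

# reduce_scatterv inverts that: ranks elementwise-sum their full-size
# buffers, then rank r keeps only its own sizes[r]-row slice.
offsets = torch.tensor([0] + sizes).cumsum(0)
rank = 1
kept = gathered[offsets[rank]:offsets[rank + 1]]   # shape: (1, hidden)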
@@ -300,4 +304,95 @@ def get_handle(self, kwargs):

     # DeepEP LL uses RDMA so no SMs are used for communication
     def max_sms_used(self) -> Optional[int]:
-        return 0
+        return 0
+
+
+class FlashInferAllToAllManager(All2AllManagerBase):
+    """
+    All2All communication based on flashinfer kernels.
+    """
+
+    def __init__(self, cpu_group):
+        assert has_flashinfer_all2all(
+        ), "flashinfer all2all module not found. Please install/check flashinfer"  # noqa
+        super().__init__(cpu_group)
+        logger.debug(
+            "Initialize for flashinfer All2All "
+            "rank=%d, world size=%d", self.rank, self.world_size)
+        self.initialized = False
+        self.alltoall_info = None
+
+    def initialize(
+        self,
+        world_size: int,
+        rank: int,
+        gpus_per_node: int,
+    ):
+        """Initialize workspace"""
+        if self.initialized:
+            return
+
+        self.cleanup()
+        logger.debug("making map: "
+                     "rank=%d, world size=%d", rank, world_size)
+        self.mapping = Mapping(
+            world_size,
+            rank,
+            gpus_per_node,
+            tp_size=world_size,
+        )
+
+        from vllm.distributed.device_communicators.mnnvl_compat import (
+            CustomCommunicator)
+        dp_config = MnnvlConfig(
+            comm_backend=CustomCommunicator(get_dp_group().cpu_group),
+            fabric_page_size=1 << 29,  # 512 MiB
+            allocation_granularity=0  # Auto-detect
+        )
+
+        self.workspace_tensor = MnnvlMoe.get_moe_workspaces(
+            self.mapping, dp_config)
+        self.prepare_workspace_tensor = MnnvlMoe.get_moe_prepare_workspace(
+            self.mapping, dp_config)
+
+        self.world_size = world_size
+        self.rank = rank
+        self.gpus_per_node = gpus_per_node
+        self.initialized = True
+
+        logger.info("FlashInfer All2All initialized for rank %s, size %s",
+                    rank, world_size)
+
+    def ensure_alltoall_workspace_initialized(self):
+        """Ensure workspace is initialized"""
+        if not has_flashinfer_all2all():
+            return False
+
+        if self.world_size <= 1:
+            return False
+
+        if not self.initialized:
+            self.initialize(
+                world_size=self.world_size,
+                rank=self.rank,
+                gpus_per_node=torch.cuda.device_count(),
+            )
+        return self.initialized
+
+    def get_handle(self, kwargs):
+        return self
+
+    def cleanup(self):
+        """Clean up workspace"""
+        if self.initialized and self.workspace_tensor is not None \
+                and self.prepare_workspace_tensor is not None:
+            try:
+                del self.workspace_tensor
+                del self.prepare_workspace_tensor
+            except Exception as e:
+                logger.warning("Failed to cleanup FlashInfer workspace: %s",
+                               e)
+            finally:
+                self.workspace_tensor = None
+                self.prepare_workspace_tensor = None
+                self.mapping = None
+                self.initialized = False
vllm/distributed/device_communicators/cuda_communicator.py

Lines changed: 5 additions & 0 deletions
@@ -114,6 +114,11 @@ def __init__(self,
             from .all2all import DeepEPLLAll2AllManager
             self.all2all_manager = DeepEPLLAll2AllManager(self.cpu_group)
             logger.info("Using DeepEP Low-Latency all2all manager.")
+        elif all2all_backend == "flashinfer_all2allv":
+            from .all2all import FlashInferAllToAllManager
+            self.all2all_manager = FlashInferAllToAllManager(
+                self.cpu_group)
+            logger.info("Using Flashinfer all2allv manager.")
         else:
             raise ValueError(f"Unknown all2all backend: {all2all_backend}")

vllm/distributed/device_communicators/mnnvl_compat.py

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch.distributed as dist
+from flashinfer.comm.mnnvl import CommBackend as CommBackend
+
+from vllm.utils.flashinfer import has_flashinfer_all2all
+
+assert has_flashinfer_all2all(), "Flashinfer alltoallv module cannot be found"
+
+
+class CustomCommunicator(CommBackend):
+
+    def __init__(self, group):
+        self._group = group
+
+    def Get_rank(self) -> int:
+        return self._group.rank()
+
+    def Get_size(self) -> int:
+        return self._group.size()
+
+    def allgather(self, data: int):
+        gathered = [None] * self.Get_size()
+        dist.all_gather_object(gathered, data, group=self._group)
+        return gathered
+
+    def Split(self, color: int, key: int) -> 'CustomCommunicator':
+        return self
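A minimal sketch of exercising this adapter under torchrun, assuming a gloo-backed CPU group (vLLM actually passes get_dp_group().cpu_group):

import torch.distributed as dist

dist.init_process_group(backend="gloo")
comm = CustomCommunicator(dist.group.WORLD)
ranks = comm.allgather(comm.Get_rank())  # every rank sees [0, 1, ..., N-1]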

vllm/envs.py

Lines changed: 5 additions & 2 deletions
@@ -156,7 +156,8 @@
     VLLM_ALL2ALL_BACKEND: Literal["naive", "pplx",
                                   "deepep_high_throughput",
                                   "deepep_low_latency",
-                                  "allgather_reducescatter"] = \
+                                  "allgather_reducescatter",
+                                  "flashinfer_all2allv"] = \
         "allgather_reducescatter"
     VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
     VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
@@ -1209,12 +1210,14 @@ def get_vllm_port() -> Optional[int]:
     # - "pplx": use pplx kernels
     # - "deepep_high_throughput", use deepep high-throughput kernels
     # - "deepep_low_latency", use deepep low-latency kernels
+    # - "flashinfer_all2allv", use flashinfer alltoallv kernels for mnnvl
     "VLLM_ALL2ALL_BACKEND":
     env_with_choices("VLLM_ALL2ALL_BACKEND", "allgather_reducescatter",
                      ["naive", "pplx",
                       "deepep_high_throughput",
                       "deepep_low_latency",
-                      "allgather_reducescatter"]),
+                      "allgather_reducescatter",
+                      "flashinfer_all2allv"]),

     # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support.
     # Both require compute capability 10.0 or above.
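To opt in, set the backend before launching; the flag value comes from this diff, while the rest of the command line is illustrative:

VLLM_ALL2ALL_BACKEND=flashinfer_all2allv vllm serve <model> --data-parallel-size 2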

vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py

Lines changed: 3 additions & 4 deletions
@@ -8,7 +8,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import (  # noqa: E501
-    FlashInferCutlassMoEPrepareAndFinalize)
+    create_flashinfer_prepare_finalize)
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
     TopKWeightAndReduceNoOP)
 from vllm.utils.flashinfer import (flashinfer_cutlass_fused_moe,
@@ -108,7 +108,7 @@ def workspace_shapes(
         of each tuple must be the number of tokens.
         """
         aq_m, aq_n = aq.shape
-        workspace2 = ()
+        workspace2 = (0, )
         output_shape = (aq_m, aq_n * 2) if self.quant_dtype != \
             torch.float8_e4m3fn else (aq_m, aq_n)
         workspace_dtype = a.dtype
@@ -192,9 +192,8 @@ def flashinfer_cutlass_moe_fp4(
     expert_map: Optional[torch.Tensor] = None,
     apply_router_weight_on_input: bool = False,
 ) -> torch.Tensor:
-
     fused_experts = mk.FusedMoEModularKernel(
-        FlashInferCutlassMoEPrepareAndFinalize(use_dp=False),
+        create_flashinfer_prepare_finalize(use_dp=False),
         FlashInferExperts(
             out_dtype=hidden_states.dtype,
             quant_config=quant_config,
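The workspace2 change is subtle: in PyTorch an empty tuple is a 0-dim scalar shape rather than an empty buffer, so shape-driven workspace allocation would produce a 1-element tensor, while (0, ) yields a genuinely empty 1-D workspace. A quick illustration (assuming the shapes are consumed by torch.empty-style allocation):

import torch

torch.empty(()).numel()    # 1 -- a 0-dim scalar tensor, not empty
torch.empty((0,)).numel()  # 0 -- a truly empty 1-D workspace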
