8 changes: 6 additions & 2 deletions vllm/compilation/collective_fusion.py
@@ -380,8 +380,12 @@ def __init__(self, config: VllmConfig):

         self.dump_patterns(config, self.patterns)
 
-    def is_applicable_for_shape(self, shape: Optional[int]) -> bool:
-        # only do replace for specific shapes
+    # This pass is applied on top of the sequence parallelism pass.
+    # It inherits the same applicability condition as `SequenceParallelismPass`.
+    # See `SequenceParallelismPass.is_applicable` for more details.
+    def is_applicable(self, shape: Optional[int]) -> bool:
+        if self.splitting_ops is None or self.splitting_ops == []:
+            return True
         tp_size = get_tensor_model_parallel_world_size()
         return shape is not None and shape % tp_size == 0

2 changes: 1 addition & 1 deletion vllm/compilation/inductor_pass.py
@@ -96,7 +96,7 @@ def hash_dict(dict_: dict[Any, Any]):
         encoded = json.dumps(dict_, sort_keys=True).encode("utf-8")
         return hashlib.sha256(encoded).hexdigest()
 
-    def is_applicable_for_shape(self, shape: Optional[int]):
+    def is_applicable(self, shape: Optional[int]):
         return True


2 changes: 1 addition & 1 deletion vllm/compilation/pass_manager.py
@@ -71,7 +71,7 @@ def __call__(self, graph: fx.Graph):

         shape = get_pass_context().runtime_shape
         for pass_ in self.passes:
-            if pass_.is_applicable_for_shape(shape):
+            if pass_.is_applicable(shape):
                 pass_(graph)
         VllmInductorPass.dump_prefix += 1
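
For context, a self-contained sketch (not vLLM code; the class names, tp_size, and shapes below are illustrative) of the gating pattern this loop implements: each pass reports, per runtime shape, whether it should run.

    from typing import Optional

    class ToyPass:
        def is_applicable(self, shape: Optional[int]) -> bool:
            # Default: run for every shape, including the symbolic-shape (None) case.
            return True

        def __call__(self, graph: list) -> None:
            graph.append(type(self).__name__)

    class TpAlignedPass(ToyPass):
        """Only runs when the token count is concrete and divisible by tp_size."""

        def __init__(self, tp_size: int) -> None:
            self.tp_size = tp_size

        def is_applicable(self, shape: Optional[int]) -> bool:
            return shape is not None and shape % self.tp_size == 0

    def run_passes(passes: list, graph: list, runtime_shape: Optional[int]) -> None:
        for pass_ in passes:
            if pass_.is_applicable(runtime_shape):
                pass_(graph)

    graph: list = []
    run_passes([ToyPass(), TpAlignedPass(tp_size=4)], graph, runtime_shape=512)
    print(graph)  # ['ToyPass', 'TpAlignedPass'] -- 512 % 4 == 0, so both run
    run_passes([ToyPass(), TpAlignedPass(tp_size=4)], graph, runtime_shape=None)
    print(graph)  # [..., 'ToyPass'] -- symbolic shape: TpAlignedPass is skipped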

17 changes: 16 additions & 1 deletion vllm/compilation/sequence_parallelism.py
@@ -468,7 +468,22 @@ def __init__(self, config: VllmConfig):
             self.device).register(self.patterns)
         self.dump_patterns(config, self.patterns)
 
-    def is_applicable_for_shape(self, shape: Optional[int]) -> bool:
+    # When sequence parallelism is enabled, the residual tensor from RMSNorm
+    # needs to be split along the sequence dimension. However, this dimension
+    # is symbolic during piecewise compilation, and splitting symbolic shapes
+    # is not supported.
+    #
+    # This pass is therefore only applied when the sequence dimension is
+    # concrete:
+    # 1. In full-graph compilation mode (no splitting ops are used). In this
+    #    case num_tokens is always padded to a multiple of
+    #    tensor_parallel_size, so there is no need to check shape % tp_size == 0.
+    # 2. For a specific shape provided at compile time (e.g., from
+    #    `compile_sizes`), which must be divisible by the tensor-parallel
+    #    size.
+    def is_applicable(self, shape: Optional[int]) -> bool:
+        if self.splitting_ops is None or self.splitting_ops == []:
+            return True
         tp_size = get_tensor_model_parallel_world_size()
         return shape is not None and shape % tp_size == 0
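
As a quick illustration of the condition described in the comment above (tp_size and the shapes below are made-up values, and "attn" stands in for any splitting op):

    tp_size = 4

    def is_applicable(shape, splitting_ops):
        if splitting_ops is None or splitting_ops == []:
            return True                    # full-graph compile: num_tokens is padded
        return shape is not None and shape % tp_size == 0

    print(is_applicable(None, []))         # True  - full-graph mode, always applicable
    print(is_applicable(None, ["attn"]))   # False - piecewise compile, symbolic shape
    print(is_applicable(512, ["attn"]))    # True  - concrete shape, 512 % 4 == 0
    print(is_applicable(510, ["attn"]))    # False - 510 is not divisible by tp_size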

1 change: 1 addition & 0 deletions vllm/compilation/vllm_inductor_pass.py
@@ -29,6 +29,7 @@ class VllmInductorPass(InductorPass):

     def __init__(self, config: VllmConfig):
         self.pass_config = config.compilation_config.pass_config
+        self.splitting_ops = config.compilation_config.splitting_ops
         self.model_dtype = config.model_config.dtype if config.model_config \
             else None
         self.device = config.device_config.device if config.device_config \