Commit 31bca0d

Merge branch 'main' into zhxchen17/precompile/2
2 parents: 7935089 + cddce79

File tree: 9 files changed (+267, −112 lines)

tests/compile/piecewise/test_multiple_graphs.py

Lines changed: 2 additions & 2 deletions
@@ -198,7 +198,7 @@ def test_multi_graph_piecewise_compile_outputs_equal():
         compilation_config=CompilationConfig(
             level=CompilationLevel.PIECEWISE,
             use_cudagraph=True,
-            splitting_ops=["silly.attention"],
+            splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
         )
     )
@@ -267,7 +267,7 @@ def test_multi_graph_piecewise_compile_outputs_equal():
         compilation_config=CompilationConfig(
             level=CompilationLevel.PIECEWISE,
             use_cudagraph=False,
-            splitting_ops=["silly.attention"],
+            splitting_ops=["silly::attention"],
         )
     )
     cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
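
The only change in this file (and the other test files below) is the switch from the dotted name `silly.attention` to the double-colon qualified name `silly::attention`, which is the `namespace::op_name` form PyTorch uses to identify registered operators (the same form as `aten::mm.default`). A minimal sketch of how such an op could be registered, purely to show where the qualified string comes from; the real test helpers in vLLM may register the op differently:

```python
# Hypothetical registration of a "silly::attention" custom op, only to
# illustrate the "namespace::name" qualified string used by splitting_ops.
import torch


@torch.library.custom_op("silly::attention", mutates_args=("out",))
def attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out: torch.Tensor) -> None:
    # Toy body: piecewise-compilation tests only need an op boundary,
    # not a real attention kernel.
    out.copy_(q + k + v)


# The op is now addressable by its qualified name, which is what
# splitting_ops=["silly::attention"] refers to.
print(torch.ops.silly.attention)  # OpOverloadPacket for silly::attention
```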

tests/compile/piecewise/test_simple.py

Lines changed: 2 additions & 2 deletions
@@ -127,7 +127,7 @@ def _run_simple_model(
 @torch.inference_mode()
 def test_simple_piecewise_compile(use_inductor):
     _run_simple_model(
-        splitting_ops=["silly.attention"],
+        splitting_ops=["silly::attention"],
         use_inductor_graph_partition=False,
         use_inductor=use_inductor,
         # 2 * num_layers + 1
@@ -142,7 +142,7 @@ def test_simple_piecewise_compile(use_inductor):


 @torch.inference_mode()
-@pytest.mark.parametrize("splitting_ops", [["silly.attention"], []])
+@pytest.mark.parametrize("splitting_ops", [["silly::attention"], []])
 def test_simple_inductor_graph_partition(splitting_ops, monkeypatch):
     if not is_torch_equal_or_newer("2.9.0.dev"):
         pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

tests/compile/piecewise/test_toy_llama.py

Lines changed: 2 additions & 2 deletions
@@ -268,7 +268,7 @@ def run_model(
         cudagraph_capture_sizes=[1, 2],
     )
     if split_attn:
-        compilation_config.splitting_ops = ["silly.attention"]
+        compilation_config.splitting_ops = ["silly::attention"]
         cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
     else:
         compilation_config = CompilationConfig(
@@ -438,7 +438,7 @@ def benchmark():
         compilation_config = CompilationConfig(
             level=CompilationLevel.PIECEWISE,
             use_cudagraph=True,
-            splitting_ops=["silly.attention"],
+            splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=cudagraph_sizes,
         )
     else:

tests/compile/test_config.py

Lines changed: 53 additions & 29 deletions
@@ -4,10 +4,12 @@

 from vllm.compilation.counter import compilation_counter
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
-from vllm.utils import _is_torch_equal_or_newer
+from vllm.config.compilation import CompilationLevel
+from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer


 def test_version():
+    # Test the version comparison logic using the private function
     assert _is_torch_equal_or_newer("2.8.0.dev20250624+cu128", "2.8.0.dev")
     assert _is_torch_equal_or_newer("2.8.0a0+gitc82a174", "2.8.0.dev")
     assert _is_torch_equal_or_newer("2.8.0", "2.8.0.dev")
@@ -17,6 +19,9 @@ def test_version():

 def test_use_cudagraphs_dynamic():
     vllm_config = VllmConfig()
+    # Default V1 configuration now starts without cudagraphs enabled; the
+    # engine decides when to capture based on runtime settings instead of a
+    # blanket default.
     assert vllm_config.compilation_config.use_cudagraph


@@ -137,58 +142,77 @@ def test_enforce_eager(vllm_runner, monkeypatch):
 def test_splitting_ops_dynamic():
     # Default config
     config = VllmConfig()
-    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE
-    assert config.compilation_config.splitting_ops_contain_attention()
+    # Default V1 config leaves cudagraph mode unset; splitting ops are only
+    # populated when the engine decides to use piecewise compilation.
+    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
+    assert not config.compilation_config.splitting_ops_contain_attention()

     # When use_inductor_graph_partition=True
-    if _is_torch_equal_or_newer("2.9.0.dev"):
-        # inductor graph partition is only available in PyTorch 2.9+.
-        # this is a fast config check so we are not using pytest.skip.
+    if is_torch_equal_or_newer("2.9.0.dev"):
         config = VllmConfig(
             compilation_config=CompilationConfig(
-                use_inductor_graph_partition=True, splitting_ops=["silly_attention"]
+                level=CompilationLevel.PIECEWISE,
+                use_inductor_graph_partition=True,
+                splitting_ops=["vllm::unified_attention"],
             )
         )
-        # should ignore splitting_ops
-        assert config.compilation_config.splitting_ops == []
+        # with inductor partition we use splitting_ops directly for
+        # partition rules
+        assert config.compilation_config.splitting_ops == ["vllm::unified_attention"]

-    # When attn_fusion pass enabled.
+    # When attn_fusion pass enabled, splitting_ops now default to attention ops.
     config = VllmConfig(
         compilation_config=CompilationConfig(
+            level=CompilationLevel.PIECEWISE,
            pass_config={"enable_attn_fusion": True, "enable_noop": True},
            custom_ops=["+quant_fp8"],
            cudagraph_mode=CUDAGraphMode.PIECEWISE,
        )
    )
-    assert config.compilation_config.splitting_ops == []
-    # cudagraph mode also fall back to FULL
-    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL
-
-    # splitting_ops can not contain attention ops when attn_fusion
-    # pass enabled.
-    with pytest.raises(AssertionError):
-        config = VllmConfig(
-            compilation_config=CompilationConfig(
-                pass_config={"enable_attn_fusion": True, "enable_noop": True},
-                custom_ops=["+quant_fp8"],
-                cudagraph_mode=CUDAGraphMode.PIECEWISE,
-                # work around for accessing all attntion ops
-                splitting_ops=CompilationConfig()._attention_ops,
-            )
-        )
+    # With the new simplified logic, attention fusion works with splitting_ops
+    assert config.compilation_config.splitting_ops_contain_attention()
+    # cudagraph mode remains PIECEWISE
+    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE

     # When both use_inductor_graph_partition and attn_fusion pass enabled.
-    if _is_torch_equal_or_newer("2.9.0.dev"):
+    if is_torch_equal_or_newer("2.9.0.dev"):
         config = VllmConfig(
             compilation_config=CompilationConfig(
+                level=CompilationLevel.PIECEWISE,
                 use_inductor_graph_partition=True,
                 pass_config={"enable_attn_fusion": True, "enable_noop": True},
                 custom_ops=["+quant_fp8"],
                 cudagraph_mode=CUDAGraphMode.PIECEWISE,
             )
         )
-        assert config.compilation_config.splitting_ops == []
-        # enable_attn_fusion is directly support under
+        # With inductor graph partition, attn_fusion and splitting_ops
+        # work together. Default splitting_ops include attention ops.
+        assert config.compilation_config.splitting_ops_contain_attention()
+        # enable_attn_fusion is directly supported under
         # use_inductor_graph_partition=True, and cudagraph_mode
         # is unchanged.
         assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
+
+
+def test_resolve_operator_overload():
+    import torch
+
+    from vllm.compilation.partition_rules import resolve_defined_ops
+
+    # Test valid operator names
+    resolved = resolve_defined_ops(["aten::mm.default", "aten::addmm.default"])
+    assert len(resolved) == 2
+    assert resolved[0] is torch.ops.aten.mm.default
+    assert resolved[1] is torch.ops.aten.addmm.default
+
+    # Test that invalid operators are skipped (not raising exceptions)
+    resolved = resolve_defined_ops(
+        [
+            "aten::mm.default",
+            "aten::nonexistent_op.default",  # This should be skipped
+            "aten::addmm.default",
+        ]
+    )
+    assert len(resolved) == 2  # Only 2 valid ops
+    assert resolved[0] is torch.ops.aten.mm.default
+    assert resolved[1] is torch.ops.aten.addmm.default
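
The new test exercises a helper that turns qualified strings such as "aten::mm.default" into `torch._ops.OpOverload` objects and silently drops names that are not registered. A rough sketch of that behavior, for readers unfamiliar with overload resolution (the actual `resolve_defined_ops` in `vllm.compilation.partition_rules` may be implemented differently):

```python
# Illustrative-only resolver: maps "namespace::name.overload" strings to
# OpOverload objects via torch.ops, skipping anything that is not registered.
import torch


def resolve_defined_ops_sketch(op_names: list[str]) -> list:
    resolved = []
    for qualname in op_names:
        namespace, _, rest = qualname.partition("::")
        op_name, _, overload = rest.partition(".")
        try:
            packet = getattr(getattr(torch.ops, namespace), op_name)
            resolved.append(getattr(packet, overload or "default"))
        except AttributeError:
            # Unregistered ops are skipped instead of raising, mirroring the test.
            continue
    return resolved


assert resolve_defined_ops_sketch(
    ["aten::mm.default", "aten::no_such_op.default"]
) == [torch.ops.aten.mm.default]
```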

tests/compile/test_decorator.py

Lines changed: 3 additions & 3 deletions
@@ -71,7 +71,7 @@ def test_ignore_torch_compile_decorator():
         compilation_config=CompilationConfig(
             level=CompilationLevel.PIECEWISE,
             use_cudagraph=True,
-            splitting_ops=["silly.attention"],
+            splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
         )
     )
@@ -186,7 +186,7 @@ def test_conditional_compile_enable_if():
         compilation_config=CompilationConfig(
             level=CompilationLevel.PIECEWISE,
             use_cudagraph=True,
-            splitting_ops=["silly.attention"],
+            splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
         ),
     )
@@ -218,7 +218,7 @@ def test_conditional_compile_enable_if():
         compilation_config=CompilationConfig(
             level=CompilationLevel.PIECEWISE,
             use_cudagraph=True,
-            splitting_ops=["silly.attention"],
+            splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
         ),
     )

vllm/compilation/backends.py

Lines changed: 44 additions & 8 deletions
@@ -16,6 +16,11 @@
 from torch._dispatch.python import enable_python_dispatcher

 import vllm.envs as envs
+from vllm.compilation.inductor_pass import pass_context
+from vllm.compilation.partition_rules import (
+    inductor_partition_rule_context,
+    resolve_defined_ops,
+)
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
@@ -78,6 +83,21 @@ def __init__(self, compilation_config: CompilationConfig):
     def compute_hash(self, vllm_config: VllmConfig) -> str:
         return self.compiler.compute_hash(vllm_config)

+    @contextmanager
+    def compile_context(self, runtime_shape: Optional[int] = None):
+        """Provide compilation context for the duration of compilation to set
+        any torch global properties we want to scope to a single Inductor
+        compilation (e.g. partition rules, pass context)."""
+        with pass_context(runtime_shape):
+            if self.compilation_config.use_inductor_graph_partition:
+                inductor_partition_ops = resolve_defined_ops(
+                    self.compilation_config.splitting_ops
+                )
+                with inductor_partition_rule_context(inductor_partition_ops):
+                    yield
+            else:
+                yield
+
     def initialize_cache(
         self, cache_dir: str, disable_cache: bool = False, prefix: str = ""
     ):
@@ -200,9 +220,15 @@ def compile(
             maybe_key = None
         else:
             maybe_key = f"artifact_shape_{runtime_shape}_subgraph_{graph_index}"
-        compiled_graph, handle = self.compiler.compile(
-            graph, example_inputs, additional_inductor_config, runtime_shape, maybe_key
-        )
+
+        with self.compile_context(runtime_shape):
+            compiled_graph, handle = self.compiler.compile(
+                graph,
+                example_inputs,
+                additional_inductor_config,
+                runtime_shape,
+                maybe_key,
+            )

         assert compiled_graph is not None, "Failed to compile the graph"

@@ -261,7 +287,7 @@ class SplitItem:


 def split_graph(
-    graph: fx.GraphModule, ops: list[str]
+    graph: fx.GraphModule, resolved_ops: list[torch._ops.OpOverload]
 ) -> tuple[fx.GraphModule, list[SplitItem]]:
     # split graph by ops
     subgraph_id = 0
@@ -270,7 +296,12 @@
     for node in graph.graph.nodes:
         if node.op in ("output", "placeholder"):
             continue
-        if node.op == "call_function" and str(node.target) in ops:
+        # Match node.target against resolved_ops
+        # node.target can be OpOverloadPacket, need to check .default
+        if node.op == "call_function" and (
+            node.target in resolved_ops
+            or (hasattr(node.target, "default") and node.target.default in resolved_ops)
+        ):
             subgraph_id += 1
             node_to_subgraph_id[node] = subgraph_id
             split_op_graphs.append(subgraph_id)
@@ -594,9 +625,14 @@ def __call__(
         self.graph = graph
         self.configure_post_pass()

-        self.split_gm, self.piecewise_graphs = split_graph(
-            graph, self.compilation_config.splitting_ops
-        )
+        if self.compilation_config.use_inductor_graph_partition:
+            # Let Inductor decide partitioning; avoid FX-level pre-splitting.
+            fx_split_ops: list[str] = []
+        else:
+            fx_split_ops = self.compilation_config.splitting_ops or []
+
+        resolved_split_ops = resolve_defined_ops(fx_split_ops)
+        self.split_gm, self.piecewise_graphs = split_graph(graph, resolved_split_ops)

         from torch._dynamo.utils import lazy_format_graph_code
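
The matching condition in `split_graph` now has to handle two cases: an FX node whose `target` is already a specific `OpOverload` (e.g. `torch.ops.aten.mm.default`) and one whose `target` is the `OpOverloadPacket` (`torch.ops.aten.mm`), whose `.default` overload is what was resolved. A small stand-alone illustration of that rule, using `aten::mm` as a stand-in splitting op (the helper name is only for this example):

```python
# Stand-alone illustration of the new matching rule in split_graph();
# resolved_ops and is_splitting_target exist only for this sketch.
import torch

resolved_ops = [torch.ops.aten.mm.default]


def is_splitting_target(target) -> bool:
    return target in resolved_ops or (
        hasattr(target, "default") and target.default in resolved_ops
    )


assert is_splitting_target(torch.ops.aten.mm.default)    # node.target is an OpOverload
assert is_splitting_target(torch.ops.aten.mm)            # node.target is an OpOverloadPacket
assert not is_splitting_target(torch.ops.aten.addmm.default)
```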

vllm/compilation/compiler_interface.py

Lines changed: 12 additions & 16 deletions
@@ -17,8 +17,6 @@
 from vllm.config import VllmConfig
 from vllm.utils import is_torch_equal_or_newer

-from .inductor_pass import pass_context
-

 class CompilerInterface:
     """
@@ -210,13 +208,12 @@ def compile(

         from torch._inductor import standalone_compile

-        with pass_context(runtime_shape):
-            compiled_graph = standalone_compile(
-                graph,
-                example_inputs,
-                dynamic_shapes=dynamic_shapes,
-                options={"config_patches": current_config},
-            )
+        compiled_graph = standalone_compile(
+            graph,
+            example_inputs,
+            dynamic_shapes=dynamic_shapes,
+            options={"config_patches": current_config},
+        )

         # Save the compiled artifact to disk in the specified path
         assert key is not None
@@ -464,13 +461,12 @@ def _get_shape_env() -> AlwaysHitShapeEnv:
             torch._functorch.config.patch(enable_remote_autograd_cache=False)
         )

-        with pass_context(runtime_shape):
-            compiled_graph = compile_fx(
-                graph,
-                example_inputs,
-                inner_compile=hijacked_compile_fx_inner,
-                config_patches=current_config,
-            )
+        compiled_graph = compile_fx(
+            graph,
+            example_inputs,
+            inner_compile=hijacked_compile_fx_inner,
+            config_patches=current_config,
+        )

         # We treat VLLM_DISABLE_COMPILE_CACHE as the overall switch for torch
         # compilation cache. So turn off the checks if we disable the
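
The `pass_context(runtime_shape)` wrapper removed here is not gone: it moved up into `CompilerManager.compile_context()` in backends.py, so per-compilation state is entered once around `self.compiler.compile(...)` rather than separately inside each compiler backend. A self-contained sketch of that nesting, with stub context managers standing in for `pass_context` and `inductor_partition_rule_context`:

```python
# Sketch of the control flow only; the stubs below are not the real vLLM
# context managers, they just show how the contexts nest around one compile.
from contextlib import contextmanager
from typing import Optional


@contextmanager
def pass_context_stub(runtime_shape: Optional[int]):
    print(f"entering pass context (runtime_shape={runtime_shape})")
    yield


@contextmanager
def partition_rules_stub(ops):
    print(f"entering inductor partition rules for {ops}")
    yield


@contextmanager
def compile_context(runtime_shape=None, use_inductor_graph_partition=False, splitting_ops=()):
    with pass_context_stub(runtime_shape):
        if use_inductor_graph_partition:
            with partition_rules_stub(list(splitting_ops)):
                yield
        else:
            yield


# One compiler.compile(...) call would run inside this block.
with compile_context(runtime_shape=8, use_inductor_graph_partition=True,
                     splitting_ops=["vllm::unified_attention"]):
    pass
```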
