vllm-project
diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py
Lines changed: 106 additions & 28 deletions
diff --git a/tools/pre_commit/check_pickle_imports.py b/tools/pre_commit/check_pickle_imports.py
Lines changed: 1 addition & 0 deletions
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
Lines changed: 16 additions & 64 deletions
@@ -1,61 +1,139 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import tempfile
 from contextlib import contextmanager
 
 import pytest
 import torch
 
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
-                         set_current_vllm_config)
+from vllm.config import (
+    CompilationConfig,
+    CompilationLevel,
+    VllmConfig,
+    set_current_vllm_config,
+)
 from vllm.forward_context import set_forward_context
+from vllm.utils import is_torch_equal_or_newer
 
 
-class MyMod(torch.nn.Module):
+def reference_fn(x: torch.Tensor):
+    assert x.shape[0] <= 42
+    assert x.shape[0] % 2 == 0
+    for _ in range(3000):
+        x = x + x.shape[0]
+    return x
 
+
+@support_torch_compile
+class CompiledMod(torch.nn.Module):
     def __init__(self, **kwargs):
         super().__init__()
 
     def forward(self, x: torch.Tensor):
-        for _ in range(3000):
-            x = x + x.shape[0]
-        return x
+        return reference_fn(x)
 
 
 def make_vllm_config() -> VllmConfig:
-    return VllmConfig(compilation_config=CompilationConfig(
-        level=CompilationLevel.PIECEWISE, ))
+    return VllmConfig(
+        compilation_config=CompilationConfig(
+            level=CompilationLevel.PIECEWISE,
+        )
+    )
 
 
 @contextmanager
 def use_vllm_config(vllm_config: VllmConfig):
-    with set_forward_context(
-        {}, vllm_config), set_current_vllm_config(vllm_config):
+    with set_forward_context({}, vllm_config), set_current_vllm_config(vllm_config):
         yield
 
 
-def test_no_eval_frame(monkeypatch: pytest.MonkeyPatch):
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
+def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):
     with monkeypatch.context() as m:
-        mod = MyMod()
-        args = (torch.randn(10, 10), )
-        expected = mod(*args)
-        CompiledMod = support_torch_compile(MyMod)
-
         vllm_config = make_vllm_config()
-        m.setenv("VLLM_USE_AOT_COMPILE", "0")
-        try:
-            with use_vllm_config(vllm_config), torch.compiler.set_stance(
-                    "fail_on_recompile"):
+        args = (torch.randn(10, 10),)
+        expected = reference_fn(*args)
+        with use_vllm_config(vllm_config):
+            m.setenv("VLLM_USE_AOT_COMPILE", "0")
+            with (
+                pytest.raises(RuntimeError, match="Detected recompile"),
+                torch.compiler.set_stance("fail_on_recompile"),
+            ):
                 CompiledMod(vllm_config=vllm_config)(*args)
-        except RuntimeError as e:
-            assert "Detected recompile" in str(e)
-        else:
-            raise AssertionError("Expected exception to be raised")
 
+            m.setenv("VLLM_USE_AOT_COMPILE", "1")
+            torch._dynamo.reset()
+            with torch.compiler.set_stance("fail_on_recompile"):
+                actual = CompiledMod(vllm_config=vllm_config)(*args)
+            assert torch.allclose(actual, expected)
+
+
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
+def test_force_aot_load(monkeypatch: pytest.MonkeyPatch):
+    with tempfile.TemporaryDirectory() as tmpdirname, monkeypatch.context() as m:
+        args = (torch.randn(10, 10),)
         m.setenv("VLLM_USE_AOT_COMPILE", "1")
-        torch._dynamo.reset()
-        with use_vllm_config(vllm_config), torch.compiler.set_stance(
-                "fail_on_recompile"):
-            ret = CompiledMod(vllm_config=vllm_config)(*args)
+        m.setenv("VLLM_FORCE_AOT_LOAD", "1")
+        m.setenv("VLLM_CACHE_ROOT", tmpdirname)
+        vllm_config = make_vllm_config()
+        with use_vllm_config(vllm_config), pytest.raises(FileNotFoundError):
+            CompiledMod(vllm_config=vllm_config)(*args)
+
+
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
+def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        args = (torch.randn(10, 10),)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            m.setenv("VLLM_CACHE_ROOT", tmpdirname)
+            m.setenv("VLLM_USE_AOT_COMPILE", "1")
+            vllm_config = make_vllm_config()
+            with use_vllm_config(vllm_config):
+                expected = CompiledMod(vllm_config=vllm_config)(*args)
+
+            m.setenv("VLLM_FORCE_AOT_LOAD", "1")
+            vllm_config = make_vllm_config()
+            with use_vllm_config(vllm_config):
+                ret = CompiledMod(vllm_config=vllm_config)(*args)
             assert torch.allclose(ret, expected)
+
+
+@pytest.mark.skipif(
+    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
+def test_shape_env(monkeypatch: pytest.MonkeyPatch):
+    """
+    Test that the shape environment is correctly serialized and preserved
+    when loading from cache.
+    """
+    with monkeypatch.context() as m:
+        args = (torch.randn(10, 10),)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            m.setenv("VLLM_CACHE_ROOT", tmpdirname)
+            m.setenv("VLLM_USE_AOT_COMPILE", "1")
+            vllm_config = make_vllm_config()
+            with use_vllm_config(vllm_config):
+                compiled_mod = CompiledMod(vllm_config=vllm_config)
+                compiled_mod(*args)
+                artifacts = compiled_mod.aot_compiled_fn._artifacts
+                guards_string = artifacts.compiled_fn.shape_env.format_guards()
+                assert guards_string == " - s77 <= 42\n - Eq(Mod(s77, 2), 0)"
+
+            m.setenv("VLLM_FORCE_AOT_LOAD", "1")
+            vllm_config = make_vllm_config()
+            with use_vllm_config(vllm_config):
+                compiled_mod = CompiledMod(vllm_config=vllm_config)
+                compiled_mod(*args)
+                artifacts = compiled_mod.aot_compiled_fn._artifacts
+                guards_string = artifacts.compiled_fn.shape_env.format_guards()
+                assert guards_string == " - s77 <= 42\n - Eq(Mod(s77, 2), 0)"
@@ -22,6 +22,7 @@
     "vllm/multimodal/hasher.py",
     "vllm/transformers_utils/config.py",
     "vllm/model_executor/models/registry.py",
+    "vllm/compilation/caching.py",
     "tests/utils_/test_utils.py",
     "tests/tokenization/test_cached_tokenizer.py",
     "vllm/distributed/utils.py",
 
@@ -3,6 +3,7 @@
 
 import ast
 import dataclasses
+import hashlib
 import os
 import pprint
 import time
@@ -25,6 +26,7 @@
 from vllm.platforms import current_platform
 from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname
 
+from .caching import VllmSerializableFunction
 from .compiler_interface import (
     CompilerInterface,
     EagerAdaptor,
@@ -195,6 +197,7 @@ def compile(
                 # there can be multiple graphs due to piecewise compilation.
                 now = time.time()
                 elapsed = now - compilation_start_time
+                compilation_config.compilation_time += elapsed
                 if runtime_shape is None:
                     logger.info(
                         "Directly load the compiled graph(s) for dynamic shape "
@@ -472,35 +475,6 @@ def set_model_tag(tag: str):
         model_tag = old_tag
 
 
-try:
-    from torch._dynamo.aot_compile import SerializableCallable
-except ImportError:
-    SerializableCallable = object
-
-assert isinstance(SerializableCallable, type)
-
-
-class VllmCompiledFunction(SerializableCallable):
-
-    def __init__(self, graph_module, example_inputs, vllm_config,
-                 optimized_call):
-        self.graph_module = graph_module
-        self.example_inputs = example_inputs
-        self.vllm_config = vllm_config
-        self.optimized_call = optimized_call
-
-    def __call__(self, *args, **kwargs):
-        return self.optimized_call(*args, **kwargs)
-
-    @classmethod
-    def serialize_compile_artifacts(cls, compiled_fn):
-        raise NotImplementedError("serialization not implemented")
-
-    @classmethod
-    def deserialize_compile_artifacts(cls, data):
-        raise NotImplementedError("deserialization not implemented")
-
-
 class VllmBackend:
     """The compilation backend for `torch.compile` with vLLM.
     It is used for compilation level of `CompilationLevel.PIECEWISE`,
@@ -578,47 +552,23 @@ def configure_post_pass(self):
                 self.post_grad_pass_manager.add(inductor_config[PASS_KEY])
         inductor_config[PASS_KEY] = self.post_grad_pass_manager
 
-    def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
+    def __call__(
+        self, graph: fx.GraphModule, example_inputs
+    ) -> VllmSerializableFunction:
+        from .caching import _compute_code_hash, compilation_config_hash_factors
+
         vllm_config = self.vllm_config
         if not self.compilation_config.cache_dir:
             # no provided cache dir, generate one based on the known factors
             # that affects the compilation. if none of the factors change,
             # the cache dir will be the same so that we can reuse the compiled
             # graph.
 
-            factors = []
-            # 0. factors come from the env, for example, The values of
-            # VLLM_PP_LAYER_PARTITION will affect the computation graph.
-            env_hash = envs.compute_hash()
-            factors.append(env_hash)
-
-            # 1. factors come from the vllm_config (it mainly summarizes how the
-            #    model is created)
-            config_hash = vllm_config.compute_hash()
-            factors.append(config_hash)
-
+            factors = compilation_config_hash_factors(vllm_config)
             # 2. factors come from the code files that are traced by Dynamo (
             #    it mainly summarizes how the model is used in forward pass)
-            forward_code_files = list(sorted(self.compilation_config.traced_files))
+            code_hash = _compute_code_hash(self.compilation_config.traced_files)
             self.compilation_config.traced_files.clear()
-            logger.debug(
-                "Traced files (to be considered for compilation cache):\n%s",
-                "\n".join(forward_code_files),
-            )
-            hash_content = []
-            for filepath in forward_code_files:
-                hash_content.append(filepath)
-                if filepath == "<string>":
-                    # This means the function was dynamically generated, with
-                    # e.g. exec(). We can't actually check these.
-                    continue
-                with open(filepath) as f:
-                    hash_content.append(f.read())
-            import hashlib
-
-            code_hash = hashlib.md5(
-                "\n".join(hash_content).encode(), usedforsecurity=False
-            ).hexdigest()
             factors.append(code_hash)
 
             # 3. compiler hash
@@ -724,8 +674,9 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
             self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
             or not self.compilation_config.cudagraph_copy_inputs
         ):
-            return VllmCompiledFunction(graph, example_inputs, vllm_config,
-                                        self.split_gm)
+            return VllmSerializableFunction(
+                graph, example_inputs, self.prefix, self.split_gm
+            )
 
         # if we need to copy input buffers for cudagraph
         from torch._guards import detect_fake_mode
@@ -770,5 +721,6 @@ def copy_and_call(*args):
                 list_args[index] = static_tensor
             return self.split_gm(*list_args)
 
-        return VllmCompiledFunction(graph, example_inputs, vllm_config,
-                                    copy_and_call)
+        return VllmSerializableFunction(
+            graph, example_inputs, self.prefix, copy_and_call
+        )