vllm-project
diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py
Lines changed: 82 additions & 22 deletions
diff --git a/tools/pre_commit/check_pickle_imports.py b/tools/pre_commit/check_pickle_imports.py
Lines changed: 1 addition & 0 deletions
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
Lines changed: 16 additions & 62 deletions
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import tempfile
 from contextlib import contextmanager
 
 import pytest
@@ -12,15 +13,22 @@
 from vllm.forward_context import set_forward_context
 
 
-class MyMod(torch.nn.Module):
+def reference_fn(x: torch.Tensor):  # eager reference used by the tests
+    assert x.shape[0] <= 42  # test_shape_env expects guard `s77 <= 42`
+    assert x.shape[0] % 2 == 0  # ... and guard `Eq(Mod(s77, 2), 0)`
+    for _ in range(3000):  # same loop as the removed MyMod.forward
+        x = x + x.shape[0]  # add size of dim 0 each pass
+    return x
+
+
+@support_torch_compile  # applied once here, not inside a test
+class CompiledMod(torch.nn.Module):
 
     def __init__(self, **kwargs):
         super().__init__()
 
     def forward(self, x: torch.Tensor):
-        for _ in range(3000):
-            x = x + x.shape[0]
-        return x
+        return reference_fn(x)  # shared with the eager baseline
 
 
 def make_vllm_config() -> VllmConfig:
@@ -30,32 +38,84 @@ def make_vllm_config() -> VllmConfig:
 
 @contextmanager
 def use_vllm_config(vllm_config: VllmConfig):
-    with set_forward_context(
-        {}, vllm_config), set_current_vllm_config(vllm_config):
+    with set_forward_context({}, vllm_config), \
+        set_current_vllm_config(vllm_config):  # both stay active at yield
         yield
 
 
-def test_no_eval_frame(monkeypatch: pytest.MonkeyPatch):
+def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):
     with monkeypatch.context() as m:
-        mod = MyMod()
+        vllm_config = make_vllm_config()
         args = (torch.randn(10, 10), )
-        expected = mod(*args)
-        CompiledMod = support_torch_compile(MyMod)
+        expected = reference_fn(*args)  # eager baseline
+        with use_vllm_config(vllm_config):
+            m.setenv("VLLM_USE_AOT_COMPILE", "0")  # AOT off: raise expected
+            with pytest.raises(RuntimeError, match="Detected recompile"), \
+                torch.compiler.set_stance("fail_on_recompile"):
+                CompiledMod(vllm_config=vllm_config)(*args)
 
-        vllm_config = make_vllm_config()
-        m.setenv("VLLM_USE_AOT_COMPILE", "0")
-        try:
+            m.setenv("VLLM_USE_AOT_COMPILE", "1")  # AOT on: must not raise
+            torch._dynamo.reset()  # NOTE: next with re-enters use_vllm_config
             with use_vllm_config(vllm_config), torch.compiler.set_stance(
                     "fail_on_recompile"):
-                CompiledMod(vllm_config=vllm_config)(*args)
-        except RuntimeError as e:
-            assert "Detected recompile" in str(e)
-        else:
-            raise AssertionError("Expected exception to be raised")
+                actual = CompiledMod(vllm_config=vllm_config)(*args)
+            assert torch.allclose(actual, expected)
 
+
+def test_force_aot_load(monkeypatch: pytest.MonkeyPatch):  # empty cache case
+    with tempfile.TemporaryDirectory() as tmpdirname, monkeypatch.context(
+    ) as m:
+        args = (torch.randn(10, 10), )
         m.setenv("VLLM_USE_AOT_COMPILE", "1")
-        torch._dynamo.reset()
-        with use_vllm_config(vllm_config), torch.compiler.set_stance(
-                "fail_on_recompile"):
-            ret = CompiledMod(vllm_config=vllm_config)(*args)
+        m.setenv("VLLM_FORCE_AOT_LOAD", "1")
+        m.setenv("VLLM_CACHE_ROOT", tmpdirname)  # fresh, empty cache dir
+        vllm_config = make_vllm_config()
+        with use_vllm_config(vllm_config), pytest.raises(FileNotFoundError):
+            CompiledMod(vllm_config=vllm_config)(*args)  # nothing to load
+
+
+def test_save_and_load(monkeypatch: pytest.MonkeyPatch):  # round trip
+    with monkeypatch.context() as m:
+        args = (torch.randn(10, 10), )
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            m.setenv("VLLM_CACHE_ROOT", tmpdirname)  # both runs share this dir
+            m.setenv("VLLM_USE_AOT_COMPILE", "1")
+            vllm_config = make_vllm_config()
+            with use_vllm_config(vllm_config):
+                expected = CompiledMod(vllm_config=vllm_config)(*args)  # run 1
+
+            m.setenv("VLLM_FORCE_AOT_LOAD", "1")  # run 2 uses forced AOT load
+            vllm_config = make_vllm_config()
+            with use_vllm_config(vllm_config):
+                ret = CompiledMod(vllm_config=vllm_config)(*args)  # run 2
             assert torch.allclose(ret, expected)
+
+
+def test_shape_env(monkeypatch: pytest.MonkeyPatch):  # NOTE: pins symbol `s77`
+    """
+    Test that the shape environment is correctly serialized and preserved
+    when loading from cache.
+    """
+    with monkeypatch.context() as m:
+        args = (torch.randn(10, 10), )  # dim 0 = 10: passes both asserts
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            m.setenv("VLLM_CACHE_ROOT", tmpdirname)
+            m.setenv("VLLM_USE_AOT_COMPILE", "1")
+            vllm_config = make_vllm_config()
+            with use_vllm_config(vllm_config):
+                compiled_mod = CompiledMod(vllm_config=vllm_config)
+                compiled_mod(*args)  # first call, no forced load
+                artifacts = compiled_mod.aot_compiled_fn._artifacts  # private
+                guards_string = artifacts.compiled_fn.shape_env.format_guards()
+                assert guards_string == " - s77 <= 42\n - Eq(Mod(s77, 2), 0)"
+
+            m.setenv("VLLM_FORCE_AOT_LOAD", "1")  # now repeat via forced load
+            vllm_config = make_vllm_config()
+            with use_vllm_config(vllm_config):
+                compiled_mod = CompiledMod(vllm_config=vllm_config)
+                compiled_mod(*args)  # second call, forced load
+                artifacts = compiled_mod.aot_compiled_fn._artifacts
+                guards_string = artifacts.compiled_fn.shape_env.format_guards()
+                assert guards_string == " - s77 <= 42\n - Eq(Mod(s77, 2), 0)"
@@ -22,6 +22,7 @@
     'vllm/multimodal/hasher.py',
     'vllm/transformers_utils/config.py',
     'vllm/model_executor/models/registry.py',
+    "vllm/compilation/caching.py",
     'tests/utils_/test_utils.py',
     'tests/tokenization/test_cached_tokenizer.py',
     'vllm/distributed/utils.py',
 
@@ -3,6 +3,7 @@
 
 import ast
 import dataclasses
+import hashlib
 import os
 import pprint
 import time
@@ -20,6 +21,7 @@
 from vllm.platforms import current_platform
 from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname
 
+from .caching import VllmSerializableFunction
 from .compiler_interface import (CompilerInterface, EagerAdaptor,
                                  InductorAdaptor, InductorStandaloneAdaptor)
 from .counter import compilation_counter
@@ -160,6 +162,7 @@ def compile(self,
                 # there can be multiple graphs due to piecewise compilation.
                 now = time.time()
                 elapsed = now - compilation_start_time
+                compilation_config.compilation_time += elapsed
                 if runtime_shape is None:
                     logger.info(
                         "Directly load the compiled graph(s) for dynamic shape "
@@ -398,35 +401,6 @@ def set_model_tag(tag: str):
         model_tag = old_tag
 
 
-try:
-    from torch._dynamo.aot_compile import SerializableCallable
-except ImportError:
-    SerializableCallable = object
-
-assert isinstance(SerializableCallable, type)
-
-
-class VllmCompiledFunction(SerializableCallable):
-
-    def __init__(self, graph_module, example_inputs, vllm_config,
-                 optimized_call):
-        self.graph_module = graph_module
-        self.example_inputs = example_inputs
-        self.vllm_config = vllm_config
-        self.optimized_call = optimized_call
-
-    def __call__(self, *args, **kwargs):
-        return self.optimized_call(*args, **kwargs)
-
-    @classmethod
-    def serialize_compile_artifacts(cls, compiled_fn):
-        raise NotImplementedError("serialization not implemented")
-
-    @classmethod
-    def deserialize_compile_artifacts(cls, data):
-        raise NotImplementedError("deserialization not implemented")
-
-
 class VllmBackend:
     """The compilation backend for `torch.compile` with vLLM.
     It is used for compilation level of `CompilationLevel.PIECEWISE`,
@@ -502,7 +476,11 @@ def configure_post_pass(self):
                 self.post_grad_pass_manager.add(inductor_config[PASS_KEY])
         inductor_config[PASS_KEY] = self.post_grad_pass_manager
 
-    def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
+    def __call__(self, graph: fx.GraphModule,
+                 example_inputs) -> VllmSerializableFunction:
+
+        from .caching import (_compute_code_hash,
+                              compilation_config_hash_factors)
 
         vllm_config = self.vllm_config
         if not self.compilation_config.cache_dir:
@@ -511,37 +489,13 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
             # the cache dir will be the same so that we can reuse the compiled
             # graph.
 
-            factors = []
-            # 0. factors come from the env, for example, The values of
-            # VLLM_PP_LAYER_PARTITION will affect the computation graph.
-            env_hash = envs.compute_hash()
-            factors.append(env_hash)
-
-            # 1. factors come from the vllm_config (it mainly summarizes how the
-            #    model is created)
-            config_hash = vllm_config.compute_hash()
-            factors.append(config_hash)
-
+            factors = compilation_config_hash_factors(vllm_config)
             # 2. factors come from the code files that are traced by Dynamo (
             #    it mainly summarizes how the model is used in forward pass)
-            forward_code_files = list(
-                sorted(self.compilation_config.traced_files))
+            code_hash = _compute_code_hash(
+                self.compilation_config.traced_files)
             self.compilation_config.traced_files.clear()
-            logger.debug(
-                "Traced files (to be considered for compilation cache):\n%s",
-                "\n".join(forward_code_files))
-            hash_content = []
-            for filepath in forward_code_files:
-                hash_content.append(filepath)
-                if filepath == "<string>":
-                    # This means the function was dynamically generated, with
-                    # e.g. exec(). We can't actually check these.
-                    continue
-                with open(filepath) as f:
-                    hash_content.append(f.read())
-            import hashlib
-            code_hash = hashlib.md5("\n".join(hash_content).encode(),
-                                    usedforsecurity=False).hexdigest()
+
             factors.append(code_hash)
 
             # 3. compiler hash
@@ -634,8 +588,8 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
 
         if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE or \
             not self.compilation_config.cudagraph_copy_inputs:
-            return VllmCompiledFunction(graph, example_inputs, vllm_config,
-                                        self.split_gm)
+            return VllmSerializableFunction(graph, example_inputs, self.prefix,
+                                            self.split_gm)
 
         # if we need to copy input buffers for cudagraph
         from torch._guards import detect_fake_mode
@@ -677,5 +631,5 @@ def copy_and_call(*args):
                 list_args[index] = static_tensor
             return self.split_gm(*list_args)
 
-        return VllmCompiledFunction(graph, example_inputs, vllm_config,
-                                    copy_and_call)
+        return VllmSerializableFunction(graph, example_inputs, self.prefix,
+                                        copy_and_call)