From 397ed640327dbfc9ab05023daff6f0123295f8b6 Mon Sep 17 00:00:00 2001 From: Angky William Date: Mon, 31 Mar 2025 20:11:48 -0700 Subject: [PATCH 01/17] Dynamic LoRA plugin Signed-off-by: Angky William --- tests/lora/test_lora_resolver.py | 69 +++++++++++++++++++ vllm/entrypoints/openai/serving_engine.py | 3 + vllm/entrypoints/openai/serving_models.py | 34 ++++++++++ vllm/lora/resolver.py | 81 +++++++++++++++++++++++ 4 files changed, 187 insertions(+) create mode 100644 tests/lora/test_lora_resolver.py create mode 100644 vllm/lora/resolver.py diff --git a/tests/lora/test_lora_resolver.py b/tests/lora/test_lora_resolver.py new file mode 100644 index 000000000000..47adba25df53 --- /dev/null +++ b/tests/lora/test_lora_resolver.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional + +import pytest + +from vllm.lora.request import LoRARequest +from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry + + +class DummyLoRAResolver(LoRAResolver): + """A dummy LoRA resolver for testing.""" + + async def resolve_lora(self, lora_name: str) -> Optional[LoRARequest]: + if lora_name == "test_lora": + return LoRARequest(lora_name=lora_name, + lora_path="/dummy/path", + lora_int_id=abs(hash(lora_name))) + return None + + +def test_resolver_registry_registration(): + """Test basic resolver registration functionality.""" + registry = LoRAResolverRegistry + resolver = DummyLoRAResolver() + + # Register a new resolver + registry.register_resolver("dummy", resolver) + assert "dummy" in registry.get_supported_resolvers() + + # Get registered resolver + retrieved_resolver = registry.get_resolver("dummy") + assert retrieved_resolver is resolver + + +def test_resolver_registry_duplicate_registration(): + """Test registering a resolver with an existing name.""" + registry = LoRAResolverRegistry + resolver1 = DummyLoRAResolver() + resolver2 = DummyLoRAResolver() + + registry.register_resolver("dummy", resolver1) + registry.register_resolver("dummy", resolver2) + + assert registry.get_resolver("dummy") is resolver2 + + +def test_resolver_registry_unknown_resolver(): + """Test getting a non-existent resolver.""" + registry = LoRAResolverRegistry + + with pytest.raises(KeyError, match="not found"): + registry.get_resolver("unknown_resolver") + + +@pytest.mark.asyncio +async def test_dummy_resolver_resolve(): + """Test the dummy resolver's resolve functionality.""" + dummy_resolver = DummyLoRAResolver() + + # Test successful resolution + result = await dummy_resolver.resolve_lora("test_lora") + assert isinstance(result, LoRARequest) + assert result.lora_name == "test_lora" + assert result.lora_path == "/dummy/path" + + # Test failed resolution + result = await dummy_resolver.resolve_lora("nonexistent_lora") + assert result is None diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 7cb4a2dce1dc..d9ad1dbbc983 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -131,6 +131,9 @@ async def _check_model( lora.lora_name for lora in self.models.lora_requests ]: return None + if request.model is not None and (await self.models.resolve_lora( + request.model)): + return None if request.model in [ prompt_adapter.prompt_adapter_name for prompt_adapter in self.models.prompt_adapter_requests diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 7a68452efc65..358b01f8bd00 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ 
b/vllm/entrypoints/openai/serving_models.py @@ -2,6 +2,8 @@ import json import pathlib +from asyncio import Lock +from collections import defaultdict from dataclasses import dataclass from http import HTTPStatus from typing import Optional, Union @@ -15,6 +17,7 @@ UnloadLoRAAdapterRequest) from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.utils import AtomicCounter @@ -68,6 +71,13 @@ def __init__( self.lora_requests: list[LoRARequest] = [] self.lora_id_counter = AtomicCounter(0) + self.lora_resolvers: list[LoRAResolver] = [] + for lora_resolver_name in LoRAResolverRegistry.get_supported_resolvers( + ): + self.lora_resolvers.append( + LoRAResolverRegistry.get_resolver(lora_resolver_name)) + self.lora_resolver_lock: dict[str, Lock] = defaultdict(Lock) + self.prompt_adapter_requests = [] if prompt_adapters is not None: for i, prompt_adapter in enumerate(prompt_adapters, start=1): @@ -234,6 +244,30 @@ async def _check_unload_lora_adapter_request( return None + async def resolve_lora(self, lora_name: str) -> Optional[LoRARequest]: + """Attempt to resolve a LoRA adapter using available resolvers. + + Args: + lora_name: Name/identifier of the LoRA adapter + + Returns: + Optional[LoRARequest]: LoRA request if found, None otherwise + """ + async with self.lora_resolver_lock[lora_name]: + # First check if this LoRA is already loaded + for existing in self.lora_requests: + if existing.lora_name == lora_name: + return existing + + # Try to resolve using available resolvers + for resolver in self.lora_resolvers: + lora_request = await resolver.resolve_lora(lora_name) + if lora_request is not None: + self.lora_requests.append(lora_request) + return lora_request + + return None + def create_error_response( message: str, diff --git a/vllm/lora/resolver.py b/vllm/lora/resolver.py new file mode 100644 index 000000000000..0af3d859d999 --- /dev/null +++ b/vllm/lora/resolver.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: Apache-2.0 + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import AbstractSet, Dict, Optional + +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest + +logger = init_logger(__name__) + + +class LoRAResolver(ABC): + """Base class for LoRA adapter resolvers. + + This class defines the interface for resolving and fetching LoRA adapters. + Implementations of this class should handle the logic for locating and + downloading LoRA adapters from various sources (e.g. S3, cloud storage, + etc.). + """ + + @abstractmethod + async def resolve_lora(self, lora_name: str) -> Optional[LoRARequest]: + """Abstract method to resolve and fetch a LoRA model adapter. + + Implements logic to locate and download LoRA adapter based on the name. + Implementations might fetch from a blob storage or other sources. + + Args: + lora_name: The name or identifier of the LoRA model to resolve. + + Returns: + Optional[LoRARequest]: The resolved LoRA model information, or None + if the LoRA model cannot be found. + """ + pass + + +@dataclass +class _LoRAResolverRegistry: + resolvers: Dict[str, LoRAResolver] = field(default_factory=dict) + + def get_supported_resolvers(self) -> AbstractSet[str]: + """Get all registered resolver names.""" + return self.resolvers.keys() + + def register_resolver( + self, + resolver_name: str, + resolver: LoRAResolver, + ) -> None: + """Register a LoRA resolver. 
+ Args: + resolver_name: Name to register the resolver under. + resolver: The LoRA resolver instance to register. + """ + if resolver_name in self.resolvers: + logger.warning( + "LoRA resolver %s is already registered, and will be " + "overwritten by the new resolver instance %s.", resolver_name, + resolver) + + self.resolvers[resolver_name] = resolver + + def get_resolver(self, resolver_name: str) -> LoRAResolver: + """Get a registered resolver instance by name. + Args: + resolver_name: Name of the resolver to get. + Returns: + The resolver instance. + Raises: + KeyError: If the resolver is not found in the registry. + """ + if resolver_name not in self.resolvers: + raise KeyError( + f"LoRA resolver '{resolver_name}' not found. " + f"Available resolvers: {list(self.resolvers.keys())}") + return self.resolvers[resolver_name] + + +LoRAResolverRegistry = _LoRAResolverRegistry() From 1320607a59c66ca2c1c56b32466bbeef79bd4063 Mon Sep 17 00:00:00 2001 From: Angky William Date: Wed, 9 Apr 2025 16:23:33 -0700 Subject: [PATCH 02/17] Check lora_resolver output before adding to lora_requests Signed-off-by: Angky William --- vllm/entrypoints/openai/serving_models.py | 27 ++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 358b01f8bd00..fb8836563361 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -260,12 +260,33 @@ async def resolve_lora(self, lora_name: str) -> Optional[LoRARequest]: return existing # Try to resolve using available resolvers + unique_id = abs(hash(lora_name)) for resolver in self.lora_resolvers: lora_request = await resolver.resolve_lora(lora_name) - if lora_request is not None: - self.lora_requests.append(lora_request) - return lora_request + if lora_request is not None: + lora_request.lora_int_id = unique_id + + try: + await self.engine_client.add_lora(lora_request) + # Successfully added, append and return + self.lora_requests.append(lora_request) + logger.info( + "Resolved and loaded LoRA adapter '%s' using %s", + lora_name, resolver.__class__.__name__) + return lora_request + except BaseException as e: + # Log the error and try the next resolver + logger.warning( + "Failed to load LoRA '%s' resolved by %s: %s. 
" + "Trying next resolver.", lora_name, + resolver.__class__.__name__, e) + continue # Try the next resolver + + # If no resolver could successfully resolve and load the LoRA + logger.warning( + "Could not resolve or load LoRA adapter '%s' with any " + "available resolver.", lora_name) return None From 6088b3693eae2988a9b08cf54b154142ddcd5a91 Mon Sep 17 00:00:00 2001 From: Angky William Date: Thu, 10 Apr 2025 12:26:28 -0700 Subject: [PATCH 03/17] add base_model_name to LoRaResolver.resolve_lora and make resolve_lora error response more robust Signed-off-by: Angky William --- vllm/entrypoints/openai/serving_engine.py | 15 +++++--- vllm/entrypoints/openai/serving_models.py | 42 ++++++++++++++--------- vllm/lora/resolver.py | 5 ++- 3 files changed, 41 insertions(+), 21 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 8d9e95e2c2fe..e9dd012e2e11 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -125,21 +125,28 @@ async def _check_model( self, request: AnyRequest, ) -> Optional[ErrorResponse]: + + error_response = None + if self._is_model_supported(request.model): return None if request.model in [ lora.lora_name for lora in self.models.lora_requests ]: return None - if request.model is not None and (await self.models.resolve_lora( - request.model)): - return None + if request.model and (load_result := await self.models.resolve_lora(request.model)): + if isinstance(load_result, LoRARequest): + return None + if isinstance(load_result, ErrorResponse) and \ + load_result.code == HTTPStatus.BAD_REQUEST.value: + error_response = load_result if request.model in [ prompt_adapter.prompt_adapter_name for prompt_adapter in self.models.prompt_adapter_requests ]: return None - return self.create_error_response( + + return error_response or self.create_error_response( message=f"The model `{request.model}` does not exist.", err_type="NotFoundError", status_code=HTTPStatus.NOT_FOUND) diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index fb8836563361..5a4b12d4388e 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -19,7 +19,6 @@ from vllm.lora.request import LoRARequest from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.utils import AtomicCounter logger = init_logger(__name__) @@ -66,10 +65,10 @@ def __init__( self.base_model_paths = base_model_paths self.max_model_len = model_config.max_model_len self.engine_client = engine_client + self.model_config = model_config self.static_lora_modules = lora_modules self.lora_requests: list[LoRARequest] = [] - self.lora_id_counter = AtomicCounter(0) self.lora_resolvers: list[LoRAResolver] = [] for lora_resolver_name in LoRAResolverRegistry.get_supported_resolvers( @@ -158,7 +157,7 @@ async def load_lora_adapter( return error_check_ret lora_name, lora_path = request.lora_name, request.lora_path - unique_id = self.lora_id_counter.inc(1) + unique_id = abs(hash(lora_name)) lora_request = LoRARequest(lora_name=lora_name, lora_int_id=unique_id, lora_path=lora_path) @@ -244,14 +243,16 @@ async def _check_unload_lora_adapter_request( return None - async def resolve_lora(self, lora_name: str) -> Optional[LoRARequest]: + async def resolve_lora(self, lora_name: str) -> Union[LoRARequest, ErrorResponse]: """Attempt to resolve a LoRA adapter using available resolvers. 
Args: lora_name: Name/identifier of the LoRA adapter Returns: - Optional[LoRARequest]: LoRA request if found, None otherwise + LoRARequest if found and loaded successfully. + ErrorResponse (404) if no resolver finds the adapter. + ErrorResponse (400) if adapter(s) are found but none load successfully. """ async with self.lora_resolver_lock[lora_name]: # First check if this LoRA is already loaded @@ -259,35 +260,44 @@ async def resolve_lora(self, lora_name: str) -> Optional[LoRARequest]: if existing.lora_name == lora_name: return existing - # Try to resolve using available resolvers + base_model_name = self.model_config.model unique_id = abs(hash(lora_name)) + found_adapter = False + + # Try to resolve using available resolvers for resolver in self.lora_resolvers: - lora_request = await resolver.resolve_lora(lora_name) + lora_request = await resolver.resolve_lora(base_model_name, lora_name) if lora_request is not None: + found_adapter = True lora_request.lora_int_id = unique_id try: await self.engine_client.add_lora(lora_request) - # Successfully added, append and return self.lora_requests.append(lora_request) logger.info( "Resolved and loaded LoRA adapter '%s' using %s", lora_name, resolver.__class__.__name__) return lora_request except BaseException as e: - # Log the error and try the next resolver logger.warning( "Failed to load LoRA '%s' resolved by %s: %s. " "Trying next resolver.", lora_name, resolver.__class__.__name__, e) - continue # Try the next resolver - - # If no resolver could successfully resolve and load the LoRA - logger.warning( - "Could not resolve or load LoRA adapter '%s' with any " - "available resolver.", lora_name) - return None + continue + + if found_adapter: + # An adapter was found by at least one resolver, but all attempts to load it failed. + return create_error_response( + message=f"LoRA adapter '{lora_name}' was found but could not be loaded.", + err_type="BadRequestError", + status_code=HTTPStatus.BAD_REQUEST) + else: + # No adapter was found + return create_error_response( + message=f"LoRA adapter {lora_name} does not exist", + err_type="NotFoundError", + status_code=HTTPStatus.NOT_FOUND) def create_error_response( diff --git a/vllm/lora/resolver.py b/vllm/lora/resolver.py index 0af3d859d999..478d504276f7 100644 --- a/vllm/lora/resolver.py +++ b/vllm/lora/resolver.py @@ -20,13 +20,16 @@ class LoRAResolver(ABC): """ @abstractmethod - async def resolve_lora(self, lora_name: str) -> Optional[LoRARequest]: + async def resolve_lora( + self, base_model_name: str, lora_name: str + ) -> Optional[LoRARequest]: """Abstract method to resolve and fetch a LoRA model adapter. Implements logic to locate and download LoRA adapter based on the name. Implementations might fetch from a blob storage or other sources. Args: + base_model_name: The name or identifier of the base model to resolve. lora_name: The name or identifier of the LoRA model to resolve. 
Returns: From 91ab230c0961339b1988261a8762efa8c47c4def Mon Sep 17 00:00:00 2001 From: Angky William Date: Thu, 10 Apr 2025 12:55:37 -0700 Subject: [PATCH 04/17] Fix precommit check Signed-off-by: Angky William --- vllm/entrypoints/openai/serving_engine.py | 3 ++- vllm/entrypoints/openai/serving_models.py | 13 ++++++++----- vllm/lora/resolver.py | 9 ++++----- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index e9dd012e2e11..117c5b692874 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -134,7 +134,8 @@ async def _check_model( lora.lora_name for lora in self.models.lora_requests ]: return None - if request.model and (load_result := await self.models.resolve_lora(request.model)): + if request.model and (load_result := await self.models.resolve_lora( + request.model)): if isinstance(load_result, LoRARequest): return None if isinstance(load_result, ErrorResponse) and \ diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 5a4b12d4388e..52d19fb72c67 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -243,7 +243,8 @@ async def _check_unload_lora_adapter_request( return None - async def resolve_lora(self, lora_name: str) -> Union[LoRARequest, ErrorResponse]: + async def resolve_lora( + self, lora_name: str) -> Union[LoRARequest, ErrorResponse]: """Attempt to resolve a LoRA adapter using available resolvers. Args: @@ -252,7 +253,7 @@ async def resolve_lora(self, lora_name: str) -> Union[LoRARequest, ErrorResponse Returns: LoRARequest if found and loaded successfully. ErrorResponse (404) if no resolver finds the adapter. - ErrorResponse (400) if adapter(s) are found but none load successfully. + ErrorResponse (400) if adapter(s) are found but none load. """ async with self.lora_resolver_lock[lora_name]: # First check if this LoRA is already loaded @@ -266,7 +267,8 @@ async def resolve_lora(self, lora_name: str) -> Union[LoRARequest, ErrorResponse # Try to resolve using available resolvers for resolver in self.lora_resolvers: - lora_request = await resolver.resolve_lora(base_model_name, lora_name) + lora_request = await resolver.resolve_lora( + base_model_name, lora_name) if lora_request is not None: found_adapter = True @@ -287,9 +289,10 @@ async def resolve_lora(self, lora_name: str) -> Union[LoRARequest, ErrorResponse continue if found_adapter: - # An adapter was found by at least one resolver, but all attempts to load it failed. + # An adapter was found, but all attempts to load it failed. return create_error_response( - message=f"LoRA adapter '{lora_name}' was found but could not be loaded.", + message=(f"LoRA adapter '{lora_name}' was found " + "but could not be loaded."), err_type="BadRequestError", status_code=HTTPStatus.BAD_REQUEST) else: diff --git a/vllm/lora/resolver.py b/vllm/lora/resolver.py index 478d504276f7..6726ca9a903f 100644 --- a/vllm/lora/resolver.py +++ b/vllm/lora/resolver.py @@ -20,17 +20,16 @@ class LoRAResolver(ABC): """ @abstractmethod - async def resolve_lora( - self, base_model_name: str, lora_name: str - ) -> Optional[LoRARequest]: + async def resolve_lora(self, base_model_name: str, + lora_name: str) -> Optional[LoRARequest]: """Abstract method to resolve and fetch a LoRA model adapter. Implements logic to locate and download LoRA adapter based on the name. 
Implementations might fetch from a blob storage or other sources. Args: - base_model_name: The name or identifier of the base model to resolve. - lora_name: The name or identifier of the LoRA model to resolve. + base_model_name: The name/identifier of the base model to resolve. + lora_name: The name/identifier of the LoRA model to resolve. Returns: Optional[LoRARequest]: The resolved LoRA model information, or None From e4886bcd855edb5711b1e61c04d0a3cffeed6280 Mon Sep 17 00:00:00 2001 From: Angky William Date: Thu, 10 Apr 2025 13:04:48 -0700 Subject: [PATCH 05/17] update lora resolver test Signed-off-by: Angky William --- tests/lora/test_lora_resolver.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/tests/lora/test_lora_resolver.py b/tests/lora/test_lora_resolver.py index 47adba25df53..8ebc2ae98fc4 100644 --- a/tests/lora/test_lora_resolver.py +++ b/tests/lora/test_lora_resolver.py @@ -11,11 +11,13 @@ class DummyLoRAResolver(LoRAResolver): """A dummy LoRA resolver for testing.""" - async def resolve_lora(self, lora_name: str) -> Optional[LoRARequest]: + async def resolve_lora(self, base_model_name: str, + lora_name: str) -> Optional[LoRARequest]: if lora_name == "test_lora": - return LoRARequest(lora_name=lora_name, - lora_path="/dummy/path", - lora_int_id=abs(hash(lora_name))) + return LoRARequest( + lora_name=lora_name, + lora_path=f"/dummy/path/{base_model_name}/{lora_name}", + lora_int_id=abs(hash(lora_name))) return None @@ -57,13 +59,16 @@ def test_resolver_registry_unknown_resolver(): async def test_dummy_resolver_resolve(): """Test the dummy resolver's resolve functionality.""" dummy_resolver = DummyLoRAResolver() + base_model_name = "base_model_test" + lora_name = "test_lora" # Test successful resolution - result = await dummy_resolver.resolve_lora("test_lora") + result = await dummy_resolver.resolve_lora(base_model_name, lora_name) assert isinstance(result, LoRARequest) - assert result.lora_name == "test_lora" - assert result.lora_path == "/dummy/path" + assert result.lora_name == lora_name + assert result.lora_path == f"/dummy/path/{base_model_name}/{lora_name}" # Test failed resolution - result = await dummy_resolver.resolve_lora("nonexistent_lora") + result = await dummy_resolver.resolve_lora(base_model_name, + "nonexistent_lora") assert result is None From a152aa9e700f1d1cbcc09b982d22df60d85c5162 Mon Sep 17 00:00:00 2001 From: Angky William Date: Thu, 10 Apr 2025 15:23:30 -0700 Subject: [PATCH 06/17] comment out engine_client.add_lora Signed-off-by: Angky William --- vllm/entrypoints/openai/serving_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 52d19fb72c67..d4373b2a4346 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -275,7 +275,7 @@ async def resolve_lora( lora_request.lora_int_id = unique_id try: - await self.engine_client.add_lora(lora_request) + # await self.engine_client.add_lora(lora_request) self.lora_requests.append(lora_request) logger.info( "Resolved and loaded LoRA adapter '%s' using %s", From 9d1c422e30f608f724edb6ff9af237d8f1d680b1 Mon Sep 17 00:00:00 2001 From: Angky William Date: Thu, 10 Apr 2025 16:11:33 -0700 Subject: [PATCH 07/17] use atomic counter as lora_int_id Signed-off-by: Angky William --- vllm/entrypoints/openai/serving_models.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index d4373b2a4346..87fdf22d4a9e 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -19,6 +19,7 @@ from vllm.lora.request import LoRARequest from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.utils import AtomicCounter logger = init_logger(__name__) @@ -69,6 +70,7 @@ def __init__( self.static_lora_modules = lora_modules self.lora_requests: list[LoRARequest] = [] + self.lora_id_counter = AtomicCounter(0) self.lora_resolvers: list[LoRAResolver] = [] for lora_resolver_name in LoRAResolverRegistry.get_supported_resolvers( ): @@ -157,7 +159,7 @@ async def load_lora_adapter( return error_check_ret lora_name, lora_path = request.lora_name, request.lora_path - unique_id = abs(hash(lora_name)) + unique_id = self.lora_id_counter.inc(1) lora_request = LoRARequest(lora_name=lora_name, lora_int_id=unique_id, lora_path=lora_path) @@ -262,7 +264,7 @@ async def resolve_lora( return existing base_model_name = self.model_config.model - unique_id = abs(hash(lora_name)) + unique_id = self.lora_id_counter.inc(1) found_adapter = False # Try to resolve using available resolvers From e0a2cefee873924165b30208eae6bfb7a3fe3cd0 Mon Sep 17 00:00:00 2001 From: Angky William Date: Thu, 10 Apr 2025 16:58:58 -0700 Subject: [PATCH 08/17] Re-add engine_client.add_lora Signed-off-by: Angky William --- vllm/entrypoints/openai/serving_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 87fdf22d4a9e..74433a1a3c3f 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -277,7 +277,7 @@ async def resolve_lora( lora_request.lora_int_id = unique_id try: - # await self.engine_client.add_lora(lora_request) + await self.engine_client.add_lora(lora_request) self.lora_requests.append(lora_request) logger.info( "Resolved and loaded LoRA adapter '%s' using %s", lora_name, resolver.__class__.__name__) return lora_request From 9ca98c156283df27bcc6b5bbea1aa5b4a61b7cda Mon Sep 17 00:00:00 2001 From: Angky William Date: Fri, 11 Apr 2025 17:58:20 -0700 Subject: [PATCH 09/17] Test for LoRA Resolver Signed-off-by: Angky William --- .../entrypoints/openai/test_lora_resolvers.py | 184 ++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 tests/entrypoints/openai/test_lora_resolvers.py diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py new file mode 100644 index 000000000000..75e443b42ab2 --- /dev/null +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -0,0 +1,184 @@ +# SPDX-License-Identifier: Apache-2.0 + +from contextlib import suppress +from dataclasses import dataclass, field +from http import HTTPStatus +from typing import Optional +from unittest.mock import MagicMock + +import pytest + +from vllm.config import MultiModalConfig +from vllm.engine.multiprocessing.client import MQLLMEngineClient +from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse +from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion +from vllm.entrypoints.openai.serving_models import (BaseModelPath, + OpenAIServingModels) +from vllm.lora.request import LoRARequest +from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry +from vllm.transformers_utils.tokenizer import get_tokenizer + +MODEL_NAME = 
"openai-community/gpt2" +BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)] + +MOCK_RESOLVER_NAME = "mock_test_resolver" + + +@dataclass +class MockHFConfig: + model_type: str = "any" + + +@dataclass +class MockModelConfig: + """Minimal mock ModelConfig for testing.""" + model: str = MODEL_NAME + tokenizer: str = MODEL_NAME + trust_remote_code: bool = False + tokenizer_mode: str = "auto" + max_model_len: int = 100 + tokenizer_revision: Optional[str] = None + multimodal_config: MultiModalConfig = field( + default_factory=MultiModalConfig) + hf_config: MockHFConfig = field(default_factory=MockHFConfig) + logits_processor_pattern: Optional[str] = None + diff_sampling_param: Optional[dict] = None + allowed_local_media_path: str = "" + encoder_config = None + generation_config: str = "auto" + + def get_diff_sampling_param(self): + return self.diff_sampling_param or {} + + +class MockLoRAResolver(LoRAResolver): + + async def resolve_lora(self, base_model_name: str, + lora_name: str) -> Optional[LoRARequest]: + if lora_name == "test-lora": + return LoRARequest(lora_name="test-lora", + lora_int_id=1, + lora_local_path="/fake/path/test-lora") + elif lora_name == "invalid-lora": + return LoRARequest(lora_name="invalid-lora", + lora_int_id=2, + lora_local_path="/fake/path/invalid-lora") + return None + + +@pytest.fixture(autouse=True) +def register_mock_resolver(): + """Fixture to register and unregister the mock LoRA resolver.""" + resolver = MockLoRAResolver() + LoRAResolverRegistry.register_resolver(MOCK_RESOLVER_NAME, resolver) + yield + # Cleanup: remove the resolver after the test runs + if MOCK_RESOLVER_NAME in LoRAResolverRegistry.resolvers: + del LoRAResolverRegistry.resolvers[MOCK_RESOLVER_NAME] + + +@pytest.fixture +def mock_serving_setup(): + """Provides a mocked engine and serving completion instance.""" + mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + + def mock_add_lora_side_effect(lora_request: LoRARequest): + """Simulate engine behavior when adding LoRAs.""" + if lora_request.lora_name == "test-lora": + # Simulate successful addition + return + elif lora_request.lora_name == "invalid-lora": + # Simulate failure during addition (e.g. 
invalid format) + raise ValueError(f"Simulated failure adding LoRA: " + f"{lora_request.lora_name}") + + mock_engine.add_lora.side_effect = mock_add_lora_side_effect + mock_engine.generate.reset_mock() + mock_engine.add_lora.reset_mock() + + mock_model_config = MockModelConfig() + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config) + + serving_completion = OpenAIServingCompletion(mock_engine, + mock_model_config, + models, + request_logger=None) + + return mock_engine, serving_completion + + +@pytest.mark.asyncio +async def test_serving_completion_with_lora_resolver(mock_serving_setup): + """Test completion with a mock LoRA resolver (happy path)""" + mock_engine, serving_completion = mock_serving_setup + + lora_model_name = "test-lora" + req_found = CompletionRequest( + model=lora_model_name, + prompt="Generate with LoRA", + ) + + # Suppress potential errors during the mocked generate call, + # as we are primarily checking for add_lora and generate calls + with suppress(Exception): + await serving_completion.create_completion(req_found) + + mock_engine.add_lora.assert_called_once() + called_lora_request = mock_engine.add_lora.call_args[0][0] + assert isinstance(called_lora_request, LoRARequest) + assert called_lora_request.lora_name == lora_model_name + + mock_engine.generate.assert_called_once() + + +@pytest.mark.asyncio +async def test_serving_completion_resolver_not_found(mock_serving_setup): + """Test requesting a LoRA that's not found by the resolver""" + mock_engine, serving_completion = mock_serving_setup + + non_existent_model = "non-existent-lora-adapter" + req = CompletionRequest( + model=non_existent_model, + prompt="what is 1+1?", + ) + + response = await serving_completion.create_completion(req) + + mock_engine.add_lora.assert_not_called() + mock_engine.generate.assert_not_called() + + assert isinstance(response, ErrorResponse) + assert response.code == HTTPStatus.NOT_FOUND.value + assert non_existent_model in response.message + + +@pytest.mark.asyncio +async def test_serving_completion_resolver_add_lora_fails(mock_serving_setup): + """Test requesting a LoRA that fails during engine.add_lora""" + mock_engine, serving_completion = mock_serving_setup + + invalid_model = "invalid-lora" + req = CompletionRequest( + model=invalid_model, + prompt="what is 1+1?", + ) + + response = await serving_completion.create_completion(req) + + # Assert add_lora was called before the failure + mock_engine.add_lora.assert_called_once() + called_lora_request = mock_engine.add_lora.call_args[0][0] + assert isinstance(called_lora_request, LoRARequest) + assert called_lora_request.lora_name == invalid_model + + # Assert generate was *not* called due to the failure + mock_engine.generate.assert_not_called() + + # Assert the correct error response + assert isinstance(response, ErrorResponse) + assert response.code == HTTPStatus.BAD_REQUEST.value + assert invalid_model in response.message From 02c33dabba23d9cf0088b74394e0e13a32d5c262 Mon Sep 17 00:00:00 2001 From: Angky William Date: Fri, 11 Apr 2025 18:02:21 -0700 Subject: [PATCH 10/17] Rename lora/test_lora_resolver to lora/test_resolver Signed-off-by: Angky William --- tests/lora/{test_lora_resolver.py => test_resolver.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/lora/{test_lora_resolver.py => test_resolver.py} (100%) diff --git a/tests/lora/test_lora_resolver.py b/tests/lora/test_resolver.py similarity index 100% rename from 
tests/lora/test_lora_resolver.py rename to tests/lora/test_resolver.py From 0c5c109f4d9b5b3d4ec0727c0beb3e6b12764f11 Mon Sep 17 00:00:00 2001 From: Angky William Date: Mon, 14 Apr 2025 11:20:20 -0700 Subject: [PATCH 11/17] Add VLLM_ALLOW_RUNTIME_LORA_UPDATING flag check Signed-off-by: Angky William --- .../entrypoints/openai/test_lora_resolvers.py | 34 +++++++++++++++---- vllm/entrypoints/openai/serving_engine.py | 5 +-- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index 75e443b42ab2..b061fa33da72 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -112,8 +112,10 @@ def mock_add_lora_side_effect(lora_request: LoRARequest): @pytest.mark.asyncio -async def test_serving_completion_with_lora_resolver(mock_serving_setup): - """Test completion with a mock LoRA resolver (happy path)""" +async def test_serving_completion_with_lora_resolver(mock_serving_setup, + monkeypatch): + monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true") + mock_engine, serving_completion = mock_serving_setup lora_model_name = "test-lora" @@ -136,8 +138,10 @@ async def test_serving_completion_with_lora_resolver(mock_serving_setup): @pytest.mark.asyncio -async def test_serving_completion_resolver_not_found(mock_serving_setup): - """Test requesting a LoRA that's not found by the resolver""" +async def test_serving_completion_resolver_not_found(mock_serving_setup, + monkeypatch): + monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true") + mock_engine, serving_completion = mock_serving_setup non_existent_model = "non-existent-lora-adapter" @@ -157,8 +161,10 @@ async def test_serving_completion_resolver_not_found(mock_serving_setup): @pytest.mark.asyncio -async def test_serving_completion_resolver_add_lora_fails(mock_serving_setup): - """Test requesting a LoRA that fails during engine.add_lora""" +async def test_serving_completion_resolver_add_lora_fails( + mock_serving_setup, monkeypatch): + monkeypatch.setenv("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "true") + mock_engine, serving_completion = mock_serving_setup invalid_model = "invalid-lora" @@ -182,3 +188,19 @@ async def test_serving_completion_resolver_add_lora_fails(mock_serving_setup): assert isinstance(response, ErrorResponse) assert response.code == HTTPStatus.BAD_REQUEST.value assert invalid_model in response.message + + +@pytest.mark.asyncio +async def test_serving_completion_flag_not_set(mock_serving_setup): + mock_engine, serving_completion = mock_serving_setup + + lora_model_name = "test-lora" + req_found = CompletionRequest( + model=lora_model_name, + prompt="Generate with LoRA", + ) + + await serving_completion.create_completion(req_found) + + mock_engine.add_lora.assert_not_called() + mock_engine.generate.assert_not_called() diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 117c5b692874..49b346a23baf 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -10,6 +10,7 @@ from pydantic import Field from starlette.datastructures import Headers +import vllm.envs as envs from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient # yapf conflicts with isort for this block @@ -134,8 +135,8 @@ async def _check_model( lora.lora_name for lora in self.models.lora_requests ]: return None - if request.model and (load_result := await self.models.resolve_lora( - 
request.model)): + if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING and request.model and ( + load_result := await self.models.resolve_lora(request.model)): if isinstance(load_result, LoRARequest): return None if isinstance(load_result, ErrorResponse) and \ From c5c172d2b140879e96118e4715bb20b774a5a3b9 Mon Sep 17 00:00:00 2001 From: Angky William Date: Mon, 14 Apr 2025 12:09:46 -0700 Subject: [PATCH 12/17] Add documentation for LoRAResolver plugin Signed-off-by: Angky William --- docs/source/features/lora.md | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md index a71da72e4360..04b2c6562556 100644 --- a/docs/source/features/lora.md +++ b/docs/source/features/lora.md @@ -106,19 +106,19 @@ curl http://localhost:8000/v1/completions \ ## Dynamically serving LoRA Adapters -In addition to serving LoRA adapters at server startup, the vLLM server now supports dynamically loading and unloading -LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility +In addition to serving LoRA adapters at server startup, the vLLM server supports dynamically configuring LoRA adapters at runtime through dedicated API endpoints and plugins. This feature can be particularly useful when the flexibility to change models on-the-fly is needed. Note: Enabling this feature in production environments is risky as users may participate in model adapter management. -To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING` -is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active. +To enable dynamic LoRA configuration, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING` +is set to `True`. ```bash export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True ``` +### Using API Endpoints Loading a LoRA Adapter: To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary @@ -153,6 +153,19 @@ curl -X POST http://localhost:8000/v1/unload_lora_adapter \ }' ``` +### Using Plugins +Alternatively, you can use the LoRAResolver plugin to dynamically load LoRA adapters. LoRAResolver plugins enable you to load LoRA adapters from both local and remote sources such as local file system and S3. On every request, when there's a new model name that hasn't been loaded yet, the LoRAResolver will try to resolve and load the corresponding LoRA adapter. + +You can set up multiple LoRAResolver plugins if you want to load LoRA adapters from different sources. For example, you might have one resolver for local files and another for S3 storage. vLLM will load the first LoRA adapter that it finds. + +You can either install existing plugins or implement your own. + +Steps to implement your own LoRAResolver plugin: +1. Implement the LoRAResolver interface. +2. Register the plugin with vLLM. + +For more information on vLLM's plugin system and how to create custom plugins, please refer to the [Plugins documentation](../design/plugin_system.md). + ## New format for `--lora-modules` In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. 
For example: From 19af34caedf6b6edb034922bb0ff6d68fac423f4 Mon Sep 17 00:00:00 2001 From: Angky William Date: Mon, 14 Apr 2025 12:12:34 -0700 Subject: [PATCH 13/17] Fix doc new line Signed-off-by: Angky William --- docs/source/features/lora.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md index 04b2c6562556..cf0155bf1749 100644 --- a/docs/source/features/lora.md +++ b/docs/source/features/lora.md @@ -106,8 +106,7 @@ curl http://localhost:8000/v1/completions \ ## Dynamically serving LoRA Adapters -In addition to serving LoRA adapters at server startup, the vLLM server supports dynamically configuring LoRA adapters at runtime through dedicated API endpoints and plugins. This feature can be particularly useful when the flexibility -to change models on-the-fly is needed. +In addition to serving LoRA adapters at server startup, the vLLM server supports dynamically configuring LoRA adapters at runtime through dedicated API endpoints and plugins. This feature can be particularly useful when the flexibility to change models on-the-fly is needed. Note: Enabling this feature in production environments is risky as users may participate in model adapter management. From c72da0e8205bb9044f67bcba91c522e04a9cb3dd Mon Sep 17 00:00:00 2001 From: Angky William Date: Mon, 14 Apr 2025 13:49:54 -0700 Subject: [PATCH 14/17] Add engine.generate lora_request assertion for successful completion test Signed-off-by: Angky William --- tests/entrypoints/openai/test_lora_resolvers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index b061fa33da72..3ffd138b789e 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -135,6 +135,9 @@ async def test_serving_completion_with_lora_resolver(mock_serving_setup, assert called_lora_request.lora_name == lora_model_name mock_engine.generate.assert_called_once() + generate_lora_request = mock_engine.generate.call_args[1]['lora_request'] + assert isinstance(generate_lora_request, LoRARequest) + assert generate_lora_request.lora_name == lora_model_name @pytest.mark.asyncio From b577d2b983ce4be9ef80f17f604222197349397e Mon Sep 17 00:00:00 2001 From: Angky William Date: Mon, 14 Apr 2025 13:53:11 -0700 Subject: [PATCH 15/17] Rename generate_lora_request to called_lora_request Signed-off-by: Angky William --- tests/entrypoints/openai/test_lora_resolvers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index 3ffd138b789e..c96151349eb3 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -135,9 +135,9 @@ async def test_serving_completion_with_lora_resolver(mock_serving_setup, assert called_lora_request.lora_name == lora_model_name mock_engine.generate.assert_called_once() - generate_lora_request = mock_engine.generate.call_args[1]['lora_request'] - assert isinstance(generate_lora_request, LoRARequest) - assert generate_lora_request.lora_name == lora_model_name + called_lora_request = mock_engine.generate.call_args[1]['lora_request'] + assert isinstance(called_lora_request, LoRARequest) + assert called_lora_request.lora_name == lora_model_name @pytest.mark.asyncio From 82f8e409ccbdf08fa82d5cc06876818c24c3dddd Mon Sep 17 00:00:00 2001 From: Angky William Date: 
Tue, 15 Apr 2025 10:53:01 -0700 Subject: [PATCH 16/17] Add example of LoRAResolver Signed-off-by: Angky William --- docs/source/features/lora.md | 43 ++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md index cf0155bf1749..0b66fb42e632 100644 --- a/docs/source/features/lora.md +++ b/docs/source/features/lora.md @@ -161,9 +161,48 @@ You can either install existing plugins or implement your own. Steps to implement your own LoRAResolver plugin: 1. Implement the LoRAResolver interface. -2. Register the plugin with vLLM. -For more information on vLLM's plugin system and how to create custom plugins, please refer to the [Plugins documentation](../design/plugin_system.md). + Example of a simple S3 LoRAResolver implementation: + + ```python + import os + import s3fs + from vllm.lora.request import LoRARequest + from vllm.lora.resolver import LoRAResolver + + class S3LoRAResolver(LoRAResolver): + def __init__(self): + self.s3 = s3fs.S3FileSystem() + self.s3_path_format = os.getenv("S3_PATH_TEMPLATE") + self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE") + + async def resolve_lora(self, base_model_name, lora_name): + s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name) + local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name) + + # Download the LoRA from S3 to the local path + await self.s3._get( + s3_path, local_path, recursive=True, maxdepth=1 + ) + + lora_request = LoRARequest( + lora_name=lora_name, + lora_path=local_path, + lora_int_id=abs(hash(lora_name)) + ) + return lora_request + ``` + +2. Register LoRAResolver plugin with vLLM. + + ```python + from vllm.lora.resolver import LoRAResolverRegistry + + s3_resolver = S3LoRAResolver() + LoRAResolverRegistry.register_resolver("s3_resolver", s3_resolver) + ``` + + For more details, refer to the [vLLM's Plugins System](../design/plugin_system.md). ## New format for `--lora-modules` From 3b47f945fe70162b4d044794f3d9e85d13dec061 Mon Sep 17 00:00:00 2001 From: Angky William Date: Tue, 15 Apr 2025 10:56:11 -0700 Subject: [PATCH 17/17] Minor doc update, remove redundant word Signed-off-by: Angky William --- docs/source/features/lora.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md index 0b66fb42e632..b5b51095b3a7 100644 --- a/docs/source/features/lora.md +++ b/docs/source/features/lora.md @@ -193,7 +193,7 @@ Steps to implement your own LoRAResolver plugin: return lora_request ``` -2. Register LoRAResolver plugin with vLLM. +2. Register LoRAResolver plugin. ```python from vllm.lora.resolver import LoRAResolverRegistry
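
Taken together, the series leaves the resolver contract deliberately small: `resolve_lora(base_model_name, lora_name)` returns a `LoRARequest` when the adapter can be located, returns `None` when it cannot, and any exception raised while the engine loads the returned request is treated as a load failure. A minimal local-filesystem resolver written against that contract; the `LORA_CACHE_DIR` directory layout is an illustrative assumption, not something the patches prescribe:

```python
import os
from typing import Optional

from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry


class LocalDirLoRAResolver(LoRAResolver):
    """Illustrative resolver mapping adapter names to local directories."""

    def __init__(self, base_dir: str):
        self.base_dir = base_dir

    async def resolve_lora(self, base_model_name: str,
                           lora_name: str) -> Optional[LoRARequest]:
        lora_path = os.path.join(self.base_dir, lora_name)
        # Returning None means "not found here": OpenAIServingModels falls
        # through to the next registered resolver, and to a 404 if none match.
        if not os.path.isdir(lora_path):
            return None
        return LoRARequest(lora_name=lora_name,
                           lora_int_id=abs(hash(lora_name)),
                           lora_path=lora_path)


LoRAResolverRegistry.register_resolver(
    "local_dir",
    LocalDirLoRAResolver(os.getenv("LORA_CACHE_DIR", "/tmp/lora_adapters")))
```

The `None`-versus-exception split is what drives the error semantics pinned down in patches 03 and 09: a name that no resolver recognizes yields a 404 `NotFoundError`, while an adapter that is found but fails in `engine_client.add_lora` yields a 400 `BadRequestError`. Note also that `resolve_lora` in `serving_models.py` overwrites `lora_int_id` with the atomic counter from patch 07, so the id chosen by the resolver is only a placeholder.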
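
The documentation added in patches 12 and 16 registers the resolver with a direct `LoRAResolverRegistry.register_resolver(...)` call and defers the question of where that call should live to the plugin-system docs. One packaging sketch, assuming vLLM's general-plugin entry-point group (`vllm.general_plugins`) is the discovery mechanism; the package and module names are placeholders:

```python
# my_lora_resolvers.py -- placeholder plugin module. Exposed to vLLM via a
# setuptools entry point in the package's setup.py, assuming the general
# plugin group described in the plugin-system docs:
#
#     entry_points={
#         "vllm.general_plugins":
#         ["lora_resolvers = my_lora_resolvers:register_resolvers"],
#     }
#
import os

from vllm.lora.resolver import LoRAResolverRegistry


def register_resolvers():
    # Imported lazily so the entry point stays cheap to load; vLLM calls
    # this function once per process at startup, before serving requests.
    from my_lora_resolvers_impl import LocalDirLoRAResolver  # placeholder
    LoRAResolverRegistry.register_resolver(
        "local_dir",
        LocalDirLoRAResolver(os.getenv("LORA_CACHE_DIR",
                                       "/tmp/lora_adapters")))
```

Shipping the registration as an installable plugin, rather than editing vLLM itself, is what keeps the registry pattern in `vllm/lora/resolver.py` useful: the server process imports the resolver without any change to vLLM's own startup code.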
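
On the request path, patch 11 gates resolution behind `VLLM_ALLOW_RUNTIME_LORA_UPDATING`, and `_check_model` only consults resolvers for a model name that is neither a base model nor an already-loaded adapter. From the client's point of view the mechanism is invisible: a completion request simply names the adapter. A sketch of that flow, assuming a server started with the flag set and a resolver plugin installed; `my-adapter` is a placeholder name:

```python
from openai import OpenAI

# vLLM's OpenAI-compatible server, started with
# VLLM_ALLOW_RUNTIME_LORA_UPDATING=true and a resolver plugin installed.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# First request for "my-adapter": _check_model misses the static model list,
# resolve_lora locates and loads the adapter, then the request proceeds.
# Subsequent requests find it in models.lora_requests and skip resolution.
completion = client.completions.create(
    model="my-adapter",
    prompt="What is 1+1?",
    max_tokens=8,
)
print(completion.choices[0].text)
```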