@@ -0,0 +1,77 @@
# Copyright (c) IBM Corporation
# SPDX-License-Identifier: MIT

import asyncio
import logging

import ray

from orchestrator.modules.operators.console_output import RichConsoleSpinnerMessage

logger = logging.getLogger(__name__)


class DeploymentWaiter:
def __init__(self, k8s_name: str):
self.k8s_name = k8s_name
self.model_downloaded_event = asyncio.Event()


class DeploymentConflictManager:
def __init__(self):
self.deployments_to_wait_for: dict[str, DeploymentWaiter] = {}
self.model_already_downloaded: set[str] = set()

def maybe_add_deployment(self, model: str, k8s_name: str) -> bool:
if (
model in self.model_already_downloaded
or model in self.deployments_to_wait_for
):
return False

self.deployments_to_wait_for[model] = DeploymentWaiter(k8s_name=k8s_name)
return True

async def wait(self, request_id: str, k8s_name: str, model: str) -> None:
waiter = self.deployments_to_wait_for.get(model, None)
# making sure a deployment does not wait for itself to be READY
if waiter is not None and waiter.k8s_name != k8s_name:
console = ray.get_actor(name="RichConsoleQueue")
while True:
console.put.remote(
message=RichConsoleSpinnerMessage(
id=request_id,
label=f"({request_id}) Waiting on deployment ({waiter.k8s_name}) to download the model required for this experiment",
state="start",
)
)
await waiter.model_downloaded_event.wait()
# If after we got awaken the model is not among the downloaded models, it means that

Suggested change
# If after we got awaken the model is not among the downloaded models, it means that
# If after we got awoken the model is not among the downloaded models, it means that

# something has gone wrong, e.g. the deployment we were waiting for has failed.
# If I am the first to wake up, let me add myself as the deployment to be waited for and stop waiting.
if (
model not in self.model_already_downloaded
and not self.maybe_add_deployment(k8s_name=k8s_name, model=model)
):
# If I am not the first to wake up, I get the new waiter object and continue waiting
waiter = self.deployments_to_wait_for.get(model, None)
If you get None here, an exception will be raised at the next iteration - are there conditions under which one could get None?
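
If it can happen, a small guard before the continue would at least turn it into a clear error rather than an AttributeError at the top of the next loop iteration (untested sketch):

waiter = self.deployments_to_wait_for.get(model, None)
if waiter is None:
    # Hypothetical defensive check: nobody is registered as downloading this
    # model anymore, so fail loudly here instead of crashing on
    # waiter.model_downloaded_event.wait() at the top of the loop.
    raise RuntimeError(f"no waiter registered for model {model} after wake-up")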

continue

console.put.remote(
message=RichConsoleSpinnerMessage(
id=request_id,
label=f"({request_id}) Done waiting for conflicting K8s deployment",
state="stop",
)
)
break

def signal(self, k8s_name: str, model: str, error: bool = False) -> None:
if model in self.deployments_to_wait_for:
Inverting this if (returning None early) would reduce indentation. Also, what are the cases in which this if wouldn't be true (i.e., is it possible)?
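
Something like this, for example (untested sketch, same logic with the condition inverted):

def signal(self, k8s_name: str, model: str, error: bool = False) -> None:
    # Early return keeps the rest of the method at one indentation level.
    if model not in self.deployments_to_wait_for:
        return
    waiter = self.deployments_to_wait_for.pop(model)
    assert (
        waiter.k8s_name == k8s_name
    ), f"This environment deployment ({k8s_name}) shouldn't have been created because it is conflicting with deployment {waiter.k8s_name}"
    if not error:
        self.model_already_downloaded.add(model)
    waiter.model_downloaded_event.set()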

waiter = self.deployments_to_wait_for.pop(model)
assert (
waiter.k8s_name == k8s_name
), f"This environment deployment ({k8s_name}) shouldn't have been created because it is conflicting with deployment {waiter.k8s_name}"
if not error:
self.model_already_downloaded.add(model)
waiter.model_downloaded_event.set()
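
For orientation, a minimal sketch of how the three calls are intended to fit together (the import path is taken from the environment-manager file below; the model and deployment names are hypothetical):

from ado_actuators.vllm_performance.deployment_management import DeploymentConflictManager

conflicts = DeploymentConflictManager()

# The first deployment for a model registers itself as the one downloading it.
conflicts.maybe_add_deployment(model="some/model", k8s_name="vllm-deploy-a")

# Any later deployment for the same model blocks here until signal() is called
# (requires the Ray console actor to be running, so it is only indicative):
# await conflicts.wait(request_id="req-1", k8s_name="vllm-deploy-b", model="some/model")

# Once the first deployment is READY (or has failed), release the waiters.
conflicts.signal(k8s_name="vllm-deploy-a", model="some/model", error=False)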
@@ -2,12 +2,14 @@
# SPDX-License-Identifier: MIT

import asyncio
import copy
import logging
import time
from enum import Enum

import ray
from ado_actuators.vllm_performance.deployment_management import (
DeploymentConflictManager,
)
from ado_actuators.vllm_performance.k8s.manage_components import (
ComponentsManager,
)
@@ -25,7 +27,6 @@ class EnvironmentState(Enum):
"""

NONE = "None"
CREATING = "creating"
READY = "ready"


@@ -34,32 +35,28 @@ class Environment:
environment class
"""

def __init__(self, model: str):
def __init__(self, model: str, configuration: str):
"""
Defines an environment for a model
:param model: LLM model name
:param configuration: The full deployment configuration
"""
self.k8s_name = ComponentsYaml.get_k8s_name(model=model)
self.state = EnvironmentState.NONE
self.in_use = 0

def update_creating(self):
val = copy.deepcopy(self)
val.state = EnvironmentState.CREATING
val.in_use = 1
return val
self.configuration = configuration
self.model = model


class EnvironmentsQueue:
def __init__(self):
self.environments_queue = []

async def wait(self):
async def wait(self) -> None:
wait_event = asyncio.Event()
self.environments_queue.append(wait_event)
await wait_event.wait()

def signal_next(self):
def signal_next(self) -> None:
if len(self.environments_queue) > 0:
event = self.environments_queue.pop(0)
event.set()
@@ -89,8 +86,10 @@ def __init__(
:param pvc_name: name of the PVC to be created / used
:param pvc_template: template of the PVC to be created
"""
self.environments = {}
self.in_use_environments: dict[str, Environment] = {}
self.free_environments: list[Environment] = []
self.environments_queue = EnvironmentsQueue()
self.deployment_conflict_manager = DeploymentConflictManager()
self.namespace = namespace
self.max_concurrent = max_concurrent
self.in_cluster = in_cluster
@@ -106,16 +105,30 @@ def __init__(
pvc_template=pvc_template,
)

def _wipe_deployment(self, identifier: str) -> None:
"wipe" conveys to me more of a "factory reset" feeling rather than destroying/deleting

"""
Deletes a deployment. Intended to be used for cleanup or error recovery
:param identifier: the deployment identifier
"""
try:
self.manager.delete_service(k8s_name=identifier)
except ApiException as e:
if e.reason != "Not Found":
raise e
Comment on lines +115 to +117
So if you get an ApiException that isn't "Not Found", it's fine to try to delete the deployment?
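
For comparison, a variant that always attempts both deletions and only swallows "Not Found" (untested sketch using the same delete_service/delete_deployment calls as below; not what this PR currently does):

def _wipe_deployment(self, identifier: str) -> None:
    errors: list[ApiException] = []
    for delete in (self.manager.delete_service, self.manager.delete_deployment):
        try:
            delete(k8s_name=identifier)
        except ApiException as e:
            # "Not Found" just means the resource is already gone.
            if e.reason != "Not Found":
                errors.append(e)
    if errors:
        # Re-raise the first unexpected error only after both deletions were attempted.
        raise errors[0]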

try:
self.manager.delete_deployment(k8s_name=identifier)
except ApiException as e:
if e.reason != "Not Found":
raise e

def environment_usage(self) -> dict:

return {"max": self.max_concurrent, "in_use": len(self.environments)}
return {"max": self.max_concurrent, "in_use": self.active_environments}

async def wait_for_env(self):
await self.environments_queue.wait()

def get_environment(
self, model: str, definition: str, increment_usage: bool = False
) -> Environment | None:
def get_environment(self, model: str, definition: str) -> Environment | None:
"""
Get an environment for definition
:param model: LLM model name
@@ -125,68 +138,117 @@ def get_environment(
:param increment_usage: increment usage flag
:return: environment state
"""
print(
f"getting environment for model {model}, currently {self.active_environments} deployments"
)
Comment on lines +141 to +143
Should this be a print?
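
E.g., assuming the module-level logger already used elsewhere in this file (sketch):

logger.debug(
    f"getting environment for model {model}, currently {self.active_environments} deployments"
)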


env = self.environments.get(definition, None)
# check if there's an existing free environment satisfying the request
env = self.get_matching_free_environment(definition)
if env is None:
if len(self.environments) >= self.max_concurrent:
if self.active_environments >= self.max_concurrent:
# can't create more environments now, need clean up
available = False
for key, env in self.environments.items():
if env.in_use == 0:
available = True
start = time.time()
try:
self.manager.delete_service(k8s_name=env.k8s_name)
self.manager.delete_deployment(k8s_name=env.k8s_name)
except ApiException as e:
logger.error(f"Error deleting deployment or service {e}")
del self.environments[key]
logging.info(
f"deleted environment {env.k8s_name} in {time.time() - start} sec. "
f"Environments length {len(self.environments)}"
)
time.sleep(3)
break
if not available:
if len(self.free_environments) == 0:
# No room for creating a new environment
logger.debug(
f"There are already {self.max_concurrent} actively in use, and I can't create a new one"
)
return None
# mark new one
env = Environment(model=model)
self.environments[definition] = env.update_creating()
return env
if increment_usage:
env = self.environments.get(definition)
env.in_use += 1
self.environments[definition] = env
# There are unused environments, let's evict one

# Gets the oldest env in the list
environment_to_evict = self.free_environments[0]
try:
self.manager.delete_service(k8s_name=environment_to_evict.k8s_name)
self.manager.delete_deployment(
k8s_name=environment_to_evict.k8s_name
)
logger.info(
f"deleted environment {environment_to_evict.k8s_name}. "
f"Active environments {self.active_environments}"
)
except ApiException as e:
logger.error(f"Error deleting deployment or service {e}")
# If all the Kubernetes resources got deleted, let's remove this environment from our records
self.free_environments.pop(0)
time.sleep(3)

# We either made space or we had enough space already
env = Environment(model=model, configuration=definition)
logger.debug(f"New environment created for definition {definition}")

# If deployments target the same model and the model is not in the HF cache, they would all try to download it.
# This can lead to corruption of the HF cache data (shared PVC).
# To avoid this situation, we keep track of the models downloaded by the actuator during the current operation.
# If a deployment wants to download a model for the first time, we do not allow other deployments using the
# same model to start in parallel.
# Once the very first download of a model is done, we let any number of deployments using the same model start
# in parallel, as they only read the model from the cache.
self.deployment_conflict_manager.maybe_add_deployment(
k8s_name=env.k8s_name, model=model
)

self.in_use_environments[env.k8s_name] = env

return env

def get_experiment_pvc_name(self):
@property
def active_environments(self) -> int:
return len(self.in_use_environments) + len(self.free_environments)

def get_experiment_pvc_name(self) -> str:
return self.manager.pvc_name

def done_creating(self, definition: str) -> None:
def done_creating(self, identifier: str) -> None:
"""
Report creation
:param definition: environment definition
:param identifier: environment identifier
:return: None
"""
env = self.environments.get(definition, None)
if env is None:
return
env.state = EnvironmentState.READY
self.environments[definition] = env
self.in_use_environments[identifier].state = EnvironmentState.READY
model = self.in_use_environments[identifier].model

self.deployment_conflict_manager.signal(k8s_name=identifier, model=model)

def cleanup_failed_deployment(self, identifier: str) -> None:
env = self.in_use_environments[identifier]
self._wipe_deployment(identifier=identifier)
self.done_using(identifier=identifier, reclaim_on_completion=True)
self.deployment_conflict_manager.signal(
k8s_name=identifier, model=env.model, error=True
)

def get_matching_free_environment(self, configuration: str) -> Environment | None:
"""
Find a deployment matching a deployment configuration
:param configuration: The deployment configuration to match
:return: An already existing deployment or None
"""
for id, env in enumerate(self.free_environments):
if env.configuration == configuration:
del self.free_environments[id]
return env
return None

async def wait_deployment_before_starting(
self, env: Environment, request_id: str
) -> None:
await self.deployment_conflict_manager.wait(
request_id=request_id, k8s_name=env.k8s_name, model=env.model
)

def done_using(self, definition: str) -> None:
def done_using(self, identifier: str, reclaim_on_completion: bool = False) -> None:
"""
Report test completion
:param definition: environment definition
:param reclaim_on_completion: flag to indicate the environment is to be completely removed and not freed for later use
:return: None
"""
env = self.environments.get(definition)
if env is None:
return
env.in_use -= 1
self.environments[definition] = env
env = self.in_use_environments.pop(identifier)
if not reclaim_on_completion:
self.free_environments.append(env)

# If another measurement is waiting in queue we wake it up
# Wake up any other deployment waiting in the queue for a
# free environment.
self.environments_queue.signal_next()

def cleanup(self) -> None:
@@ -195,17 +257,10 @@ def cleanup(self) -> None:
:return: None
"""
logger.info("Cleaning environments")
for env in self.environments.values():
try:
self.manager.delete_service(k8s_name=env.k8s_name)
except ApiException as e:
if e.reason != "Not Found":
raise e
try:
self.manager.delete_deployment(k8s_name=env.k8s_name)
except ApiException as e:
if e.reason != "Not Found":
raise e
all_envs = list(self.in_use_environments.values()) + self.free_environments
for env in all_envs:
self._wipe_deployment(identifier=env.k8s_name)

# We only delete the PVC if it was created by this actuator
if self.manager.pvc_created:
logger.debug("Deleting PVC")