Skip to content

Commit e4aab01

Browse files
committed
Add hardware pinning to dragon run requests
1 parent 52abd32 commit e4aab01

15 files changed

+1825
-25
lines changed

doc/changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ Jump to:
1313

1414
Description
1515

16+
- Add hardware pinning capability when using dragon
1617
- Add EnvironmentConfigLoader for ML Worker Manager
1718
- Add Model schema with model metadata included
1819
- Removed device from schemas, MessageHandler and tests

doc/dragon.rst

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,51 @@ In the next sections, we detail how Dragon is integrated into SmartSim.
6565

6666
For more information on HPC launchers, visit the :ref:`Run Settings<run_settings_hpc_ex>` page.
6767

68+
Hardware Pinning
69+
================
70+
71+
Dragon also enables users to specify hardware constraints using ``DragonRunSettings``. For
72+
example, you may configure the run settings to require that nodes executing the
73+
``Model`` support the `"gpu"` feature.
74+
75+
.. code-block:: python
76+
77+
# Because "dragon" was specified as the launcher during Experiment initialization,
78+
# create_run_settings will return a DragonRunSettings object
79+
rs = exp.create_run_settings(exe="mpi_app",
80+
exe_args=["--option", "value"],
81+
env_vars={"MYVAR": "VALUE"})
82+
83+
# Specify that the nodes features must include a GPU
84+
rs.set_node_feature("gpu")
85+
86+
For more fine-grained control, CPU and GPU affinity can be specified using the
87+
``DragonRunSettings`` object. The following example demonstrates how to specify
88+
CPU affinity and GPU affinities simultaneously. Note that affinities are passed
89+
as a list of device indices.
90+
91+
.. code-block:: python
92+
93+
# Because "dragon" was specified as the launcher during Experiment initialization,
94+
# create_run_settings will return a DragonRunSettings object
95+
rs = exp.create_run_settings(exe="mpi_app",
96+
exe_args=["--option", "value"],
97+
env_vars={"MYVAR": "VALUE"})
98+
99+
# Request the first 8 CPUs for this job
100+
rs.set_cpu_affinity(list(range(8)))
101+
102+
# Request the first two GPUs on the node for this job
103+
rs.set_gpu_affinity([0, 1])
104+
105+
.. note::
106+
107+
SmartSim submits jobs in the order they are received. On a heterogeneous system, SmartSim
108+
will attempt to allocate non-GPU nodes first. However, a process may be allocated to a GPU
109+
node if only GPU nodes are available, regardless of the requested features.
110+
111+
To ensure a process is allocated to a specific node, configure a hostname constraint.
112+
68113
=================
69114
The Dragon Server
70115
=================

smartsim/_core/launcher/dragon/dragonBackend.py

Lines changed: 77 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -211,9 +211,12 @@ def group_infos(self) -> dict[str, ProcessGroupInfo]:
211211
def _initialize_hosts(self) -> None:
212212
with self._queue_lock:
213213
self._hosts: t.List[str] = sorted(
214-
dragon_machine.Node(node).hostname
215-
for node in dragon_machine.System().nodes
214+
node for node in dragon_machine.System().nodes
216215
)
216+
self._nodes = [dragon_machine.Node(node) for node in self._hosts]
217+
self._cpus = [node.num_cpus for node in self._nodes]
218+
self._gpus = [node.num_gpus for node in self._nodes]
219+
217220
"""List of hosts available in allocation"""
218221
self._free_hosts: t.Deque[str] = collections.deque(self._hosts)
219222
"""List of hosts on which steps can be launched"""
@@ -285,6 +288,34 @@ def current_time(self) -> float:
285288
"""Current time for DragonBackend object, in seconds since the Epoch"""
286289
return time.time()
287290

291+
def _can_honor_policy(
292+
self, request: DragonRunRequest
293+
) -> t.Tuple[bool, t.Optional[str]]:
294+
# ensure the policy can be honored
295+
if request.policy:
296+
if request.policy.device == "gpu":
297+
# make sure nodes w/GPUs exist
298+
if not any(self._gpus):
299+
return False, "Cannot satisfy request, no GPUs available"
300+
301+
if request.policy.cpu_affinity:
302+
# make sure some node has enough CPUs
303+
available = max(self._cpus)
304+
requested = max(request.policy.cpu_affinity)
305+
306+
if requested >= available:
307+
return False, "Cannot satisfy request, not enough CPUs available"
308+
309+
if request.policy.gpu_affinity:
310+
# make sure some node has enough GPUs
311+
available = max(self._gpus)
312+
requested = max(request.policy.gpu_affinity)
313+
314+
if requested >= available:
315+
return False, "Cannot satisfy request, not enough GPUs available"
316+
317+
return True, None
318+
288319
def _can_honor(self, request: DragonRunRequest) -> t.Tuple[bool, t.Optional[str]]:
289320
"""Check if request can be honored with resources available in the allocation.
290321
@@ -299,6 +330,11 @@ def _can_honor(self, request: DragonRunRequest) -> t.Tuple[bool, t.Optional[str]
299330
if self._shutdown_requested:
300331
message = "Cannot satisfy request, server is shutting down."
301332
return False, message
333+
334+
honorable, err = self._can_honor_policy(request)
335+
if not honorable:
336+
return False, err
337+
302338
return True, None
303339

304340
def _allocate_step(
@@ -391,6 +427,44 @@ def _stop_steps(self) -> None:
391427
self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED
392428
self._group_infos[step_id].return_codes = [-9]
393429

430+
@staticmethod
431+
def create_run_policy(
432+
request: DragonRunRequest, node_name: str
433+
) -> "dragon_policy.Policy":
434+
if isinstance(request, DragonRunRequest):
435+
run_request: DragonRunRequest = request
436+
437+
device = dragon_policy.Policy.Device.DEFAULT
438+
affinity = dragon_policy.Policy.Affinity.DEFAULT
439+
cpu_affinity: t.List[int] = []
440+
gpu_affinity: t.List[int] = []
441+
442+
if run_request.policy is not None:
443+
if run_request.policy.cpu_affinity:
444+
affinity = dragon_policy.Policy.Affinity.SPECIFIC
445+
cpu_affinity = run_request.policy.cpu_affinity
446+
device = dragon_policy.Policy.Device.CPU
447+
448+
if run_request.policy.gpu_affinity:
449+
affinity = dragon_policy.Policy.Affinity.SPECIFIC
450+
gpu_affinity = run_request.policy.gpu_affinity
451+
device = dragon_policy.Policy.Device.GPU
452+
453+
if affinity != dragon_policy.Policy.Affinity.DEFAULT:
454+
return dragon_policy.Policy(
455+
placement=dragon_policy.Policy.Placement.HOST_NAME,
456+
host_name=node_name,
457+
affinity=affinity,
458+
device=device,
459+
cpu_affinity=cpu_affinity,
460+
gpu_affinity=gpu_affinity,
461+
)
462+
463+
return dragon_policy.Policy(
464+
placement=dragon_policy.Policy.Placement.HOST_NAME,
465+
host_name=node_name,
466+
)
467+
394468
def _start_steps(self) -> None:
395469
self._heartbeat()
396470
with self._queue_lock:
@@ -412,10 +486,7 @@ def _start_steps(self) -> None:
412486

413487
policies = []
414488
for node_name in hosts:
415-
local_policy = dragon_policy.Policy(
416-
placement=dragon_policy.Policy.Placement.HOST_NAME,
417-
host_name=node_name,
418-
)
489+
local_policy = self.create_run_policy(request, node_name)
419490
policies.extend([local_policy] * request.tasks_per_node)
420491
tmp_proc = dragon_process.ProcessTemplate(
421492
target=request.exe,

smartsim/_core/launcher/dragon/dragonLauncher.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
import os
3030
import typing as t
3131

32+
from smartsim._core.schemas.dragonRequests import DragonRunPolicy
33+
3234
from ...._core.launcher.stepMapping import StepMap
3335
from ....error import LauncherError, SmartSimError
3436
from ....log import get_logger
@@ -168,6 +170,9 @@ def run(self, step: Step) -> t.Optional[str]:
168170
merged_env = self._connector.merge_persisted_env(os.environ.copy())
169171
nodes = int(run_args.get("nodes", None) or 1)
170172
tasks_per_node = int(run_args.get("tasks-per-node", None) or 1)
173+
174+
policy = DragonRunPolicy.from_run_args(run_args)
175+
171176
response = _assert_schema_type(
172177
self._connector.send_request(
173178
DragonRunRequest(
@@ -181,6 +186,7 @@ def run(self, step: Step) -> t.Optional[str]:
181186
current_env=merged_env,
182187
output_file=out,
183188
error_file=err,
189+
policy=policy,
184190
)
185191
),
186192
DragonRunResponse,

smartsim/_core/launcher/step/dragonStep.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,11 @@
3030
import sys
3131
import typing as t
3232

33-
from ...._core.schemas.dragonRequests import DragonRunRequest, request_registry
33+
from ...._core.schemas.dragonRequests import (
34+
DragonRunPolicy,
35+
DragonRunRequest,
36+
request_registry,
37+
)
3438
from ....error.errors import SSUnsupportedError
3539
from ....log import get_logger
3640
from ....settings import (
@@ -166,8 +170,11 @@ def _write_request_file(self) -> str:
166170
nodes = int(run_args.get("nodes", None) or 1)
167171
tasks_per_node = int(run_args.get("tasks-per-node", None) or 1)
168172

173+
policy = DragonRunPolicy.from_run_args(run_args)
174+
169175
cmd = step.get_launch_cmd()
170176
out, err = step.get_output_files()
177+
171178
request = DragonRunRequest(
172179
exe=cmd[0],
173180
exe_args=cmd[1:],
@@ -179,6 +186,7 @@ def _write_request_file(self) -> str:
179186
current_env=os.environ,
180187
output_file=out,
181188
error_file=err,
189+
policy=policy,
182190
)
183191
requests.append(request_registry.to_string(request))
184192
with open(request_file, "w", encoding="utf-8") as script_file:

smartsim/_core/launcher/step/step.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
from __future__ import annotations
2828

29+
import copy
2930
import functools
3031
import os.path as osp
3132
import pathlib
@@ -51,7 +52,7 @@ def __init__(self, name: str, cwd: str, step_settings: SettingsBase) -> None:
5152
self.entity_name = name
5253
self.cwd = cwd
5354
self.managed = False
54-
self.step_settings = step_settings
55+
self.step_settings = copy.deepcopy(step_settings)
5556
self.meta: t.Dict[str, str] = {}
5657

5758
@property

smartsim/_core/schemas/dragonRequests.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
import typing as t
2828

29-
from pydantic import BaseModel, Field, PositiveInt
29+
from pydantic import BaseModel, Field, NonNegativeInt, PositiveInt
3030

3131
import smartsim._core.schemas.utils as _utils
3232

@@ -39,6 +39,36 @@
3939
class DragonRequest(BaseModel): ...
4040

4141

42+
class DragonRunPolicy(BaseModel):
43+
"""Policy specifying hardware constraints when running a Dragon job"""
44+
45+
device: t.Literal["cpu", "gpu"] = Field(default="cpu")
46+
cpu_affinity: t.List[NonNegativeInt] = Field(default_factory=list)
47+
gpu_affinity: t.List[NonNegativeInt] = Field(default_factory=list)
48+
49+
@staticmethod
50+
def from_run_args(
51+
run_args: t.Dict[str, t.Union[int, str, float, None]]
52+
) -> "DragonRunPolicy":
53+
features: str = str(run_args.get("node-feature", ""))
54+
55+
device = "gpu" if "gpu" in features else "cpu"
56+
57+
gpu_args = str(run_args.get("gpu-affinity", ""))
58+
cpu_args = str(run_args.get("cpu-affinity", ""))
59+
gpu_affinity = [x for x in gpu_args.split(",") if x]
60+
cpu_affinity = [x for x in cpu_args.split(",") if x]
61+
62+
if device == "cpu" and not (cpu_affinity or gpu_affinity):
63+
return DragonRunPolicy()
64+
65+
return DragonRunPolicy(
66+
device=device,
67+
cpu_affinity=cpu_affinity,
68+
gpu_affinity=gpu_affinity,
69+
)
70+
71+
4272
class DragonRunRequestView(DragonRequest):
4373
exe: t.Annotated[str, Field(min_length=1)]
4474
exe_args: t.List[t.Annotated[str, Field(min_length=1)]] = []
@@ -57,6 +87,7 @@ class DragonRunRequestView(DragonRequest):
5787
@request_registry.register("run")
5888
class DragonRunRequest(DragonRunRequestView):
5989
current_env: t.Dict[str, t.Optional[str]] = {}
90+
policy: t.Optional[DragonRunPolicy] = None
6091

6192
def __str__(self) -> str:
6293
return str(DragonRunRequestView.parse_obj(self.dict(exclude={"current_env"})))

smartsim/settings/dragonRunSettings.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828

2929
import typing as t
3030

31+
from typing_extensions import override
32+
3133
from ..log import get_logger
3234
from .base import RunSettings
3335

@@ -63,16 +65,45 @@ def __init__(
6365
**kwargs,
6466
)
6567

68+
@override
6669
def set_nodes(self, nodes: int) -> None:
6770
"""Set the number of nodes
6871
6972
:param nodes: number of nodes to run with
7073
"""
7174
self.run_args["nodes"] = nodes
7275

76+
@override
7377
def set_tasks_per_node(self, tasks_per_node: int) -> None:
7478
"""Set the number of tasks for this job
7579
7680
:param tasks_per_node: number of tasks per node
7781
"""
7882
self.run_args["tasks-per-node"] = tasks_per_node
83+
84+
@override
85+
def set_node_feature(self, feature_list: t.Union[str, t.List[str]]) -> None:
86+
"""Add a node feature requirement
87+
88+
:param feature_list: a feature (or list of features) that nodes running this job must support
89+
"""
90+
if isinstance(feature_list, str):
91+
feature_list = feature_list.strip().split()
92+
elif not all(isinstance(feature, str) for feature in feature_list):
93+
raise TypeError("feature_list must be string or list of strings")
94+
95+
self.run_args["node-feature"] = ",".join(feature_list)
96+
97+
def set_cpu_affinity(self, devices: t.List[int]) -> None:
98+
"""Set the CPU affinity for this job
99+
100+
:param devices: list of CPU indices to execute on
101+
"""
102+
self.run_args["cpu-affinity"] = ",".join(str(device) for device in devices)
103+
104+
def set_gpu_affinity(self, devices: t.List[int]) -> None:
105+
"""Set the GPU affinity for this job
106+
107+
:param devices: list of GPU indices to execute on.
108+
"""
109+
self.run_args["gpu-affinity"] = ",".join(str(device) for device in devices)

0 commit comments

Comments
 (0)