Skip to content

Commit 4f899c3

Browse files
committed
fixes per review comments, fix bug in from_run_args
1 parent e4aab01 commit 4f899c3

File tree

9 files changed

+306
-29
lines changed

9 files changed

+306
-29
lines changed

doc/dragon.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -104,9 +104,9 @@ as a list of device indices.
104104
105105
.. note::
106106

107-
SmartSim submits jobs in the order they are received. On a heterogeneous system, SmartSim
108-
will attempt to allocate non-GPU nodes first. However, a process may be allocated to a GPU
109-
node if only GPU nodes are available, regardless of the requested features.
107+
SmartSim submits jobs in the order they are received. However, a process may
108+
be allocated to a GPU node if only GPU nodes are available, regardless of
109+
the requested features.
110110

111111
To ensure a process is allocated to a specific node, configure a hostname constraint.
112112

smartsim/_core/launcher/dragon/dragonBackend.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -429,7 +429,7 @@ def _stop_steps(self) -> None:
429429

430430
@staticmethod
431431
def create_run_policy(
432-
request: DragonRunRequest, node_name: str
432+
request: DragonRequest, node_name: str
433433
) -> "dragon_policy.Policy":
434434
if isinstance(request, DragonRunRequest):
435435
run_request: DragonRunRequest = request
@@ -439,18 +439,30 @@ def create_run_policy(
439439
cpu_affinity: t.List[int] = []
440440
gpu_affinity: t.List[int] = []
441441

442+
# Customize policy only if the client requested it, otherwise use default
442443
if run_request.policy is not None:
444+
# Don't change device from default unless specified
445+
if run_request.policy.device == "gpu":
446+
device = dragon_policy.Policy.Device.GPU
447+
elif run_request.policy.device == "cpu":
448+
device = dragon_policy.Policy.Device.CPU
449+
450+
# affinities are not mutually exclusive. If specified, both are used
443451
if run_request.policy.cpu_affinity:
444452
affinity = dragon_policy.Policy.Affinity.SPECIFIC
445453
cpu_affinity = run_request.policy.cpu_affinity
446-
device = dragon_policy.Policy.Device.CPU
447454

448455
if run_request.policy.gpu_affinity:
449456
affinity = dragon_policy.Policy.Affinity.SPECIFIC
450457
gpu_affinity = run_request.policy.gpu_affinity
451-
device = dragon_policy.Policy.Device.GPU
452458

453-
if affinity != dragon_policy.Policy.Affinity.DEFAULT:
459+
# TODO: if device == "cpu" and gpu_affinity is set, will it fail?
460+
# Should we override device in the gpu_affinity setter or raise?
461+
462+
if (
463+
affinity != dragon_policy.Policy.Affinity.DEFAULT
464+
or device != dragon_policy.Policy.Device.DEFAULT
465+
):
454466
return dragon_policy.Policy(
455467
placement=dragon_policy.Policy.Placement.HOST_NAME,
456468
host_name=node_name,

smartsim/_core/schemas/dragonRequests.py

Lines changed: 33 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,10 @@
2626

2727
import typing as t
2828

29-
from pydantic import BaseModel, Field, NonNegativeInt, PositiveInt
29+
from pydantic import BaseModel, Field, NonNegativeInt, PositiveInt, ValidationError
3030

3131
import smartsim._core.schemas.utils as _utils
32+
from smartsim.error.errors import SmartSimError
3233

3334
# Black and Pylint disagree about where to put the `...`
3435
# pylint: disable=multiple-statements
@@ -42,31 +43,50 @@ class DragonRequest(BaseModel): ...
4243
class DragonRunPolicy(BaseModel):
4344
"""Policy specifying hardware constraints when running a Dragon job"""
4445

45-
device: t.Literal["cpu", "gpu"] = Field(default="cpu")
46+
device: t.Literal["cpu", "gpu", ""] = Field(default="")
47+
"""Specify a device category on which to run the job. Uses system default
48+
if not specified"""
4649
cpu_affinity: t.List[NonNegativeInt] = Field(default_factory=list)
50+
"""List of CPU indices to which the job should be pinned"""
4751
gpu_affinity: t.List[NonNegativeInt] = Field(default_factory=list)
52+
"""List of GPU indices to which the job should be pinned"""
4853

4954
@staticmethod
5055
def from_run_args(
5156
run_args: t.Dict[str, t.Union[int, str, float, None]]
5257
) -> "DragonRunPolicy":
58+
"""Create a DragonRunPolicy from a dictionary of run arguments"""
5359
features: str = str(run_args.get("node-feature", ""))
5460

55-
device = "gpu" if "gpu" in features else "cpu"
61+
device = ""
62+
if "gpu" in features:
63+
device = "gpu"
64+
elif "cpu" in features:
65+
device = "cpu"
5666

57-
gpu_args = str(run_args.get("gpu-affinity", ""))
58-
cpu_args = str(run_args.get("cpu-affinity", ""))
59-
gpu_affinity = [x for x in gpu_args.split(",") if x]
60-
cpu_affinity = [x for x in cpu_args.split(",") if x]
67+
gpu_args = ""
68+
if gpu_arg_value := run_args.get("gpu-affinity", None):
69+
gpu_args = str(gpu_arg_value)
6170

62-
if device == "cpu" and not (cpu_affinity or gpu_affinity):
71+
cpu_args = ""
72+
if cpu_arg_value := run_args.get("cpu-affinity", None):
73+
cpu_args = str(cpu_arg_value)
74+
75+
# run args converted to a string must be split back into a list[int]
76+
gpu_affinity = [int(x.strip()) for x in gpu_args.split(",") if x]
77+
cpu_affinity = [int(x.strip()) for x in cpu_args.split(",") if x]
78+
79+
if device == "" and not (cpu_affinity or gpu_affinity):
6380
return DragonRunPolicy()
6481

65-
return DragonRunPolicy(
66-
device=device,
67-
cpu_affinity=cpu_affinity,
68-
gpu_affinity=gpu_affinity,
69-
)
82+
try:
83+
return DragonRunPolicy(
84+
device=device,
85+
cpu_affinity=cpu_affinity,
86+
gpu_affinity=gpu_affinity,
87+
)
88+
except ValidationError as ex:
89+
raise SmartSimError("Unable to build DragonRunPolicy") from ex
7090

7191

7292
class DragonRunRequestView(DragonRequest):

smartsim/settings/dragonRunSettings.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,12 @@ def set_tasks_per_node(self, tasks_per_node: int) -> None:
8383

8484
@override
8585
def set_node_feature(self, feature_list: t.Union[str, t.List[str]]) -> None:
86-
"""Add a node feature requirement
86+
"""Specify the node feature for this job
8787
88-
:param tasks_per_node: number of tasks per node
88+
:param feature_list: a collection of strings representing the
89+
required node features.
90+
91+
Currently supported node features are: "gpu"
8992
"""
9093
if isinstance(feature_list, str):
9194
feature_list = feature_list.strip().split()

tests/test_dragon_client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ def test_dragon_client_main(
180180
assert isinstance(request_arg, DragonRunRequest)
181181

182182
policy = request_arg.policy
183-
assert policy.device == "cpu" if not policy.gpu_affinity else "gpu"
183+
assert policy.device == "" if not policy.gpu_affinity else "gpu"
184184

185185
# make sure each policy has been read in correctly with valid affinity indices
186186
assert len(policy.cpu_affinity) == len(set(policy.cpu_affinity))

tests/test_dragon_launcher.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -701,7 +701,7 @@ def test_run_step_success(test_dir: str) -> None:
701701
assert req_name.startswith(step0.name)
702702

703703
req_policy_device = dragon_run_request.policy.device
704-
assert req_policy_device == "cpu" # default device is cpu
704+
assert req_policy_device == "" # default device is cpu
705705

706706
req_policy_cpu_affinity = dragon_run_request.policy.cpu_affinity
707707
assert not req_policy_cpu_affinity # default should be empty list

0 commit comments

Comments
 (0)