From d1dc5e26c7407f74662d895a9a70b41abf06a753 Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Mon, 25 Jan 2021 10:39:11 -0800 Subject: [PATCH 1/7] WIP --- ml-agents/mlagents/trainers/cli_utils.py | 7 +++++++ ml-agents/mlagents/trainers/settings.py | 5 +++++ 2 files changed, 12 insertions(+) diff --git a/ml-agents/mlagents/trainers/cli_utils.py b/ml-agents/mlagents/trainers/cli_utils.py index 5cc9d7c292..9dd8184b6b 100644 --- a/ml-agents/mlagents/trainers/cli_utils.py +++ b/ml-agents/mlagents/trainers/cli_utils.py @@ -252,6 +252,13 @@ def _create_parser() -> argparse.ArgumentParser: help="Whether to run the Unity executable in no-graphics mode (i.e. without initializing " "the graphics driver. Use this only if your agents don't use visual observations.", ) + + torch_conf = argparser.add_argument_group(title="Torch Configuration") + torch_conf.add_argument( + "--torch-device", + default=None, + help='Settings for the default torch.device used in training, for example, "cpu", "cuda", or "cuda:0"', + ) return argparser diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 9f47bd567b..2630f5896c 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -733,6 +733,11 @@ class EngineSettings: no_graphics: bool = parser.get_default("no_graphics") +@attr.s(auto_attribs=True) +class TorchSettings: + device: Optional[str] = parser.get_default("torch_device") + + @attr.s(auto_attribs=True) class RunOptions(ExportableSettings): default_settings: Optional[TrainerSettings] = None From a5e58042619f437676aacb922188a2fbd151626a Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Wed, 27 Jan 2021 13:46:40 -0800 Subject: [PATCH 2/7] set torch config --- ml-agents/mlagents/torch_utils/__init__.py | 1 + ml-agents/mlagents/torch_utils/torch.py | 33 ++++++++++++++---- ml-agents/mlagents/trainers/cli_utils.py | 2 ++ ml-agents/mlagents/trainers/learn.py | 1 + ml-agents/mlagents/trainers/settings.py | 4 +++ .../trainers/tests/test_torch_utils.py | 34 +++++++++++++++++++ 6 files changed, 68 insertions(+), 7 deletions(-) create mode 100644 ml-agents/mlagents/trainers/tests/test_torch_utils.py diff --git a/ml-agents/mlagents/torch_utils/__init__.py b/ml-agents/mlagents/torch_utils/__init__.py index 9ba35a3500..0acc96997d 100644 --- a/ml-agents/mlagents/torch_utils/__init__.py +++ b/ml-agents/mlagents/torch_utils/__init__.py @@ -1,3 +1,4 @@ from mlagents.torch_utils.torch import torch as torch # noqa from mlagents.torch_utils.torch import nn # noqa +from mlagents.torch_utils.torch import set_torch_config # noqa from mlagents.torch_utils.torch import default_device # noqa diff --git a/ml-agents/mlagents/torch_utils/torch.py b/ml-agents/mlagents/torch_utils/torch.py index a3cb67ddf5..81649f7a16 100644 --- a/ml-agents/mlagents/torch_utils/torch.py +++ b/ml-agents/mlagents/torch_utils/torch.py @@ -3,6 +3,11 @@ from distutils.version import LooseVersion import pkg_resources from mlagents.torch_utils import cpu_utils +from mlagents.trainers.settings import TorchSettings +from mlagents_envs.logging_util import get_logger + + +logger = get_logger(__name__) def assert_torch_installed(): @@ -32,14 +37,28 @@ def assert_torch_installed(): torch.set_num_threads(cpu_utils.get_num_threads_to_use()) os.environ["KMP_BLOCKTIME"] = "0" -if torch.cuda.is_available(): - torch.set_default_tensor_type(torch.cuda.FloatTensor) - device = torch.device("cuda") -else: - torch.set_default_tensor_type(torch.FloatTensor) - device = torch.device("cpu") +_device = torch.device("cpu") + + +def set_torch_config(torch_settings: TorchSettings) -> None: + global _device + + if torch_settings.device is None: + device_str = "cuda" if torch.cuda.is_available() else "cpu" + else: + device_str = torch_settings.device + + _device = torch.device(device_str) + + if _device.type == "cuda": + torch.set_default_tensor_type(torch.cuda.FloatTensor) + else: + torch.set_default_tensor_type(torch.FloatTensor) + logger.info(f"default Torch device: {_device}") + + nn = torch.nn def default_device(): - return device + return _device diff --git a/ml-agents/mlagents/trainers/cli_utils.py b/ml-agents/mlagents/trainers/cli_utils.py index 9dd8184b6b..c0c34c719c 100644 --- a/ml-agents/mlagents/trainers/cli_utils.py +++ b/ml-agents/mlagents/trainers/cli_utils.py @@ -257,6 +257,8 @@ def _create_parser() -> argparse.ArgumentParser: torch_conf.add_argument( "--torch-device", default=None, + dest="device", + action=DetectDefault, help='Settings for the default torch.device used in training, for example, "cpu", "cuda", or "cuda:0"', ) return argparser diff --git a/ml-agents/mlagents/trainers/learn.py b/ml-agents/mlagents/trainers/learn.py index 82b65f59b4..bdedba2d20 100644 --- a/ml-agents/mlagents/trainers/learn.py +++ b/ml-agents/mlagents/trainers/learn.py @@ -62,6 +62,7 @@ def run_training(run_seed: int, options: RunOptions) -> None: :param run_options: Command line arguments for training. """ with hierarchical_timer("run_training.setup"): + torch_utils.set_torch_config(options.torch_settings) checkpoint_settings = options.checkpoint_settings env_settings = options.env_settings engine_settings = options.engine_settings diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py index 2630f5896c..02865c96f3 100644 --- a/ml-agents/mlagents/trainers/settings.py +++ b/ml-agents/mlagents/trainers/settings.py @@ -748,6 +748,7 @@ class RunOptions(ExportableSettings): engine_settings: EngineSettings = attr.ib(factory=EngineSettings) environment_parameters: Optional[Dict[str, EnvironmentParameterSettings]] = None checkpoint_settings: CheckpointSettings = attr.ib(factory=CheckpointSettings) + torch_settings: TorchSettings = attr.ib(factory=TorchSettings) # These are options that are relevant to the run itself, and not the engine or environment. # They will be left here. @@ -789,6 +790,7 @@ def from_argparse(args: argparse.Namespace) -> "RunOptions": "checkpoint_settings": {}, "env_settings": {}, "engine_settings": {}, + "torch_settings": {}, } if config_path is not None: configured_dict.update(load_config(config_path)) @@ -813,6 +815,8 @@ def from_argparse(args: argparse.Namespace) -> "RunOptions": configured_dict["env_settings"][key] = val elif key in attr.fields_dict(EngineSettings): configured_dict["engine_settings"][key] = val + elif key in attr.fields_dict(TorchSettings): + configured_dict["torch_settings"][key] = val else: # Base options configured_dict[key] = val diff --git a/ml-agents/mlagents/trainers/tests/test_torch_utils.py b/ml-agents/mlagents/trainers/tests/test_torch_utils.py new file mode 100644 index 0000000000..c86b177d15 --- /dev/null +++ b/ml-agents/mlagents/trainers/tests/test_torch_utils.py @@ -0,0 +1,34 @@ +import pytest +from unittest import mock + +import torch # noqa I201 + +from mlagents.torch_utils import set_torch_config, default_device +from mlagents.trainers.settings import TorchSettings + + +@pytest.mark.parametrize( + "device_str, expected_type, expected_index, expected_tensor_type", + [ + ("cpu", "cpu", None, torch.FloatTensor), + ("cuda", "cuda", None, torch.cuda.FloatTensor), + ("cuda:42", "cuda", 42, torch.cuda.FloatTensor), + ("opengl", "opengl", None, torch.FloatTensor), + ], +) +@mock.patch.object(torch, "set_default_tensor_type") +def test_set_torch_device( + mock_set_default_tensor_type, + device_str, + expected_type, + expected_index, + expected_tensor_type, +): + torch_settings = TorchSettings(device=device_str) + set_torch_config(torch_settings) + assert default_device().type == expected_type + if expected_index is None: + assert default_device().index is None + else: + assert default_device().index == expected_index + mock_set_default_tensor_type.assert_called_once_with(expected_tensor_type) From 3ed9c750b164cf01a13bc49006bea5de2d475c7d Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Wed, 27 Jan 2021 13:55:26 -0800 Subject: [PATCH 3/7] changelog --- com.unity.ml-agents/CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md index b74c70411b..22d00f78c1 100755 --- a/com.unity.ml-agents/CHANGELOG.md +++ b/com.unity.ml-agents/CHANGELOG.md @@ -25,6 +25,8 @@ removed when training with a player. The Editor still requires it to be clamped Changed the namespace and file names of classes in com.unity.ml-agents.extensions. (#4849) #### ml-agents / ml-agents-envs / gym-unity (Python) +- Added a `--torch-device` commandline option to `mlagent-learn`, which sets the default + [`torch.device`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.torch.device) used for training. (#4888) ### Bug Fixes #### com.unity.ml-agents (C#) From 9d9637f5139ff8a97a27c5c5ac79cb972b05b24d Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Wed, 27 Jan 2021 14:04:01 -0800 Subject: [PATCH 4/7] better default, restore device after test --- ml-agents/mlagents/torch_utils/torch.py | 4 ++++ .../trainers/tests/test_torch_utils.py | 23 ++++++++++++------- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/ml-agents/mlagents/torch_utils/torch.py b/ml-agents/mlagents/torch_utils/torch.py index 81649f7a16..ddccd7a179 100644 --- a/ml-agents/mlagents/torch_utils/torch.py +++ b/ml-agents/mlagents/torch_utils/torch.py @@ -37,6 +37,7 @@ def assert_torch_installed(): torch.set_num_threads(cpu_utils.get_num_threads_to_use()) os.environ["KMP_BLOCKTIME"] = "0" + _device = torch.device("cpu") @@ -57,6 +58,9 @@ def set_torch_config(torch_settings: TorchSettings) -> None: logger.info(f"default Torch device: {_device}") +# Initialize to default settings +set_torch_config(TorchSettings(device=None)) + nn = torch.nn diff --git a/ml-agents/mlagents/trainers/tests/test_torch_utils.py b/ml-agents/mlagents/trainers/tests/test_torch_utils.py index c86b177d15..7146831319 100644 --- a/ml-agents/mlagents/trainers/tests/test_torch_utils.py +++ b/ml-agents/mlagents/trainers/tests/test_torch_utils.py @@ -24,11 +24,18 @@ def test_set_torch_device( expected_index, expected_tensor_type, ): - torch_settings = TorchSettings(device=device_str) - set_torch_config(torch_settings) - assert default_device().type == expected_type - if expected_index is None: - assert default_device().index is None - else: - assert default_device().index == expected_index - mock_set_default_tensor_type.assert_called_once_with(expected_tensor_type) + try: + torch_settings = TorchSettings(device=device_str) + set_torch_config(torch_settings) + assert default_device().type == expected_type + if expected_index is None: + assert default_device().index is None + else: + assert default_device().index == expected_index + mock_set_default_tensor_type.assert_called_once_with(expected_tensor_type) + except Exception: + raise + finally: + # restore the defaults + torch_settings = TorchSettings(device=None) + set_torch_config(torch_settings) From 80f3bdf531054e3a5aca725b03577e25b9e16cf0 Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Wed, 27 Jan 2021 14:47:29 -0800 Subject: [PATCH 5/7] remove --cpu, docs --- com.unity.ml-agents/CHANGELOG.md | 1 + docs/Training-ML-Agents.md | 9 ++++++++- ml-agents/mlagents/trainers/cli_utils.py | 6 ------ 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md index 22d00f78c1..42ab24af72 100755 --- a/com.unity.ml-agents/CHANGELOG.md +++ b/com.unity.ml-agents/CHANGELOG.md @@ -27,6 +27,7 @@ removed when training with a player. The Editor still requires it to be clamped #### ml-agents / ml-agents-envs / gym-unity (Python) - Added a `--torch-device` commandline option to `mlagent-learn`, which sets the default [`torch.device`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.torch.device) used for training. (#4888) +- The `--cpu` commandline option had no effect and was removed. Use `--torch-device=cpu` to force CPU training. (#4888) ### Bug Fixes #### com.unity.ml-agents (C#) diff --git a/docs/Training-ML-Agents.md b/docs/Training-ML-Agents.md index 7919c3deda..f335fc489e 100644 --- a/docs/Training-ML-Agents.md +++ b/docs/Training-ML-Agents.md @@ -188,7 +188,8 @@ using the help utility: mlagents-learn --help ``` -These additional CLI arguments are grouped into environment, engine and checkpoint. The available settings and example values are shown below. +These additional CLI arguments are grouped into environment, engine, checkpoint and torch. +The available settings and example values are shown below. #### Environment settings @@ -227,6 +228,12 @@ checkpoint_settings: inference: false ``` +#### Torch settings: +```yaml +torch_settings: + device: null +``` + ### Behavior Configurations The primary section of the trainer config file is a diff --git a/ml-agents/mlagents/trainers/cli_utils.py b/ml-agents/mlagents/trainers/cli_utils.py index c0c34c719c..6849731600 100644 --- a/ml-agents/mlagents/trainers/cli_utils.py +++ b/ml-agents/mlagents/trainers/cli_utils.py @@ -177,12 +177,6 @@ def _create_parser() -> argparse.ArgumentParser: "passed to the executable.", action=DetectDefault, ) - argparser.add_argument( - "--cpu", - default=False, - action=DetectDefaultStoreTrue, - help="Forces training using CPU only", - ) argparser.add_argument( "--torch", default=False, From 26a497bdcd4ca756dd7cb99460146f6755e1ead5 Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Wed, 27 Jan 2021 14:49:07 -0800 Subject: [PATCH 6/7] better example --- docs/Training-ML-Agents.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/Training-ML-Agents.md b/docs/Training-ML-Agents.md index f335fc489e..c6811d7fd7 100644 --- a/docs/Training-ML-Agents.md +++ b/docs/Training-ML-Agents.md @@ -229,9 +229,10 @@ checkpoint_settings: ``` #### Torch settings: + ```yaml torch_settings: - device: null + device: cpu ``` ### Behavior Configurations From 7eaa905a0642be6f92972a062a0e803708b4fb28 Mon Sep 17 00:00:00 2001 From: Chris Elion Date: Wed, 27 Jan 2021 17:38:55 -0800 Subject: [PATCH 7/7] [skip ci] more than one agent --- com.unity.ml-agents/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md index 42ab24af72..7e24935720 100755 --- a/com.unity.ml-agents/CHANGELOG.md +++ b/com.unity.ml-agents/CHANGELOG.md @@ -25,7 +25,7 @@ removed when training with a player. The Editor still requires it to be clamped Changed the namespace and file names of classes in com.unity.ml-agents.extensions. (#4849) #### ml-agents / ml-agents-envs / gym-unity (Python) -- Added a `--torch-device` commandline option to `mlagent-learn`, which sets the default +- Added a `--torch-device` commandline option to `mlagents-learn`, which sets the default [`torch.device`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.torch.device) used for training. (#4888) - The `--cpu` commandline option had no effect and was removed. Use `--torch-device=cpu` to force CPU training. (#4888)