diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index 89144f4053..350c2bfc9e 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -20,11 +20,8 @@ jobs:
python-version: [3.6.x, 3.7.x, 3.8.x]
include:
- python-version: 3.6.x
- pip_constraints: test_constraints_min_version.txt
- python-version: 3.7.x
- pip_constraints: test_constraints_max_tf1_version.txt
- python-version: 3.8.x
- pip_constraints: test_constraints_max_tf2_version.txt
steps:
- uses: actions/checkout@v2
- name: Set up Python
@@ -37,7 +34,7 @@ jobs:
# This path is specific to Ubuntu
path: ~/.cache/pip
# Look to see if there is a cache hit for the corresponding requirements file
- key: ${{ runner.os }}-pip-${{ hashFiles('ml-agents/setup.py', 'ml-agents-envs/setup.py', 'gym-unity/setup.py', 'test_requirements.txt', matrix.pip_constraints) }}
+ key: ${{ runner.os }}-pip-${{ hashFiles('ml-agents/setup.py', 'ml-agents-envs/setup.py', 'gym-unity/setup.py', 'test_requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-
${{ runner.os }}-
@@ -48,10 +45,10 @@ jobs:
# pin pip to workaround https://github.com/pypa/pip/issues/9180
python -m pip install pip==20.2
python -m pip install --upgrade setuptools
- python -m pip install --progress-bar=off -e ./ml-agents-envs -c ${{ matrix.pip_constraints }}
- python -m pip install --progress-bar=off -e ./ml-agents -c ${{ matrix.pip_constraints }}
- python -m pip install --progress-bar=off -r test_requirements.txt -c ${{ matrix.pip_constraints }}
- python -m pip install --progress-bar=off -e ./gym-unity -c ${{ matrix.pip_constraints }}
+ python -m pip install --progress-bar=off -e ./ml-agents-envs
+ python -m pip install --progress-bar=off -e ./ml-agents
+ python -m pip install --progress-bar=off -r test_requirements.txt
+ python -m pip install --progress-bar=off -e ./gym-unity
- name: Save python dependencies
run: |
pip freeze > pip_versions-${{ matrix.python-version }}.txt
diff --git a/com.unity.ml-agents/CHANGELOG.md b/com.unity.ml-agents/CHANGELOG.md
index 6b8ac35680..3255e8f83d 100755
--- a/com.unity.ml-agents/CHANGELOG.md
+++ b/com.unity.ml-agents/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to
### Major Changes
#### com.unity.ml-agents (C#)
#### ml-agents / ml-agents-envs / gym-unity (Python)
+- TensorFlow trainers have been removed; please use the PyTorch trainers instead. (#4707)
- PyTorch trainers now support training agents with both continuous and discrete action spaces. (#4702)
### Minor Changes
#### com.unity.ml-agents / com.unity.ml-agents.extensions (C#)
diff --git a/docs/ML-Agents-Overview.md b/docs/ML-Agents-Overview.md
index e7696bc41d..9b253e0949 100644
--- a/docs/ML-Agents-Overview.md
+++ b/docs/ML-Agents-Overview.md
@@ -372,7 +372,7 @@ your agent's behavior:
below).
- `rnd`: represents an intrinsic reward signal that encourages exploration
   in sparse-reward environments that is defined by the RND module (see
- below). (Not available for TensorFlow trainers)
+ below).
### Deep Reinforcement Learning
@@ -437,8 +437,6 @@ of the trained model is used as intrinsic reward. The more an Agent visits a sta
more accurate the predictions and the lower the rewards which encourages the Agent to
explore new states with higher prediction errors.
-__Note:__ RND is not available for TensorFlow trainers (only PyTorch trainers)
-
### Imitation Learning
It is often more intuitive to simply demonstrate the behavior we want an agent
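The `rnd` reward signal described in the ML-Agents-Overview.md hunk above is enabled through the trainer configuration file rather than a command line flag. Below is a minimal sketch of such a configuration; the behavior name `MyBehavior` and the numeric values are illustrative placeholders, not values taken from this diff.

```yaml
behaviors:
  MyBehavior:          # placeholder behavior name
    trainer_type: ppo
    reward_signals:
      extrinsic:
        strength: 1.0  # weight of the environment (extrinsic) reward
        gamma: 0.99
      rnd:             # intrinsic Random Network Distillation reward
        strength: 0.01 # illustrative weight; tune per environment
        gamma: 0.99
```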
diff --git a/docs/Training-Configuration-File.md b/docs/Training-Configuration-File.md
index 7c923763cf..d6d55b0939 100644
--- a/docs/Training-Configuration-File.md
+++ b/docs/Training-Configuration-File.md
@@ -32,7 +32,7 @@ choice of the trainer (which we review on subsequent sections).
| `time_horizon` | (default = `64`) How many steps of experience to collect per-agent before adding it to the experience buffer. When this limit is reached before the end of an episode, a value estimate is used to predict the overall expected reward from the agent's current state. As such, this parameter trades off between a less biased, but higher variance estimate (long time horizon) and more biased, but less varied estimate (short time horizon). In cases where there are frequent rewards within an episode, or episodes are prohibitively large, a smaller number can be more ideal. This number should be large enough to capture all the important behavior within a sequence of an agent's actions.
Typical range: `32` - `2048` |
| `max_steps` | (default = `500000`) Total number of steps (i.e., observation collected and action taken) that must be taken in the environment (or across all environments if using multiple in parallel) before ending the training process. If you have multiple agents with the same behavior name within your environment, all steps taken by those agents will contribute to the same `max_steps` count.
Typical range: `5e5` - `1e7` |
| `keep_checkpoints` | (default = `5`) The maximum number of model checkpoints to keep. Checkpoints are saved after the number of steps specified by the checkpoint_interval option. Once the maximum number of checkpoints has been reached, the oldest checkpoint is deleted when saving a new checkpoint. |
-| `checkpoint_interval` | (default = `500000`) The number of experiences collected between each checkpoint by the trainer. A maximum of `keep_checkpoints` checkpoints are saved before old ones are deleted. Each checkpoint saves the `.onnx` (and `.nn` if using TensorFlow) files in `results/` folder.|
+| `checkpoint_interval` | (default = `500000`) The number of experiences collected between each checkpoint by the trainer. A maximum of `keep_checkpoints` checkpoints are saved before old ones are deleted. Each checkpoint saves the `.onnx` file in the `results/` folder.|
| `init_path` | (default = None) Initialize trainer from a previously saved model. Note that the prior run should have used the same trainer configurations as the current run, and have been saved with the same version of ML-Agents.
You should provide the full path to the folder where the checkpoints were saved, e.g. `./models/{run-id}/{behavior_name}`. This option is provided in case you want to initialize different behaviors from different runs; in most cases, it is sufficient to use the `--initialize-from` CLI parameter to initialize all models from the same run. |
| `threaded` | (default = `true`) By default, model updates can happen while the environment is being stepped. This violates the [on-policy](https://spinningup.openai.com/en/latest/user/algorithms.html#the-on-policy-algorithms) assumption of PPO slightly in exchange for a training speedup. To maintain the strict on-policyness of PPO, you can disable parallel updates by setting `threaded` to `false`. There is usually no reason to turn `threaded` off for SAC. |
| `hyperparameters -> learning_rate` | (default = `3e-4`) Initial learning rate for gradient descent. Corresponds to the strength of each gradient descent update step. This should typically be decreased if training is unstable, and the reward does not consistently increase.
Typical range: `1e-5` - `1e-3` |
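The common trainer settings documented in the table above map directly onto the trainer configuration YAML. The sketch below uses the defaults quoted in the table; the behavior name `MyBehavior` is a placeholder.

```yaml
behaviors:
  MyBehavior:                  # placeholder behavior name
    trainer_type: ppo
    time_horizon: 64           # defaults quoted in the table above
    max_steps: 500000
    keep_checkpoints: 5
    checkpoint_interval: 500000
    threaded: true
    hyperparameters:
      learning_rate: 3.0e-4
```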
diff --git a/docs/Training-ML-Agents.md b/docs/Training-ML-Agents.md
index ae3ddc42c4..7919c3deda 100644
--- a/docs/Training-ML-Agents.md
+++ b/docs/Training-ML-Agents.md
@@ -317,9 +317,6 @@ behaviors:
save_steps: 50000
swap_steps: 2000
team_change: 100000
-
- # use TensorFlow backend
- framework: tensorflow
```
Here is an equivalent file if we use an SAC trainer instead. Notice that the
diff --git a/docs/Unity-Inference-Engine.md b/docs/Unity-Inference-Engine.md
index d7b46f8f04..83655cea2c 100644
--- a/docs/Unity-Inference-Engine.md
+++ b/docs/Unity-Inference-Engine.md
@@ -19,19 +19,6 @@ Graphics Emulation is set to **OpenGL(ES) 3.0 or 2.0 emulation**. Also there
might be non-fatal build time errors when target platform includes Graphics API
that does not support **Unity Compute Shaders**.
-## Supported formats
-
-There are currently two supported model formats:
-
-- Barracuda (`.nn`) files use a proprietary format produced by the
- [`tensorflow_to_barracuda.py`]() script.
-- ONNX (`.onnx`) files use an
- [industry-standard open format](https://onnx.ai/about.html) produced by the
- [tf2onnx package](https://github.com/onnx/tensorflow-onnx).
-
-Export to ONNX is used if using PyTorch (the default). To enable it
-while using TensorFlow, make sure `tf2onnx>=1.6.1` is installed in pip.
-
## Using the Unity Inference Engine
When using a model, drag the model file into the **Model** field in the
@@ -56,7 +43,5 @@ If you wish to run inference on an externally trained model, you should use
Barracuda directly, instead of trying to run it through ML-Agents.
## Model inference outside of Unity
-We do not provide support for inference anywhere outside of Unity. The
-`frozen_graph_def.pb` and `.onnx` files produced by training are open formats
-for TensorFlow and ONNX respectively; if you wish to convert these to another
+We do not provide support for inference anywhere outside of Unity. The `.onnx` files produced by training use the open ONNX format; if you wish to convert these files to another
format or run inference with them, refer to their documentation.
diff --git a/ml-agents-envs/setup.py b/ml-agents-envs/setup.py
index bcd73a82a8..8381d11102 100644
--- a/ml-agents-envs/setup.py
+++ b/ml-agents-envs/setup.py
@@ -48,7 +48,7 @@ def run(self):
install_requires=[
"cloudpickle",
"grpcio>=1.11.0",
- "numpy>=1.14.1,<1.19.0",
+ "numpy>=1.14.1",
"Pillow>=4.2.1",
"protobuf>=3.6",
"pyyaml>=3.1.0",
diff --git a/ml-agents/mlagents/tf_utils/__init__.py b/ml-agents/mlagents/tf_utils/__init__.py
deleted file mode 100644
index b128304716..0000000000
--- a/ml-agents/mlagents/tf_utils/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from mlagents.tf_utils.tf import tf as tf # noqa
-from mlagents.tf_utils.tf import set_warnings_enabled # noqa
-from mlagents.tf_utils.tf import generate_session_config # noqa
-from mlagents.tf_utils.tf import is_available # noqa
diff --git a/ml-agents/mlagents/tf_utils/tf.py b/ml-agents/mlagents/tf_utils/tf.py
deleted file mode 100644
index 457cf01a0e..0000000000
--- a/ml-agents/mlagents/tf_utils/tf.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# This should be the only place that we import tensorflow directly.
-# Everywhere else is caught by the banned-modules setting for flake8
-
-from distutils.version import LooseVersion
-
-try:
- import tensorflow as tf # noqa I201
-
- # LooseVersion handles things "1.2.3a" or "4.5.6-rc7" fairly sensibly.
- _is_tensorflow2 = LooseVersion(tf.__version__) >= LooseVersion("2.0.0")
-
- if _is_tensorflow2:
- import tensorflow.compat.v1 as tf
-
- tf.disable_v2_behavior()
- tf_logging = tf.logging
- else:
- try:
- # Newer versions of tf 1.x will complain that tf.logging is deprecated
- tf_logging = tf.compat.v1.logging
- except AttributeError:
- # Fall back to the safe import, even if it might generate a warning or two.
- tf_logging = tf.logging
-except ImportError:
- tf = None
-
-
-def is_available():
- """
- Returns whether Torch is available in this Python environment
- """
- return tf is not None
-
-
-def set_warnings_enabled(is_enabled: bool) -> None:
- """
- Enable or disable tensorflow warnings (notably, this disables deprecation warnings.
- :param is_enabled:
- """
- if is_available():
- level = tf_logging.WARN if is_enabled else tf_logging.ERROR
- tf_logging.set_verbosity(level)
-
-
-def generate_session_config() -> "tf.ConfigProto":
- """
- Generate a ConfigProto to use for ML-Agents that doesn't consume all of the GPU memory
- and allows for soft placement in the case of multi-GPU.
- """
- if is_available():
- config = tf.ConfigProto()
- config.gpu_options.allow_growth = True
- # For multi-GPU training, set allow_soft_placement to True to allow
- # placing the operation into an alternative device automatically
- # to prevent from exceptions if the device doesn't suppport the operation
- # or the device does not exist
- config.allow_soft_placement = True
- return config
- else:
- return None
diff --git a/ml-agents/mlagents/tf_utils/globals.py b/ml-agents/mlagents/torch_utils/globals.py
similarity index 100%
rename from ml-agents/mlagents/tf_utils/globals.py
rename to ml-agents/mlagents/torch_utils/globals.py
diff --git a/ml-agents/mlagents/trainers/cli_utils.py b/ml-agents/mlagents/trainers/cli_utils.py
index 9acae72b7e..5cc9d7c292 100644
--- a/ml-agents/mlagents/trainers/cli_utils.py
+++ b/ml-agents/mlagents/trainers/cli_utils.py
@@ -4,6 +4,21 @@
from mlagents.trainers.exception import TrainerConfigError
from mlagents_envs.environment import UnityEnvironment
import argparse
+from mlagents_envs import logging_util
+
+logger = logging_util.get_logger(__name__)
+
+
+class RaiseRemovedWarning(argparse.Action):
+ """
+    Internal custom Action that logs a warning when a removed argument is used.
+ """
+
+ def __init__(self, nargs=0, **kwargs):
+ super().__init__(nargs=nargs, **kwargs)
+
+ def __call__(self, arg_parser, namespace, values, option_string=None):
+ logger.warning(f"The command line argument {option_string} was removed.")
class DetectDefault(argparse.Action):
@@ -171,16 +186,14 @@ def _create_parser() -> argparse.ArgumentParser:
argparser.add_argument(
"--torch",
default=False,
- action=DetectDefaultStoreTrue,
- help="Use the PyTorch framework. Note that this option is not required anymore as PyTorch is the"
- "default framework, and will be removed in the next release.",
+ action=RaiseRemovedWarning,
+ help="(Removed) Use the PyTorch framework.",
)
argparser.add_argument(
"--tensorflow",
default=False,
- action=DetectDefaultStoreTrue,
- help="(Deprecated) Use the TensorFlow framework instead of PyTorch. Install TensorFlow "
- "before using this option.",
+ action=RaiseRemovedWarning,
+ help="(Removed) Use the TensorFlow framework.",
)
eng_conf = argparser.add_argument_group(title="Engine Configuration")
diff --git a/ml-agents/mlagents/trainers/learn.py b/ml-agents/mlagents/trainers/learn.py
index 27cf0f2731..b5955adfa0 100644
--- a/ml-agents/mlagents/trainers/learn.py
+++ b/ml-agents/mlagents/trainers/learn.py
@@ -10,7 +10,6 @@
import mlagents.trainers
import mlagents_envs
-from mlagents import tf_utils
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
from mlagents.trainers.trainer import TrainerFactory
@@ -21,7 +20,7 @@
GaugeWriter,
ConsoleWriter,
)
-from mlagents.trainers.cli_utils import parser, DetectDefault
+from mlagents.trainers.cli_utils import parser
from mlagents_envs.environment import UnityEnvironment
from mlagents.trainers.settings import RunOptions
@@ -135,8 +134,6 @@ def run_training(run_seed: int, options: RunOptions) -> None:
param_manager=env_parameter_manager,
init_path=maybe_init_path,
multi_gpu=False,
- force_torch="torch" in DetectDefault.non_default_args,
- force_tensorflow="tensorflow" in DetectDefault.non_default_args,
)
# Create controller and begin training.
tc = TrainerController(
@@ -242,8 +239,6 @@ def run_cli(options: RunOptions) -> None:
log_level = logging_util.DEBUG
else:
log_level = logging_util.INFO
- # disable noisy warnings from tensorflow
- tf_utils.set_warnings_enabled(False)
logging_util.set_log_level(log_level)
diff --git a/ml-agents/mlagents/trainers/model_saver/tf_model_saver.py b/ml-agents/mlagents/trainers/model_saver/tf_model_saver.py
deleted file mode 100644
index 8463ca4416..0000000000
--- a/ml-agents/mlagents/trainers/model_saver/tf_model_saver.py
+++ /dev/null
@@ -1,175 +0,0 @@
-import os
-import shutil
-from typing import Optional, Union, cast
-from mlagents_envs.exception import UnityPolicyException
-from mlagents_envs.logging_util import get_logger
-from mlagents.tf_utils import tf
-from mlagents.trainers.model_saver.model_saver import BaseModelSaver
-from mlagents.trainers.tf.model_serialization import export_policy_model
-from mlagents.trainers.settings import TrainerSettings, SerializationSettings
-from mlagents.trainers.policy.tf_policy import TFPolicy
-from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
-from mlagents.trainers import __version__
-
-
-logger = get_logger(__name__)
-
-
-class TFModelSaver(BaseModelSaver):
- """
- ModelSaver class for TensorFlow
- """
-
- def __init__(
- self, trainer_settings: TrainerSettings, model_path: str, load: bool = False
- ):
- super().__init__()
- self.model_path = model_path
- self.initialize_path = trainer_settings.init_path
- self._keep_checkpoints = trainer_settings.keep_checkpoints
- self.load = load
-
- # Currently only support saving one policy. This is the one to be saved.
- self.policy: Optional[TFPolicy] = None
- self.graph = None
- self.sess = None
- self.tf_saver = None
-
- def register(self, module: Union[TFPolicy, TFOptimizer]) -> None:
- if isinstance(module, TFPolicy):
- self._register_policy(module)
- elif isinstance(module, TFOptimizer):
- self._register_optimizer(module)
- else:
- raise UnityPolicyException(
- "Registering Object of unsupported type {} to Saver ".format(
- type(module)
- )
- )
-
- def _register_policy(self, policy: TFPolicy) -> None:
- if self.policy is None:
- self.policy = policy
- self.graph = self.policy.graph
- self.sess = self.policy.sess
- with self.policy.graph.as_default():
- self.tf_saver = tf.train.Saver(max_to_keep=self._keep_checkpoints)
-
- def save_checkpoint(self, behavior_name: str, step: int) -> str:
- checkpoint_path = os.path.join(self.model_path, f"{behavior_name}-{step}")
- # Save the TF checkpoint and graph definition
- if self.graph:
- with self.graph.as_default():
- if self.tf_saver:
- self.tf_saver.save(self.sess, f"{checkpoint_path}.ckpt")
- tf.train.write_graph(
- self.graph, self.model_path, "raw_graph_def.pb", as_text=False
- )
- # also save the policy so we have optimized model files for each checkpoint
- self.export(checkpoint_path, behavior_name)
- return checkpoint_path
-
- def export(self, output_filepath: str, behavior_name: str) -> None:
- # save model if there is only one worker or
- # only on worker-0 if there are multiple workers
- if self.policy and self.policy.rank is not None and self.policy.rank != 0:
- return
- if self.graph is None:
- logger.info("No model to export")
- return
- export_policy_model(
- self.model_path, output_filepath, behavior_name, self.graph, self.sess
- )
-
- def initialize_or_load(self, policy: Optional[TFPolicy] = None) -> None:
- # If there is an initialize path, load from that. Else, load from the set model path.
- # If load is set to True, don't reset steps to 0. Else, do. This allows a user to,
- # e.g., resume from an initialize path.
- if policy is None:
- policy = self.policy
- policy = cast(TFPolicy, policy)
- reset_steps = not self.load
- if self.initialize_path is not None:
- self._load_graph(
- policy, self.initialize_path, reset_global_steps=reset_steps
- )
- elif self.load:
- self._load_graph(policy, self.model_path, reset_global_steps=reset_steps)
- else:
- policy.initialize()
- TFPolicy.broadcast_global_variables(0)
-
- def _load_graph(
- self, policy: TFPolicy, model_path: str, reset_global_steps: bool = False
- ) -> None:
- # This prevents normalizer init up from executing on load
- policy.first_normalization_update = False
- with policy.graph.as_default():
- logger.info(f"Loading model from {model_path}.")
- ckpt = tf.train.get_checkpoint_state(model_path)
- if ckpt is None:
- raise UnityPolicyException(
- "The model {} could not be loaded. Make "
- "sure you specified the right "
- "--run-id and that the previous run you are loading from had the same "
- "behavior names.".format(model_path)
- )
- if self.tf_saver:
- try:
- self.tf_saver.restore(policy.sess, ckpt.model_checkpoint_path)
- except tf.errors.NotFoundError:
- raise UnityPolicyException(
- "The model {} was found but could not be loaded. Make "
- "sure the model is from the same version of ML-Agents, has the same behavior parameters, "
- "and is using the same trainer configuration as the current run.".format(
- model_path
- )
- )
- self._check_model_version(__version__)
- if reset_global_steps:
- policy.set_step(0)
- logger.info(
- "Starting training from step 0 and saving to {}.".format(
- self.model_path
- )
- )
- else:
- logger.info(f"Resuming training from step {policy.get_current_step()}.")
-
- def _check_model_version(self, version: str) -> None:
- """
- Checks whether the model being loaded was created with the same version of
- ML-Agents, and throw a warning if not so.
- """
- if self.policy is not None and self.policy.version_tensors is not None:
- loaded_ver = tuple(
- num.eval(session=self.sess) for num in self.policy.version_tensors
- )
- if loaded_ver != TFPolicy._convert_version_string(version):
- logger.warning(
- f"The model checkpoint you are loading from was saved with ML-Agents version "
- f"{loaded_ver[0]}.{loaded_ver[1]}.{loaded_ver[2]} but your current ML-Agents"
- f"version is {version}. Model may not behave properly."
- )
-
- def copy_final_model(self, source_nn_path: str) -> None:
- """
- Copy the .nn file at the given source to the destination.
- Also copies the corresponding .onnx file if it exists.
- """
- final_model_name = os.path.splitext(source_nn_path)[0]
-
- if SerializationSettings.convert_to_barracuda:
- source_path = f"{final_model_name}.nn"
- destination_path = f"{self.model_path}.nn"
- shutil.copyfile(source_path, destination_path)
- logger.info(f"Copied {source_path} to {destination_path}.")
-
- if SerializationSettings.convert_to_onnx:
- try:
- source_path = f"{final_model_name}.onnx"
- destination_path = f"{self.model_path}.onnx"
- shutil.copyfile(source_path, destination_path)
- logger.info(f"Copied {source_path} to {destination_path}.")
- except OSError:
- pass
diff --git a/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py b/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py
deleted file mode 100644
index f4e432366a..0000000000
--- a/ml-agents/mlagents/trainers/optimizer/tf_optimizer.py
+++ /dev/null
@@ -1,168 +0,0 @@
-from typing import Dict, Any, List, Tuple, Optional
-import numpy as np
-
-from mlagents.tf_utils.tf import tf
-from mlagents.trainers.buffer import AgentBuffer
-from mlagents.trainers.policy.tf_policy import TFPolicy
-from mlagents.trainers.optimizer import Optimizer
-from mlagents.trainers.trajectory import SplitObservations
-from mlagents.trainers.tf.components.reward_signals.reward_signal_factory import (
- create_reward_signal,
-)
-from mlagents.trainers.settings import TrainerSettings, RewardSignalType
-from mlagents.trainers.tf.components.bc.module import BCModule
-
-
-class TFOptimizer(Optimizer): # pylint: disable=W0223
- def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
- super().__init__()
- self.sess = policy.sess
- self.policy = policy
- self.update_dict: Dict[str, tf.Tensor] = {}
- self.value_heads: Dict[str, tf.Tensor] = {}
- self.create_reward_signals(trainer_params.reward_signals)
- self.memory_in: tf.Tensor = None
- self.memory_out: tf.Tensor = None
- self.m_size: int = 0
- self.bc_module: Optional[BCModule] = None
- # Create pretrainer if needed
- if trainer_params.behavioral_cloning is not None:
- self.bc_module = BCModule(
- self.policy,
- trainer_params.behavioral_cloning,
- policy_learning_rate=trainer_params.hyperparameters.learning_rate,
- default_batch_size=trainer_params.hyperparameters.batch_size,
- default_num_epoch=3,
- )
-
- def get_trajectory_value_estimates(
- self, batch: AgentBuffer, next_obs: List[np.ndarray], done: bool
- ) -> Tuple[Dict[str, np.ndarray], Dict[str, float]]:
- feed_dict: Dict[tf.Tensor, Any] = {
- self.policy.batch_size_ph: batch.num_experiences,
- self.policy.sequence_length_ph: batch.num_experiences, # We want to feed data in batch-wise, not time-wise.
- }
-
- if self.policy.vec_obs_size > 0:
- feed_dict[self.policy.vector_in] = batch["vector_obs"]
- if self.policy.vis_obs_size > 0:
- for i in range(len(self.policy.visual_in)):
- _obs = batch["visual_obs%d" % i]
- feed_dict[self.policy.visual_in[i]] = _obs
- if self.policy.use_recurrent:
- feed_dict[self.policy.memory_in] = [
- np.zeros((self.policy.m_size), dtype=np.float32)
- ]
- feed_dict[self.memory_in] = [np.zeros((self.m_size), dtype=np.float32)]
- if self.policy.prev_action is not None:
- feed_dict[self.policy.prev_action] = batch["prev_action"]
-
- if self.policy.use_recurrent:
- value_estimates, policy_mem, value_mem = self.sess.run(
- [self.value_heads, self.policy.memory_out, self.memory_out], feed_dict
- )
- prev_action = (
- batch["discrete_action"][-1]
- if not self.policy.use_continuous_act
- else None
- )
- else:
- value_estimates = self.sess.run(self.value_heads, feed_dict)
- prev_action = None
- policy_mem = None
- value_mem = None
- value_estimates = {k: np.squeeze(v, axis=1) for k, v in value_estimates.items()}
-
- # We do this in a separate step to feed the memory outs - a further optimization would
- # be to append to the obs before running sess.run.
- final_value_estimates = self._get_value_estimates(
- next_obs, done, policy_mem, value_mem, prev_action
- )
-
- return value_estimates, final_value_estimates
-
- def _get_value_estimates(
- self,
- next_obs: List[np.ndarray],
- done: bool,
- policy_memory: np.ndarray = None,
- value_memory: np.ndarray = None,
- prev_action: np.ndarray = None,
- ) -> Dict[str, float]:
- """
- Generates value estimates for bootstrapping.
- :param experience: AgentExperience to be used for bootstrapping.
- :param done: Whether or not this is the last element of the episode, in which case the value estimate will be 0.
- :return: The value estimate dictionary with key being the name of the reward signal and the value the
- corresponding value estimate.
- """
-
- feed_dict: Dict[tf.Tensor, Any] = {
- self.policy.batch_size_ph: 1,
- self.policy.sequence_length_ph: 1,
- }
- vec_vis_obs = SplitObservations.from_observations(next_obs)
- for i in range(len(vec_vis_obs.visual_observations)):
- feed_dict[self.policy.visual_in[i]] = [vec_vis_obs.visual_observations[i]]
-
- if self.policy.vec_obs_size > 0:
- feed_dict[self.policy.vector_in] = [vec_vis_obs.vector_observations]
- if policy_memory is not None:
- feed_dict[self.policy.memory_in] = policy_memory
- if value_memory is not None:
- feed_dict[self.memory_in] = value_memory
- if prev_action is not None:
- feed_dict[self.policy.prev_action] = [prev_action]
- value_estimates = self.sess.run(self.value_heads, feed_dict)
-
- value_estimates = {k: float(v) for k, v in value_estimates.items()}
-
- # If we're done, reassign all of the value estimates that need terminal states.
- if done:
- for k in value_estimates:
- if self.reward_signals[k].use_terminal_states:
- value_estimates[k] = 0.0
-
- return value_estimates
-
- def create_reward_signals(
- self, reward_signal_configs: Dict[RewardSignalType, Any]
- ) -> None:
- """
- Create reward signals
- :param reward_signal_configs: Reward signal config.
- """
- # Create reward signals
- for reward_signal, settings in reward_signal_configs.items():
- # Name reward signals by string in case we have duplicates later
- self.reward_signals[reward_signal.value] = create_reward_signal(
- self.policy, reward_signal, settings
- )
- self.update_dict.update(
- self.reward_signals[reward_signal.value].update_dict
- )
-
- @classmethod
- def create_optimizer_op(
- cls, learning_rate: tf.Tensor, name: str = "Adam"
- ) -> tf.train.Optimizer:
- return tf.train.AdamOptimizer(learning_rate=learning_rate, name=name)
-
- def _execute_model(
- self, feed_dict: Dict[tf.Tensor, np.ndarray], out_dict: Dict[str, tf.Tensor]
- ) -> Dict[str, np.ndarray]:
- """
- Executes model.
- :param feed_dict: Input dictionary mapping nodes to input data.
- :param out_dict: Output dictionary mapping names to nodes.
- :return: Dictionary mapping names to input data.
- """
- network_out = self.sess.run(list(out_dict.values()), feed_dict=feed_dict)
- run_out = dict(zip(list(out_dict.keys()), network_out))
- return run_out
-
- def _make_zero_mem(self, m_size: int, length: int) -> List[np.ndarray]:
- return [
- np.zeros((m_size), dtype=np.float32)
- for i in range(0, length, self.policy.sequence_length)
- ]
diff --git a/ml-agents/mlagents/trainers/policy/tf_policy.py b/ml-agents/mlagents/trainers/policy/tf_policy.py
deleted file mode 100644
index 6f2bd7b310..0000000000
--- a/ml-agents/mlagents/trainers/policy/tf_policy.py
+++ /dev/null
@@ -1,630 +0,0 @@
-from typing import Any, Dict, List, Optional, Tuple, Callable
-import numpy as np
-from distutils.version import LooseVersion
-
-from mlagents_envs.timers import timed
-
-from mlagents.tf_utils import tf
-from mlagents import tf_utils
-from mlagents_envs.exception import UnityException
-from mlagents_envs.logging_util import get_logger
-from mlagents.trainers.policy import Policy
-from mlagents.trainers.action_info import ActionInfo
-from mlagents.trainers.trajectory import SplitObservations
-from mlagents.trainers.torch.action_log_probs import LogProbsTuple
-from mlagents.trainers.behavior_id_utils import get_global_agent_id
-from mlagents_envs.base_env import DecisionSteps, ActionTuple, BehaviorSpec
-from mlagents.trainers.tf.models import ModelUtils
-from mlagents.trainers.settings import TrainerSettings, EncoderType
-from mlagents.trainers import __version__
-from mlagents.trainers.tf.distributions import (
- GaussianDistribution,
- MultiCategoricalDistribution,
-)
-from mlagents.tf_utils.globals import get_rank
-
-
-logger = get_logger(__name__)
-
-
-# This is the version number of the inputs and outputs of the model, and
-# determines compatibility with inference in Barracuda.
-MODEL_FORMAT_VERSION = 2
-
-EPSILON = 1e-6 # Small value to avoid divide by zero
-
-
-class UnityPolicyException(UnityException):
- """
- Related to errors with the Trainer.
- """
-
- pass
-
-
-class TFPolicy(Policy):
- """
- Contains a learning model, and the necessary
- functions to save/load models and create the input placeholders.
- """
-
- # Callback function used at the start of training to synchronize weights.
- # By default, this nothing.
- # If this needs to be used, it should be done from outside ml-agents.
- broadcast_global_variables: Callable[[int], None] = lambda root_rank: None
-
- def __init__(
- self,
- seed: int,
- behavior_spec: BehaviorSpec,
- trainer_settings: TrainerSettings,
- tanh_squash: bool = False,
- reparameterize: bool = False,
- condition_sigma_on_obs: bool = True,
- create_tf_graph: bool = True,
- ):
- """
- Initialized the policy.
- :param seed: Random seed to use for TensorFlow.
- :param brain: The corresponding Brain for this policy.
- :param trainer_settings: The trainer parameters.
- """
- super().__init__(
- seed,
- behavior_spec,
- trainer_settings,
- tanh_squash,
- reparameterize,
- condition_sigma_on_obs,
- )
- if (
- self.behavior_spec.action_spec.continuous_size > 0
- and self.behavior_spec.action_spec.discrete_size > 0
- ):
- raise UnityPolicyException(
- "TensorFlow does not support continuous and discrete actions on the same behavior. "
- "Please run with the Torch framework."
- )
- # for ghost trainer save/load snapshots
- self.assign_phs: List[tf.Tensor] = []
- self.assign_ops: List[tf.Operation] = []
- self.update_dict: Dict[str, tf.Tensor] = {}
- self.inference_dict: Dict[str, tf.Tensor] = {}
- self.first_normalization_update: bool = False
-
- self.graph = tf.Graph()
- self.sess = tf.Session(
- config=tf_utils.generate_session_config(), graph=self.graph
- )
- self._initialize_tensorflow_references()
- self.grads = None
- self.update_batch: Optional[tf.Operation] = None
- self.trainable_variables: List[tf.Variable] = []
- self.rank = get_rank()
- if create_tf_graph:
- self.create_tf_graph()
-
- def get_trainable_variables(self) -> List[tf.Variable]:
- """
- Returns a List of the trainable variables in this policy. if create_tf_graph hasn't been called,
- returns empty list.
- """
- return self.trainable_variables
-
- def create_tf_graph(self) -> None:
- """
- Builds the tensorflow graph needed for this policy.
- """
- with self.graph.as_default():
- tf.set_random_seed(self.seed)
- _vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
- if len(_vars) > 0:
- # We assume the first thing created in the graph is the Policy. If
- # already populated, don't create more tensors.
- return
-
- self.create_input_placeholders()
- encoded = self._create_encoder(
- self.visual_in,
- self.processed_vector_in,
- self.h_size,
- self.num_layers,
- self.vis_encode_type,
- )
- if self.use_continuous_act:
- self._create_cc_actor(
- encoded,
- self.tanh_squash,
- self.reparameterize,
- self.condition_sigma_on_obs,
- )
- else:
- self._create_dc_actor(encoded)
- self.trainable_variables = tf.get_collection(
- tf.GraphKeys.TRAINABLE_VARIABLES, scope="policy"
- )
- self.trainable_variables += tf.get_collection(
- tf.GraphKeys.TRAINABLE_VARIABLES, scope="lstm"
- ) # LSTMs need to be root scope for Barracuda export
-
- self.inference_dict = {
- "action": self.output,
- "log_probs": self.all_log_probs,
- "entropy": self.entropy,
- }
- if self.use_continuous_act:
- self.inference_dict["pre_action"] = self.output_pre
- if self.use_recurrent:
- self.inference_dict["memory_out"] = self.memory_out
-
- # We do an initialize to make the Policy usable out of the box. If an optimizer is needed,
- # it will re-load the full graph
- self.initialize()
- # Create assignment ops for Ghost Trainer
- self.init_load_weights()
-
- def _create_encoder(
- self,
- visual_in: List[tf.Tensor],
- vector_in: tf.Tensor,
- h_size: int,
- num_layers: int,
- vis_encode_type: EncoderType,
- ) -> tf.Tensor:
- """
- Creates an encoder for visual and vector observations.
- :param h_size: Size of hidden linear layers.
- :param num_layers: Number of hidden linear layers.
- :param vis_encode_type: Type of visual encoder to use if visual input.
- :return: The hidden layer (tf.Tensor) after the encoder.
- """
- with tf.variable_scope("policy"):
- encoded = ModelUtils.create_observation_streams(
- self.visual_in,
- self.processed_vector_in,
- 1,
- h_size,
- num_layers,
- vis_encode_type,
- )[0]
- return encoded
-
- @staticmethod
- def _convert_version_string(version_string: str) -> Tuple[int, ...]:
- """
- Converts the version string into a Tuple of ints (major_ver, minor_ver, patch_ver).
- :param version_string: The semantic-versioned version string (X.Y.Z).
- :return: A Tuple containing (major_ver, minor_ver, patch_ver).
- """
- ver = LooseVersion(version_string)
- return tuple(map(int, ver.version[0:3]))
-
- def initialize(self):
- with self.graph.as_default():
- init = tf.global_variables_initializer()
- self.sess.run(init)
-
- def get_weights(self):
- with self.graph.as_default():
- _vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
- values = [v.eval(session=self.sess) for v in _vars]
- return values
-
- def init_load_weights(self):
- with self.graph.as_default():
- _vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
- values = [v.eval(session=self.sess) for v in _vars]
- for var, value in zip(_vars, values):
- assign_ph = tf.placeholder(var.dtype, shape=value.shape)
- self.assign_phs.append(assign_ph)
- self.assign_ops.append(tf.assign(var, assign_ph))
-
- def load_weights(self, values):
- if len(self.assign_ops) == 0:
- logger.warning(
- "Calling load_weights in tf_policy but assign_ops is empty. Did you forget to call init_load_weights?"
- )
- with self.graph.as_default():
- feed_dict = {}
- for assign_ph, value in zip(self.assign_phs, values):
- feed_dict[assign_ph] = value
- self.sess.run(self.assign_ops, feed_dict=feed_dict)
-
- @timed
- def evaluate(
- self, decision_requests: DecisionSteps, global_agent_ids: List[str]
- ) -> Dict[str, Any]:
- """
- Evaluates policy for the agent experiences provided.
- :param decision_requests: DecisionSteps object containing inputs.
- :param global_agent_ids: The global (with worker ID) agent ids of the data in the batched_step_result.
- :return: Outputs from network as defined by self.inference_dict.
- """
- feed_dict = {
- self.batch_size_ph: len(decision_requests),
- self.sequence_length_ph: 1,
- }
- if self.use_recurrent:
- if not self.use_continuous_act:
- feed_dict[self.prev_action] = self.retrieve_previous_action(
- global_agent_ids
- )
-
- feed_dict[self.memory_in] = self.retrieve_memories(global_agent_ids)
- feed_dict = self.fill_eval_dict(feed_dict, decision_requests)
- run_out = self._execute_model(feed_dict, self.inference_dict)
- return run_out
-
- def get_action(
- self, decision_requests: DecisionSteps, worker_id: int = 0
- ) -> ActionInfo:
- """
- Decides actions given observations information, and takes them in environment.
- :param decision_requests: A dictionary of brain names and DecisionSteps from environment.
- :param worker_id: In parallel environment training, the unique id of the environment worker that
- the DecisionSteps came from. Used to construct a globally unique id for each agent.
- :return: an ActionInfo containing action, memories, values and an object
- to be passed to add experiences
- """
- if len(decision_requests) == 0:
- return ActionInfo.empty()
-
- global_agent_ids = [
- get_global_agent_id(worker_id, int(agent_id))
- for agent_id in decision_requests.agent_id
- ] # For 1-D array, the iterator order is correct.
-
- run_out = self.evaluate( # pylint: disable=assignment-from-no-return
- decision_requests, global_agent_ids
- )
-
- self.save_memories(global_agent_ids, run_out.get("memory_out"))
- # For Compatibility with buffer changes for hybrid action support
- if "log_probs" in run_out:
- log_probs_tuple = LogProbsTuple()
- if self.behavior_spec.action_spec.is_continuous():
- log_probs_tuple.add_continuous(run_out["log_probs"])
- else:
- log_probs_tuple.add_discrete(run_out["log_probs"])
- run_out["log_probs"] = log_probs_tuple
- if "action" in run_out:
- action_tuple = ActionTuple()
- env_action_tuple = ActionTuple()
- if self.behavior_spec.action_spec.is_continuous():
- action_tuple.add_continuous(run_out["pre_action"])
- env_action_tuple.add_continuous(run_out["action"])
- else:
- action_tuple.add_discrete(run_out["action"])
- env_action_tuple.add_discrete(run_out["action"])
- run_out["action"] = action_tuple
- run_out["env_action"] = env_action_tuple
- self.check_nan_action(run_out.get("action"))
- return ActionInfo(
- action=run_out.get("action"),
- env_action=run_out.get("env_action"),
- value=run_out.get("value"),
- outputs=run_out,
- agent_ids=decision_requests.agent_id,
- )
-
- def update(self, mini_batch, num_sequences):
- """
- Performs update of the policy.
- :param num_sequences: Number of experience trajectories in batch.
- :param mini_batch: Batch of experiences.
- :return: Results of update.
- """
- raise UnityPolicyException("The update function was not implemented.")
-
- def _execute_model(self, feed_dict, out_dict):
- """
- Executes model.
- :param feed_dict: Input dictionary mapping nodes to input data.
- :param out_dict: Output dictionary mapping names to nodes.
- :return: Dictionary mapping names to input data.
- """
- network_out = self.sess.run(list(out_dict.values()), feed_dict=feed_dict)
- run_out = dict(zip(list(out_dict.keys()), network_out))
- return run_out
-
- def fill_eval_dict(self, feed_dict, batched_step_result):
- vec_vis_obs = SplitObservations.from_observations(batched_step_result.obs)
- for i, _ in enumerate(vec_vis_obs.visual_observations):
- feed_dict[self.visual_in[i]] = vec_vis_obs.visual_observations[i]
- if self.use_vec_obs:
- feed_dict[self.vector_in] = vec_vis_obs.vector_observations
- if not self.use_continuous_act:
- mask = np.ones(
- (
- len(batched_step_result),
- sum(self.behavior_spec.action_spec.discrete_branches),
- ),
- dtype=np.float32,
- )
- if batched_step_result.action_mask is not None:
- mask = 1 - np.concatenate(batched_step_result.action_mask, axis=1)
- feed_dict[self.action_masks] = mask
- return feed_dict
-
- def get_current_step(self):
- """
- Gets current model step.
- :return: current model step.
- """
- step = self.sess.run(self.global_step)
- return step
-
- def set_step(self, step: int) -> int:
- """
- Sets current model step to step without creating additional ops.
- :param step: Step to set the current model step to.
- :return: The step the model was set to.
- """
- current_step = self.get_current_step()
- # Increment a positive or negative number of steps.
- return self.increment_step(step - current_step)
-
- def increment_step(self, n_steps):
- """
- Increments model step.
- """
- out_dict = {
- "global_step": self.global_step,
- "increment_step": self.increment_step_op,
- }
- feed_dict = {self.steps_to_increment: n_steps}
- return self.sess.run(out_dict, feed_dict=feed_dict)["global_step"]
-
- def get_inference_vars(self):
- """
- :return:list of inference var names
- """
- return list(self.inference_dict.keys())
-
- def get_update_vars(self):
- """
- :return:list of update var names
- """
- return list(self.update_dict.keys())
-
- def update_normalization(self, vector_obs: np.ndarray) -> None:
- """
- If this policy normalizes vector observations, this will update the norm values in the graph.
- :param vector_obs: The vector observations to add to the running estimate of the distribution.
- """
- if self.use_vec_obs and self.normalize:
- if self.first_normalization_update:
- self.sess.run(
- self.init_normalization_op, feed_dict={self.vector_in: vector_obs}
- )
- self.first_normalization_update = False
- else:
- self.sess.run(
- self.update_normalization_op, feed_dict={self.vector_in: vector_obs}
- )
-
- @property
- def use_vis_obs(self):
- return self.vis_obs_size > 0
-
- @property
- def use_vec_obs(self):
- return self.vec_obs_size > 0
-
- def _initialize_tensorflow_references(self):
- self.value_heads: Dict[str, tf.Tensor] = {}
- self.normalization_steps: Optional[tf.Variable] = None
- self.running_mean: Optional[tf.Variable] = None
- self.running_variance: Optional[tf.Variable] = None
- self.init_normalization_op: Optional[tf.Operation] = None
- self.update_normalization_op: Optional[tf.Operation] = None
- self.value: Optional[tf.Tensor] = None
- self.all_log_probs: tf.Tensor = None
- self.total_log_probs: Optional[tf.Tensor] = None
- self.entropy: Optional[tf.Tensor] = None
- self.output_pre: Optional[tf.Tensor] = None
- self.output: Optional[tf.Tensor] = None
- self.selected_actions: tf.Tensor = None
- self.action_masks: Optional[tf.Tensor] = None
- self.prev_action: Optional[tf.Tensor] = None
- self.memory_in: Optional[tf.Tensor] = None
- self.memory_out: Optional[tf.Tensor] = None
- self.version_tensors: Optional[Tuple[tf.Tensor, tf.Tensor, tf.Tensor]] = None
-
- def create_input_placeholders(self):
- with self.graph.as_default():
- (
- self.global_step,
- self.increment_step_op,
- self.steps_to_increment,
- ) = ModelUtils.create_global_steps()
- self.vector_in, self.visual_in = ModelUtils.create_input_placeholders(
- self.behavior_spec.observation_shapes
- )
- if self.normalize:
- self.first_normalization_update = True
- normalization_tensors = ModelUtils.create_normalizer(self.vector_in)
- self.update_normalization_op = normalization_tensors.update_op
- self.init_normalization_op = normalization_tensors.init_op
- self.normalization_steps = normalization_tensors.steps
- self.running_mean = normalization_tensors.running_mean
- self.running_variance = normalization_tensors.running_variance
- self.processed_vector_in = ModelUtils.normalize_vector_obs(
- self.vector_in,
- self.running_mean,
- self.running_variance,
- self.normalization_steps,
- )
- else:
- self.processed_vector_in = self.vector_in
- self.update_normalization_op = None
-
- self.batch_size_ph = tf.placeholder(
- shape=None, dtype=tf.int32, name="batch_size"
- )
- self.sequence_length_ph = tf.placeholder(
- shape=None, dtype=tf.int32, name="sequence_length"
- )
- self.mask_input = tf.placeholder(
- shape=[None], dtype=tf.float32, name="masks"
- )
- # Only needed for PPO, but needed for BC module
- self.epsilon = tf.placeholder(
- shape=[None, self.act_size[0]], dtype=tf.float32, name="epsilon"
- )
- self.mask = tf.cast(self.mask_input, tf.int32)
-
- tf.Variable(
- int(self.behavior_spec.action_spec.is_continuous()),
- name="is_continuous_control",
- trainable=False,
- dtype=tf.int32,
- )
- int_version = TFPolicy._convert_version_string(__version__)
- major_ver_t = tf.Variable(
- int_version[0],
- name="trainer_major_version",
- trainable=False,
- dtype=tf.int32,
- )
- minor_ver_t = tf.Variable(
- int_version[1],
- name="trainer_minor_version",
- trainable=False,
- dtype=tf.int32,
- )
- patch_ver_t = tf.Variable(
- int_version[2],
- name="trainer_patch_version",
- trainable=False,
- dtype=tf.int32,
- )
- self.version_tensors = (major_ver_t, minor_ver_t, patch_ver_t)
- tf.Variable(
- MODEL_FORMAT_VERSION,
- name="version_number",
- trainable=False,
- dtype=tf.int32,
- )
- tf.Variable(
- self.m_size, name="memory_size", trainable=False, dtype=tf.int32
- )
- if self.behavior_spec.action_spec.is_continuous():
- tf.Variable(
- self.act_size[0],
- name="action_output_shape",
- trainable=False,
- dtype=tf.int32,
- )
- else:
- tf.Variable(
- sum(self.act_size),
- name="action_output_shape",
- trainable=False,
- dtype=tf.int32,
- )
-
- def _create_cc_actor(
- self,
- encoded: tf.Tensor,
- tanh_squash: bool = False,
- reparameterize: bool = False,
- condition_sigma_on_obs: bool = True,
- ) -> None:
- """
- Creates Continuous control actor-critic model.
- :param h_size: Size of hidden linear layers.
- :param num_layers: Number of hidden linear layers.
- :param vis_encode_type: Type of visual encoder to use if visual input.
- :param tanh_squash: Whether to use a tanh function, or a clipped output.
- :param reparameterize: Whether we are using the resampling trick to update the policy.
- """
- if self.use_recurrent:
- self.memory_in = tf.placeholder(
- shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
- )
- hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
- encoded, self.memory_in, self.sequence_length_ph, name="lstm_policy"
- )
-
- self.memory_out = tf.identity(memory_policy_out, name="recurrent_out")
- else:
- hidden_policy = encoded
-
- with tf.variable_scope("policy"):
- distribution = GaussianDistribution(
- hidden_policy,
- self.act_size,
- reparameterize=reparameterize,
- tanh_squash=tanh_squash,
- condition_sigma=condition_sigma_on_obs,
- )
-
- if tanh_squash:
- self.output_pre = distribution.sample
- self.output = tf.identity(self.output_pre, name="action")
- else:
- self.output_pre = distribution.sample
- # Clip and scale output to ensure actions are always within [-1, 1] range.
- output_post = tf.clip_by_value(self.output_pre, -3, 3) / 3
- self.output = tf.identity(output_post, name="action")
-
- self.selected_actions = tf.stop_gradient(self.output)
-
- self.all_log_probs = tf.identity(distribution.log_probs, name="action_probs")
- self.entropy = distribution.entropy
-
- # We keep these tensors the same name, but use new nodes to keep code parallelism with discrete control.
- self.total_log_probs = distribution.total_log_probs
-
- def _create_dc_actor(self, encoded: tf.Tensor) -> None:
- """
- Creates Discrete control actor-critic model.
- :param h_size: Size of hidden linear layers.
- :param num_layers: Number of hidden linear layers.
- :param vis_encode_type: Type of visual encoder to use if visual input.
- """
- if self.use_recurrent:
- self.prev_action = tf.placeholder(
- shape=[None, len(self.act_size)], dtype=tf.int32, name="prev_action"
- )
- prev_action_oh = tf.concat(
- [
- tf.one_hot(self.prev_action[:, i], self.act_size[i])
- for i in range(len(self.act_size))
- ],
- axis=1,
- )
- hidden_policy = tf.concat([encoded, prev_action_oh], axis=1)
-
- self.memory_in = tf.placeholder(
- shape=[None, self.m_size], dtype=tf.float32, name="recurrent_in"
- )
- hidden_policy, memory_policy_out = ModelUtils.create_recurrent_encoder(
- hidden_policy,
- self.memory_in,
- self.sequence_length_ph,
- name="lstm_policy",
- )
-
- self.memory_out = tf.identity(memory_policy_out, "recurrent_out")
- else:
- hidden_policy = encoded
-
- self.action_masks = tf.placeholder(
- shape=[None, sum(self.act_size)], dtype=tf.float32, name="action_masks"
- )
-
- with tf.variable_scope("policy"):
- distribution = MultiCategoricalDistribution(
- hidden_policy, self.act_size, self.action_masks
- )
- # It's important that we are able to feed_dict a value into this tensor to get the
- # right one-hot encoding, so we can't do identity on it.
- self.output = distribution.sample
- self.all_log_probs = tf.identity(distribution.log_probs, name="action")
- self.selected_actions = tf.stop_gradient(
- distribution.sample_onehot
- ) # In discrete, these are onehot
- self.entropy = distribution.entropy
- self.total_log_probs = distribution.total_log_probs
diff --git a/ml-agents/mlagents/trainers/ppo/optimizer_tf.py b/ml-agents/mlagents/trainers/ppo/optimizer_tf.py
deleted file mode 100644
index 505b0a346b..0000000000
--- a/ml-agents/mlagents/trainers/ppo/optimizer_tf.py
+++ /dev/null
@@ -1,361 +0,0 @@
-from typing import Optional, Any, Dict, cast
-import numpy as np
-from mlagents.tf_utils import tf
-from mlagents_envs.timers import timed
-from mlagents.trainers.tf.models import ModelUtils, EncoderType
-from mlagents.trainers.policy.tf_policy import TFPolicy
-from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
-from mlagents.trainers.buffer import AgentBuffer
-from mlagents.trainers.settings import TrainerSettings, PPOSettings
-
-
-class PPOOptimizer(TFOptimizer):
- def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
- """
- Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy.
- The PPO optimizer has a value estimator and a loss function.
- :param policy: A TFPolicy object that will be updated by this PPO Optimizer.
- :param trainer_params: Trainer parameters dictionary that specifies the properties of the trainer.
- """
- # Create the graph here to give more granular control of the TF graph to the Optimizer.
- policy.create_tf_graph()
-
- with policy.graph.as_default():
- with tf.variable_scope("optimizer/"):
- super().__init__(policy, trainer_params)
- hyperparameters: PPOSettings = cast(
- PPOSettings, trainer_params.hyperparameters
- )
- lr = float(hyperparameters.learning_rate)
- self._schedule = hyperparameters.learning_rate_schedule
- epsilon = float(hyperparameters.epsilon)
- beta = float(hyperparameters.beta)
- max_step = float(trainer_params.max_steps)
-
- policy_network_settings = policy.network_settings
- h_size = int(policy_network_settings.hidden_units)
- num_layers = policy_network_settings.num_layers
- vis_encode_type = policy_network_settings.vis_encode_type
- self.burn_in_ratio = 0.0
-
- self.stream_names = list(self.reward_signals.keys())
-
- self.tf_optimizer_op: Optional[tf.train.Optimizer] = None
- self.grads = None
- self.update_batch: Optional[tf.Operation] = None
-
- self.stats_name_to_update_name = {
- "Losses/Value Loss": "value_loss",
- "Losses/Policy Loss": "policy_loss",
- "Policy/Learning Rate": "learning_rate",
- "Policy/Epsilon": "decay_epsilon",
- "Policy/Beta": "decay_beta",
- }
- if self.policy.use_recurrent:
- self.m_size = self.policy.m_size
- self.memory_in = tf.placeholder(
- shape=[None, self.m_size],
- dtype=tf.float32,
- name="recurrent_value_in",
- )
-
- if num_layers < 1:
- num_layers = 1
- if policy.use_continuous_act:
- self._create_cc_critic(h_size, num_layers, vis_encode_type)
- else:
- self._create_dc_critic(h_size, num_layers, vis_encode_type)
-
- self.learning_rate = ModelUtils.create_schedule(
- self._schedule,
- lr,
- self.policy.global_step,
- int(max_step),
- min_value=1e-10,
- )
- self._create_losses(
- self.policy.total_log_probs,
- self.old_log_probs,
- self.value_heads,
- self.policy.entropy,
- beta,
- epsilon,
- lr,
- max_step,
- )
- self._create_ppo_optimizer_ops()
-
- self.update_dict.update(
- {
- "value_loss": self.value_loss,
- "policy_loss": self.abs_policy_loss,
- "update_batch": self.update_batch,
- "learning_rate": self.learning_rate,
- "decay_epsilon": self.decay_epsilon,
- "decay_beta": self.decay_beta,
- }
- )
-
- def _create_cc_critic(
- self, h_size: int, num_layers: int, vis_encode_type: EncoderType
- ) -> None:
- """
- Creates Continuous control critic (value) network.
- :param h_size: Size of hidden linear layers.
- :param num_layers: Number of hidden linear layers.
- :param vis_encode_type: The type of visual encoder to use.
- """
- hidden_stream = ModelUtils.create_observation_streams(
- self.policy.visual_in,
- self.policy.processed_vector_in,
- 1,
- h_size,
- num_layers,
- vis_encode_type,
- )[0]
-
- if self.policy.use_recurrent:
- hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
- hidden_stream,
- self.memory_in,
- self.policy.sequence_length_ph,
- name="lstm_value",
- )
- self.memory_out = memory_value_out
- else:
- hidden_value = hidden_stream
-
- self.value_heads, self.value = ModelUtils.create_value_heads(
- self.stream_names, hidden_value
- )
- self.all_old_log_probs = tf.placeholder(
- shape=[None, sum(self.policy.act_size)],
- dtype=tf.float32,
- name="old_probabilities",
- )
-
- self.old_log_probs = tf.reduce_sum(
- (tf.identity(self.all_old_log_probs)), axis=1, keepdims=True
- )
-
- def _create_dc_critic(
- self, h_size: int, num_layers: int, vis_encode_type: EncoderType
- ) -> None:
- """
- Creates Discrete control critic (value) network.
- :param h_size: Size of hidden linear layers.
- :param num_layers: Number of hidden linear layers.
- :param vis_encode_type: The type of visual encoder to use.
- """
- hidden_stream = ModelUtils.create_observation_streams(
- self.policy.visual_in,
- self.policy.processed_vector_in,
- 1,
- h_size,
- num_layers,
- vis_encode_type,
- )[0]
-
- if self.policy.use_recurrent:
- hidden_value, memory_value_out = ModelUtils.create_recurrent_encoder(
- hidden_stream,
- self.memory_in,
- self.policy.sequence_length_ph,
- name="lstm_value",
- )
- self.memory_out = memory_value_out
- else:
- hidden_value = hidden_stream
-
- self.value_heads, self.value = ModelUtils.create_value_heads(
- self.stream_names, hidden_value
- )
-
- self.all_old_log_probs = tf.placeholder(
- shape=[None, sum(self.policy.act_size)],
- dtype=tf.float32,
- name="old_probabilities",
- )
-
- # Break old log log_probs into separate branches
- old_log_prob_branches = ModelUtils.break_into_branches(
- self.all_old_log_probs, self.policy.act_size
- )
-
- _, _, old_normalized_logits = ModelUtils.create_discrete_action_masking_layer(
- old_log_prob_branches, self.policy.action_masks, self.policy.act_size
- )
-
- action_idx = [0] + list(np.cumsum(self.policy.act_size))
-
- self.old_log_probs = tf.reduce_sum(
- (
- tf.stack(
- [
- -tf.nn.softmax_cross_entropy_with_logits_v2(
- labels=self.policy.selected_actions[
- :, action_idx[i] : action_idx[i + 1]
- ],
- logits=old_normalized_logits[
- :, action_idx[i] : action_idx[i + 1]
- ],
- )
- for i in range(len(self.policy.act_size))
- ],
- axis=1,
- )
- ),
- axis=1,
- keepdims=True,
- )
-
- def _create_losses(
- self, probs, old_probs, value_heads, entropy, beta, epsilon, lr, max_step
- ):
- """
- Creates training-specific Tensorflow ops for PPO models.
- :param probs: Current policy probabilities
- :param old_probs: Past policy probabilities
- :param value_heads: Value estimate tensors from each value stream
- :param beta: Entropy regularization strength
- :param entropy: Current policy entropy
- :param epsilon: Value for policy-divergence threshold
- :param lr: Learning rate
- :param max_step: Total number of training steps.
- """
- self.returns_holders = {}
- self.old_values = {}
- for name in value_heads.keys():
- returns_holder = tf.placeholder(
- shape=[None], dtype=tf.float32, name=f"{name}_returns"
- )
- old_value = tf.placeholder(
- shape=[None], dtype=tf.float32, name=f"{name}_value_estimate"
- )
- self.returns_holders[name] = returns_holder
- self.old_values[name] = old_value
- self.advantage = tf.placeholder(
- shape=[None], dtype=tf.float32, name="advantages"
- )
- advantage = tf.expand_dims(self.advantage, -1)
-
- self.decay_epsilon = ModelUtils.create_schedule(
- self._schedule, epsilon, self.policy.global_step, max_step, min_value=0.1
- )
- self.decay_beta = ModelUtils.create_schedule(
- self._schedule, beta, self.policy.global_step, max_step, min_value=1e-5
- )
-
- value_losses = []
- for name, head in value_heads.items():
- clipped_value_estimate = self.old_values[name] + tf.clip_by_value(
- tf.reduce_sum(head, axis=1) - self.old_values[name],
- -self.decay_epsilon,
- self.decay_epsilon,
- )
- v_opt_a = tf.squared_difference(
- self.returns_holders[name], tf.reduce_sum(head, axis=1)
- )
- v_opt_b = tf.squared_difference(
- self.returns_holders[name], clipped_value_estimate
- )
- value_loss = tf.reduce_mean(
- tf.dynamic_partition(tf.maximum(v_opt_a, v_opt_b), self.policy.mask, 2)[
- 1
- ]
- )
- value_losses.append(value_loss)
- self.value_loss = tf.reduce_mean(value_losses)
-
- r_theta = tf.exp(probs - old_probs)
- p_opt_a = r_theta * advantage
- p_opt_b = (
- tf.clip_by_value(
- r_theta, 1.0 - self.decay_epsilon, 1.0 + self.decay_epsilon
- )
- * advantage
- )
- self.policy_loss = -tf.reduce_mean(
- tf.dynamic_partition(tf.minimum(p_opt_a, p_opt_b), self.policy.mask, 2)[1]
- )
- # For cleaner stats reporting
- self.abs_policy_loss = tf.abs(self.policy_loss)
-
- self.loss = (
- self.policy_loss
- + 0.5 * self.value_loss
- - self.decay_beta
- * tf.reduce_mean(tf.dynamic_partition(entropy, self.policy.mask, 2)[1])
- )
-
- def _create_ppo_optimizer_ops(self):
- self.tf_optimizer_op = self.create_optimizer_op(self.learning_rate)
- self.grads = self.tf_optimizer_op.compute_gradients(self.loss)
- self.update_batch = self.tf_optimizer_op.minimize(self.loss)
-
- @timed
- def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
- """
- Performs update on model.
- :param batch: Batch of experiences.
- :param num_sequences: Number of sequences to process.
- :return: Results of update.
- """
- feed_dict = self._construct_feed_dict(batch, num_sequences)
- stats_needed = self.stats_name_to_update_name
- update_stats = {}
- # Collect feed dicts for all reward signals.
- for _, reward_signal in self.reward_signals.items():
- feed_dict.update(
- reward_signal.prepare_update(self.policy, batch, num_sequences)
- )
- stats_needed.update(reward_signal.stats_name_to_update_name)
-
- update_vals = self._execute_model(feed_dict, self.update_dict)
- for stat_name, update_name in stats_needed.items():
- update_stats[stat_name] = update_vals[update_name]
- return update_stats
-
- def _construct_feed_dict(
- self, mini_batch: AgentBuffer, num_sequences: int
- ) -> Dict[tf.Tensor, Any]:
- # Do an optional burn-in for memories
- num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
- burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
- burn_in_mask[range(0, num_burn_in)] = 0
- burn_in_mask = np.tile(burn_in_mask, num_sequences)
- feed_dict = {
- self.policy.batch_size_ph: num_sequences,
- self.policy.sequence_length_ph: self.policy.sequence_length,
- self.policy.mask_input: mini_batch["masks"] * burn_in_mask,
- self.advantage: mini_batch["advantages"],
- }
- for name in self.reward_signals:
- feed_dict[self.returns_holders[name]] = mini_batch[f"{name}_returns"]
- feed_dict[self.old_values[name]] = mini_batch[f"{name}_value_estimates"]
-
- if self.policy.use_continuous_act: # For hybrid action buffer support
- feed_dict[self.all_old_log_probs] = mini_batch["continuous_log_probs"]
- feed_dict[self.policy.output_pre] = mini_batch["continuous_action"]
- else:
- feed_dict[self.all_old_log_probs] = mini_batch["discrete_log_probs"]
- feed_dict[self.policy.output] = mini_batch["discrete_action"]
- if self.policy.use_recurrent:
- feed_dict[self.policy.prev_action] = mini_batch["prev_action"]
- feed_dict[self.policy.action_masks] = mini_batch["action_mask"]
- if "vector_obs" in mini_batch:
- feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]
- if self.policy.vis_obs_size > 0:
- for i, _ in enumerate(self.policy.visual_in):
- feed_dict[self.policy.visual_in[i]] = mini_batch["visual_obs%d" % i]
- if self.policy.use_recurrent:
- feed_dict[self.policy.memory_in] = [
- mini_batch["memory"][i]
- for i in range(
- 0, len(mini_batch["memory"]), self.policy.sequence_length
- )
- ]
- feed_dict[self.memory_in] = self._make_zero_mem(
- self.m_size, mini_batch.num_experiences
- )
- return feed_dict
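
The optimizer deleted above implements PPO's clipped surrogate objective together with a clipped value loss and decayed epsilon/beta schedules; after this patch that logic lives only in the Torch-side TorchPPOOptimizer (imported in ppo/trainer.py below). For reviewers, a minimal standalone PyTorch sketch of the same math follows. It is not part of the patch, and the tensor names (log_probs, old_log_probs, advantages, values, old_values, returns, entropy) are illustrative assumptions rather than ML-Agents API.

# Illustrative sketch only: the PPO losses removed above, in plain PyTorch.
import torch

def ppo_losses(log_probs, old_log_probs, advantages, values, old_values,
               returns, entropy, epsilon=0.2, beta=0.005):
    # Probability ratio r_theta = pi_new / pi_old, computed in log space.
    r_theta = torch.exp(log_probs - old_log_probs)
    p_opt_a = r_theta * advantages
    p_opt_b = torch.clamp(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantages
    policy_loss = -torch.mean(torch.min(p_opt_a, p_opt_b))

    # Clipped value loss, mirroring the tf.clip_by_value branch above.
    clipped_values = old_values + torch.clamp(values - old_values, -epsilon, epsilon)
    v_opt_a = (returns - values) ** 2
    v_opt_b = (returns - clipped_values) ** 2
    value_loss = torch.mean(torch.max(v_opt_a, v_opt_b))

    # Total loss: policy loss + 0.5 * value loss - beta * entropy bonus.
    total_loss = policy_loss + 0.5 * value_loss - beta * torch.mean(entropy)
    return total_loss, policy_loss, value_loss
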
diff --git a/ml-agents/mlagents/trainers/ppo/trainer.py b/ml-agents/mlagents/trainers/ppo/trainer.py
index 3e2b913207..1d90d73900 100644
--- a/ml-agents/mlagents/trainers/ppo/trainer.py
+++ b/ml-agents/mlagents/trainers/ppo/trainer.py
@@ -15,19 +15,7 @@
from mlagents.trainers.ppo.optimizer_torch import TorchPPOOptimizer
from mlagents.trainers.trajectory import Trajectory
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
-from mlagents.trainers.settings import TrainerSettings, PPOSettings, FrameworkType
-from mlagents.trainers.torch.components.reward_providers.base_reward_provider import (
- BaseRewardProvider,
-)
-from mlagents import tf_utils
-
-if tf_utils.is_available():
- from mlagents.trainers.policy.tf_policy import TFPolicy
- from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
-else:
- TFPolicy = None # type: ignore
- PPOOptimizer = None # type: ignore
-
+from mlagents.trainers.settings import TrainerSettings, PPOSettings
logger = get_logger(__name__)
@@ -92,31 +80,19 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
for name, v in value_estimates.items():
agent_buffer_trajectory[f"{name}_value_estimates"].extend(v)
- if isinstance(self.optimizer.reward_signals[name], BaseRewardProvider):
- self._stats_reporter.add_stat(
- f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
- np.mean(v),
- )
- else:
- self._stats_reporter.add_stat(
- self.optimizer.reward_signals[name].value_name, np.mean(v)
- )
+ self._stats_reporter.add_stat(
+ f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate",
+ np.mean(v),
+ )
# Evaluate all reward functions
self.collected_rewards["environment"][agent_id] += np.sum(
agent_buffer_trajectory["environment_rewards"]
)
for name, reward_signal in self.optimizer.reward_signals.items():
- # BaseRewardProvider is a PyTorch-based reward signal
- if isinstance(reward_signal, BaseRewardProvider):
- evaluate_result = (
- reward_signal.evaluate(agent_buffer_trajectory)
- * reward_signal.strength
- )
- else: # reward_signal is a TensorFlow-based RewardSignal class
- evaluate_result = reward_signal.evaluate_batch(
- agent_buffer_trajectory
- ).scaled_reward
+ evaluate_result = (
+ reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength
+ )
agent_buffer_trajectory[f"{name}_rewards"].extend(evaluate_result)
# Report the reward signals
self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
@@ -218,28 +194,6 @@ def _update_policy(self):
self._clear_update_buffer()
return True
- def create_tf_policy(
- self,
- parsed_behavior_id: BehaviorIdentifiers,
- behavior_spec: BehaviorSpec,
- create_graph: bool = False,
- ) -> TFPolicy:
- """
- Creates a policy with a Tensorflow backend and PPO hyperparameters
- :param parsed_behavior_id:
- :param behavior_spec: specifications for policy construction
- :param create_graph: whether to create the Tensorflow graph on construction
- :return policy
- """
- policy = TFPolicy(
- self.seed,
- behavior_spec,
- self.trainer_settings,
- condition_sigma_on_obs=False, # Faster training for PPO
- create_tf_graph=create_graph,
- )
- return policy
-
def create_torch_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TorchPolicy:
@@ -258,15 +212,10 @@ def create_torch_policy(
)
return policy
- def create_ppo_optimizer(self) -> PPOOptimizer:
- if self.framework == FrameworkType.PYTORCH:
- return TorchPPOOptimizer( # type: ignore
- cast(TorchPolicy, self.policy), self.trainer_settings # type: ignore
- ) # type: ignore
- else:
- return PPOOptimizer( # type: ignore
- cast(TFPolicy, self.policy), self.trainer_settings # type: ignore
- ) # type: ignore
+ def create_ppo_optimizer(self) -> TorchPPOOptimizer:
+ return TorchPPOOptimizer( # type: ignore
+ cast(TorchPolicy, self.policy), self.trainer_settings # type: ignore
+ ) # type: ignore
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
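
With the TF/Torch branching removed, the trainer treats every reward signal as a Torch reward provider exposing evaluate() and a strength multiplier. A simplified, hedged sketch of the per-trajectory accumulation pattern used above follows; the helper name and the dict-of-lists buffer are assumptions for illustration, not the AgentBuffer API.

# Simplified sketch of the unified reward-signal path used above; illustrative only.
import numpy as np

def accumulate_rewards(reward_signals, trajectory_buffer, collected):
    """reward_signals: dict of name -> provider; trajectory_buffer: dict of lists."""
    for name, provider in reward_signals.items():
        # Scale the raw provider output by its configured strength.
        scaled = provider.evaluate(trajectory_buffer) * provider.strength
        trajectory_buffer.setdefault(f"{name}_rewards", []).extend(scaled)
        collected[name] = collected.get(name, 0.0) + float(np.sum(scaled))
    return collected
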
diff --git a/ml-agents/mlagents/trainers/sac/network.py b/ml-agents/mlagents/trainers/sac/network.py
deleted file mode 100644
index 568de44753..0000000000
--- a/ml-agents/mlagents/trainers/sac/network.py
+++ /dev/null
@@ -1,444 +0,0 @@
-from typing import Dict, Optional
-from mlagents.tf_utils import tf
-from mlagents.trainers.tf.models import ModelUtils
-from mlagents.trainers.settings import EncoderType
-
-LOG_STD_MAX = 2
-LOG_STD_MIN = -20
-EPSILON = 1e-6 # Small value to avoid divide by zero
-DISCRETE_TARGET_ENTROPY_SCALE = 0.2 # Roughly equal to e-greedy 0.05
-CONTINUOUS_TARGET_ENTROPY_SCALE = 1.0 # TODO: Make these an optional hyperparam.
-POLICY_SCOPE = ""
-TARGET_SCOPE = "target_network"
-
-
-class SACNetwork:
- """
- Base class for an SAC network. Implements methods for creating the actor and critic heads.
- """
-
- def __init__(
- self,
- policy=None,
- m_size=None,
- h_size=128,
- normalize=False,
- use_recurrent=False,
- num_layers=2,
- stream_names=None,
- vis_encode_type=EncoderType.SIMPLE,
- ):
- self.normalize = normalize
- self.use_recurrent = use_recurrent
- self.num_layers = num_layers
- self.stream_names = stream_names
- self.h_size = h_size
- self.activ_fn = ModelUtils.swish
-
- self.sequence_length_ph = tf.placeholder(
- shape=None, dtype=tf.int32, name="sac_sequence_length"
- )
-
- self.policy_memory_in: Optional[tf.Tensor] = None
- self.policy_memory_out: Optional[tf.Tensor] = None
- self.value_memory_in: Optional[tf.Tensor] = None
- self.value_memory_out: Optional[tf.Tensor] = None
- self.q1: Optional[tf.Tensor] = None
- self.q2: Optional[tf.Tensor] = None
- self.q1_p: Optional[tf.Tensor] = None
- self.q2_p: Optional[tf.Tensor] = None
- self.q1_memory_in: Optional[tf.Tensor] = None
- self.q2_memory_in: Optional[tf.Tensor] = None
- self.q1_memory_out: Optional[tf.Tensor] = None
- self.q2_memory_out: Optional[tf.Tensor] = None
- self.prev_action: Optional[tf.Tensor] = None
- self.action_masks: Optional[tf.Tensor] = None
- self.external_action_in: Optional[tf.Tensor] = None
- self.log_sigma_sq: Optional[tf.Tensor] = None
- self.entropy: Optional[tf.Tensor] = None
- self.deterministic_output: Optional[tf.Tensor] = None
- self.normalized_logprobs: Optional[tf.Tensor] = None
- self.action_probs: Optional[tf.Tensor] = None
- self.output_oh: Optional[tf.Tensor] = None
- self.output_pre: Optional[tf.Tensor] = None
-
- self.value_vars = None
- self.q_vars = None
- self.critic_vars = None
- self.policy_vars = None
-
- self.q1_heads: Dict[str, tf.Tensor] = None
- self.q2_heads: Dict[str, tf.Tensor] = None
- self.q1_pheads: Dict[str, tf.Tensor] = None
- self.q2_pheads: Dict[str, tf.Tensor] = None
-
- self.policy = policy
-
- def get_vars(self, scope):
- return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
-
- def join_scopes(self, scope_1, scope_2):
- """
- Joins two scopes. Does so safely (i.e., if one of the two scopes doesn't
- exist, no slash is added).
- """
- if not scope_1:
- return scope_2
- if not scope_2:
- return scope_1
- else:
- return "/".join(filter(None, [scope_1, scope_2]))
-
- def create_value_heads(self, stream_names, hidden_input):
- """
- Creates one value estimator head for each reward signal in stream_names.
- Also creates the node corresponding to the mean of all the value heads in self.value.
- self.value_heads is a dictionary of stream name to node containing the value estimator head for that signal.
- :param stream_names: The list of reward signal names
- :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
- of the hidden input.
- """
- self.value_heads = {}
- for name in stream_names:
- value = tf.layers.dense(hidden_input, 1, name=f"{name}_value")
- self.value_heads[name] = value
- self.value = tf.reduce_mean(list(self.value_heads.values()), 0)
-
- def _create_cc_critic(self, hidden_value, scope, create_qs=True):
- """
- Creates just the critic network
- """
- scope = self.join_scopes(scope, "critic")
- self.create_sac_value_head(
- self.stream_names,
- hidden_value,
- self.num_layers,
- self.h_size,
- self.join_scopes(scope, "value"),
- )
- self.external_action_in = tf.placeholder(
- shape=[None, self.policy.act_size[0]],
- dtype=tf.float32,
- name="external_action_in",
- )
- self.value_vars = self.get_vars(self.join_scopes(scope, "value"))
- if create_qs:
- hidden_q = tf.concat([hidden_value, self.external_action_in], axis=-1)
- hidden_qp = tf.concat([hidden_value, self.policy.output], axis=-1)
- self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads(
- self.stream_names,
- hidden_q,
- self.num_layers,
- self.h_size,
- self.join_scopes(scope, "q"),
- )
- self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = self.create_q_heads(
- self.stream_names,
- hidden_qp,
- self.num_layers,
- self.h_size,
- self.join_scopes(scope, "q"),
- reuse=True,
- )
- self.q_vars = self.get_vars(self.join_scopes(scope, "q"))
- self.critic_vars = self.get_vars(scope)
-
- def _create_dc_critic(self, hidden_value, scope, create_qs=True):
- """
- Creates just the critic network
- """
- scope = self.join_scopes(scope, "critic")
- self.create_sac_value_head(
- self.stream_names,
- hidden_value,
- self.num_layers,
- self.h_size,
- self.join_scopes(scope, "value"),
- )
-
- self.value_vars = self.get_vars("/".join([scope, "value"]))
-
- if create_qs:
- self.q1_heads, self.q2_heads, self.q1, self.q2 = self.create_q_heads(
- self.stream_names,
- hidden_value,
- self.num_layers,
- self.h_size,
- self.join_scopes(scope, "q"),
- num_outputs=sum(self.policy.act_size),
- )
- self.q1_pheads, self.q2_pheads, self.q1_p, self.q2_p = self.create_q_heads(
- self.stream_names,
- hidden_value,
- self.num_layers,
- self.h_size,
- self.join_scopes(scope, "q"),
- reuse=True,
- num_outputs=sum(self.policy.act_size),
- )
- self.q_vars = self.get_vars(scope)
- self.critic_vars = self.get_vars(scope)
-
- def create_sac_value_head(
- self, stream_names, hidden_input, num_layers, h_size, scope
- ):
- """
- Creates one value estimator head for each reward signal in stream_names.
- Also creates the node corresponding to the mean of all the value heads in self.value.
- self.value_heads is a dictionary of stream name to node containing the value estimator head for that signal.
- :param stream_names: The list of reward signal names
- :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
- of the hidden input.
- :param num_layers: Number of hidden layers for value network
- :param h_size: size of hidden layers for value network
- :param scope: TF scope for value network.
- """
- with tf.variable_scope(scope):
- value_hidden = ModelUtils.create_vector_observation_encoder(
- hidden_input, h_size, self.activ_fn, num_layers, "encoder", False
- )
- if self.use_recurrent:
- value_hidden, memory_out = ModelUtils.create_recurrent_encoder(
- value_hidden,
- self.value_memory_in,
- self.sequence_length_ph,
- name="lstm_value",
- )
- self.value_memory_out = memory_out
- self.create_value_heads(stream_names, value_hidden)
-
- def create_q_heads(
- self,
- stream_names,
- hidden_input,
- num_layers,
- h_size,
- scope,
- reuse=False,
- num_outputs=1,
- ):
- """
- Creates two q heads for each reward signal in stream_names.
- Also creates q1 and q2, the means across streams of the Q1 and Q2 heads.
- Returns dictionaries of stream name to Q head for each of the two Q networks, along with those means.
- :param stream_names: The list of reward signal names
- :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
- of the hidden input.
- :param num_layers: Number of hidden layers for Q network
- :param h_size: size of hidden layers for Q network
- :param scope: TF scope for Q network.
- :param reuse: Whether or not to reuse variables. Useful for creating Q of policy.
- :param num_outputs: Number of outputs of each Q function. If discrete, equal to number of actions.
- """
- with tf.variable_scope(self.join_scopes(scope, "q1_encoding"), reuse=reuse):
- q1_hidden = ModelUtils.create_vector_observation_encoder(
- hidden_input, h_size, self.activ_fn, num_layers, "q1_encoder", reuse
- )
- if self.use_recurrent:
- q1_hidden, memory_out = ModelUtils.create_recurrent_encoder(
- q1_hidden,
- self.q1_memory_in,
- self.sequence_length_ph,
- name="lstm_q1",
- )
- self.q1_memory_out = memory_out
-
- q1_heads = {}
- for name in stream_names:
- _q1 = tf.layers.dense(q1_hidden, num_outputs, name=f"{name}_q1")
- q1_heads[name] = _q1
-
- q1 = tf.reduce_mean(list(q1_heads.values()), axis=0)
- with tf.variable_scope(self.join_scopes(scope, "q2_encoding"), reuse=reuse):
- q2_hidden = ModelUtils.create_vector_observation_encoder(
- hidden_input, h_size, self.activ_fn, num_layers, "q2_encoder", reuse
- )
- if self.use_recurrent:
- q2_hidden, memory_out = ModelUtils.create_recurrent_encoder(
- q2_hidden,
- self.q2_memory_in,
- self.sequence_length_ph,
- name="lstm_q2",
- )
- self.q2_memory_out = memory_out
-
- q2_heads = {}
- for name in stream_names:
- _q2 = tf.layers.dense(q2_hidden, num_outputs, name=f"{name}_q2")
- q2_heads[name] = _q2
-
- q2 = tf.reduce_mean(list(q2_heads.values()), axis=0)
-
- return q1_heads, q2_heads, q1, q2
-
-
-class SACTargetNetwork(SACNetwork):
- """
- Instantiation for the SAC target network. Only contains a single
- value estimator and is updated from the Policy Network.
- """
-
- def __init__(
- self,
- policy,
- m_size=None,
- h_size=128,
- normalize=False,
- use_recurrent=False,
- num_layers=2,
- stream_names=None,
- vis_encode_type=EncoderType.SIMPLE,
- ):
- super().__init__(
- policy,
- m_size,
- h_size,
- normalize,
- use_recurrent,
- num_layers,
- stream_names,
- vis_encode_type,
- )
- with tf.variable_scope(TARGET_SCOPE):
- self.vector_in, self.visual_in = ModelUtils.create_input_placeholders(
- self.policy.behavior_spec.observation_shapes
- )
- if self.policy.normalize:
- normalization_tensors = ModelUtils.create_normalizer(self.vector_in)
- self.update_normalization_op = normalization_tensors.update_op
- self.normalization_steps = normalization_tensors.steps
- self.running_mean = normalization_tensors.running_mean
- self.running_variance = normalization_tensors.running_variance
- self.processed_vector_in = ModelUtils.normalize_vector_obs(
- self.vector_in,
- self.running_mean,
- self.running_variance,
- self.normalization_steps,
- )
- else:
- self.processed_vector_in = self.vector_in
- self.update_normalization_op = None
-
- if self.policy.use_recurrent:
- self.memory_in = tf.placeholder(
- shape=[None, m_size], dtype=tf.float32, name="target_recurrent_in"
- )
- self.value_memory_in = self.memory_in
- hidden_streams = ModelUtils.create_observation_streams(
- self.visual_in,
- self.processed_vector_in,
- 1,
- self.h_size,
- 0,
- vis_encode_type=vis_encode_type,
- stream_scopes=["critic/value/"],
- )
- if self.policy.use_continuous_act:
- self._create_cc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
- else:
- self._create_dc_critic(hidden_streams[0], TARGET_SCOPE, create_qs=False)
- if self.use_recurrent:
- self.memory_out = tf.concat(
- self.value_memory_out, axis=1
- ) # Needed for Barracuda to work
-
- def copy_normalization(self, mean, variance, steps):
- """
- Copies the mean, variance, and steps into the normalizers of the
- input of this SACNetwork. Used to copy the normalizer from the policy network
- to the target network.
- :param mean: Tensor containing the mean.
- :param variance: Tensor containing the variance.
- :param steps: Tensor containing the number of steps.
- """
- update_mean = tf.assign(self.running_mean, mean)
- update_variance = tf.assign(self.running_variance, variance)
- update_norm_step = tf.assign(self.normalization_steps, steps)
- return tf.group([update_mean, update_variance, update_norm_step])
-
-
-class SACPolicyNetwork(SACNetwork):
- """
- Instantiation for SAC policy network. Contains a dual Q estimator,
- a value estimator, and a reference to the actual policy network.
- """
-
- def __init__(
- self,
- policy,
- m_size=None,
- h_size=128,
- normalize=False,
- use_recurrent=False,
- num_layers=2,
- stream_names=None,
- vis_encode_type=EncoderType.SIMPLE,
- ):
- super().__init__(
- policy,
- m_size,
- h_size,
- normalize,
- use_recurrent,
- num_layers,
- stream_names,
- vis_encode_type,
- )
- if self.policy.use_recurrent:
- self._create_memory_ins(m_size)
-
- hidden_critic = self._create_observation_in(vis_encode_type)
- # Use the sequence length of the policy
- self.sequence_length_ph = self.policy.sequence_length_ph
-
- if self.policy.use_continuous_act:
- self._create_cc_critic(hidden_critic, POLICY_SCOPE)
-
- else:
- self._create_dc_critic(hidden_critic, POLICY_SCOPE)
-
- if self.use_recurrent:
- mem_outs = [self.value_memory_out, self.q1_memory_out, self.q2_memory_out]
- self.memory_out = tf.concat(mem_outs, axis=1)
-
- def _create_memory_ins(self, m_size):
- """
- Creates the memory input placeholders for LSTM.
- :param m_size: the total size of the memory.
- """
- self.memory_in = tf.placeholder(
- shape=[None, m_size * 3], dtype=tf.float32, name="value_recurrent_in"
- )
-
- # Re-break-up for each network
- num_mems = 3
- input_size = self.memory_in.get_shape().as_list()[1]
- mem_ins = []
- for i in range(num_mems):
- _start = input_size // num_mems * i
- _end = input_size // num_mems * (i + 1)
- mem_ins.append(self.memory_in[:, _start:_end])
- self.value_memory_in = mem_ins[0]
- self.q1_memory_in = mem_ins[1]
- self.q2_memory_in = mem_ins[2]
-
- def _create_observation_in(self, vis_encode_type):
- """
- Creates the observation inputs, and a CNN if needed.
- :param vis_encode_type: Type of CNN encoder.
- :return: The hidden critic stream. It is not saved to self since it is used
- once and thrown away.
- """
- with tf.variable_scope(POLICY_SCOPE):
- hidden_streams = ModelUtils.create_observation_streams(
- self.policy.visual_in,
- self.policy.processed_vector_in,
- 1,
- self.h_size,
- 0,
- vis_encode_type=vis_encode_type,
- stream_scopes=["critic/value/"],
- )
- hidden_critic = hidden_streams[0]
- return hidden_critic
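
The deleted sac/network.py built one value head per reward stream plus twin Q heads (Q1/Q2) on top of a shared encoder, concatenating the action for continuous control. A minimal PyTorch sketch of that continuous-action structure follows; it is illustrative only, and the class name, layer sizes, and the use of SiLU in place of the swish activation are assumptions.

# Illustrative PyTorch counterpart of the deleted twin-Q/value critic (continuous actions).
import torch
from torch import nn

class TwinQValueCritic(nn.Module):
    def __init__(self, obs_size, act_size, stream_names, h_size=128):
        super().__init__()
        # Shared observation encoder (SiLU is the same function as swish).
        self.encoder = nn.Sequential(nn.Linear(obs_size, h_size), nn.SiLU())
        # One value head per reward stream, as in create_value_heads above.
        self.value_heads = nn.ModuleDict(
            {name: nn.Linear(h_size, 1) for name in stream_names}
        )
        # Twin Q heads take the encoded observation concatenated with the action.
        self.q1_heads = nn.ModuleDict(
            {name: nn.Linear(h_size + act_size, 1) for name in stream_names}
        )
        self.q2_heads = nn.ModuleDict(
            {name: nn.Linear(h_size + act_size, 1) for name in stream_names}
        )

    def forward(self, obs, action):
        hidden = self.encoder(obs)
        values = {name: head(hidden) for name, head in self.value_heads.items()}
        q_input = torch.cat([hidden, action], dim=-1)
        q1 = {name: head(q_input) for name, head in self.q1_heads.items()}
        q2 = {name: head(q_input) for name, head in self.q2_heads.items()}
        return values, q1, q2
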
diff --git a/ml-agents/mlagents/trainers/sac/optimizer_tf.py b/ml-agents/mlagents/trainers/sac/optimizer_tf.py
deleted file mode 100644
index e9d341193e..0000000000
--- a/ml-agents/mlagents/trainers/sac/optimizer_tf.py
+++ /dev/null
@@ -1,641 +0,0 @@
-import numpy as np
-from typing import Dict, List, Optional, Any, Mapping, cast
-
-from mlagents.tf_utils import tf
-
-from mlagents_envs.logging_util import get_logger
-from mlagents.trainers.sac.network import SACPolicyNetwork, SACTargetNetwork
-from mlagents.trainers.tf.models import ModelUtils
-from mlagents.trainers.optimizer.tf_optimizer import TFOptimizer
-from mlagents.trainers.policy.tf_policy import TFPolicy
-from mlagents.trainers.buffer import AgentBuffer
-from mlagents_envs.timers import timed
-from mlagents.trainers.settings import TrainerSettings, SACSettings
-
-EPSILON = 1e-6 # Small value to avoid divide by zero
-
-logger = get_logger(__name__)
-
-POLICY_SCOPE = ""
-TARGET_SCOPE = "target_network"
-
-
-class SACOptimizer(TFOptimizer):
- def __init__(self, policy: TFPolicy, trainer_params: TrainerSettings):
- """
- Takes a TFPolicy and model-specific hyper-parameters and builds the
- SAC optimizer for that policy.
- :param policy: The TFPolicy to be updated by this optimizer.
- :param trainer_params: Trainer settings. Relevant SAC hyperparameters include the
- learning rate and its schedule, hidden layer size, number of hidden layers,
- the initial entropy coefficient (init_entcoef; set lower to learn faster, higher
- to explore more), the soft-Q update strength (tau), memory size (m_size), whether
- to normalize vector observations and use an LSTM, and the total number of
- training steps (max_step).
- """
- # Create the graph here to give more granular control of the TF graph to the Optimizer.
- policy.create_tf_graph()
-
- with policy.graph.as_default():
- with tf.variable_scope(""):
- super().__init__(policy, trainer_params)
- hyperparameters: SACSettings = cast(
- SACSettings, trainer_params.hyperparameters
- )
- lr = hyperparameters.learning_rate
- lr_schedule = hyperparameters.learning_rate_schedule
- max_step = trainer_params.max_steps
- self.tau = hyperparameters.tau
- self.init_entcoef = hyperparameters.init_entcoef
-
- self.policy = policy
- self.act_size = policy.act_size
- policy_network_settings = policy.network_settings
- h_size = policy_network_settings.hidden_units
- num_layers = policy_network_settings.num_layers
- vis_encode_type = policy_network_settings.vis_encode_type
-
- self.tau = hyperparameters.tau
- self.burn_in_ratio = 0.0
-
- # Non-exposed SAC parameters
- self.discrete_target_entropy_scale = (
- 0.2 # Roughly equal to e-greedy 0.05
- )
- self.continuous_target_entropy_scale = 1.0
-
- stream_names = list(self.reward_signals.keys())
- # Used to reduce the "survivor bonus" when using Curiosity or GAIL.
- self.gammas = [
- _val.gamma for _val in trainer_params.reward_signals.values()
- ]
- self.use_dones_in_backup = {
- name: tf.Variable(1.0) for name in stream_names
- }
- self.disable_use_dones = {
- name: self.use_dones_in_backup[name].assign(0.0)
- for name in stream_names
- }
-
- if num_layers < 1:
- num_layers = 1
-
- self.target_init_op: List[tf.Tensor] = []
- self.target_update_op: List[tf.Tensor] = []
- self.update_batch_policy: Optional[tf.Operation] = None
- self.update_batch_value: Optional[tf.Operation] = None
- self.update_batch_entropy: Optional[tf.Operation] = None
-
- self.policy_network = SACPolicyNetwork(
- policy=self.policy,
- m_size=self.policy.m_size, # 3x policy.m_size
- h_size=h_size,
- normalize=self.policy.normalize,
- use_recurrent=self.policy.use_recurrent,
- num_layers=num_layers,
- stream_names=stream_names,
- vis_encode_type=vis_encode_type,
- )
- self.target_network = SACTargetNetwork(
- policy=self.policy,
- m_size=self.policy.m_size, # 1x policy.m_size
- h_size=h_size,
- normalize=self.policy.normalize,
- use_recurrent=self.policy.use_recurrent,
- num_layers=num_layers,
- stream_names=stream_names,
- vis_encode_type=vis_encode_type,
- )
- # The optimizer's m_size is 3 times the policy (Q1, Q2, and Value)
- self.m_size = 3 * self.policy.m_size
- self._create_inputs_and_outputs()
- self.learning_rate = ModelUtils.create_schedule(
- lr_schedule,
- lr,
- self.policy.global_step,
- int(max_step),
- min_value=1e-10,
- )
- self._create_losses(
- self.policy_network.q1_heads,
- self.policy_network.q2_heads,
- lr,
- int(max_step),
- stream_names,
- discrete=not self.policy.use_continuous_act,
- )
- self._create_sac_optimizer_ops()
-
- self.selected_actions = (
- self.policy.selected_actions
- ) # For GAIL and other reward signals
- if self.policy.normalize:
- target_update_norm = self.target_network.copy_normalization(
- self.policy.running_mean,
- self.policy.running_variance,
- self.policy.normalization_steps,
- )
- # Update the normalization of the optimizer when the policy does.
- self.policy.update_normalization_op = tf.group(
- [self.policy.update_normalization_op, target_update_norm]
- )
-
- self.stats_name_to_update_name = {
- "Losses/Value Loss": "value_loss",
- "Losses/Policy Loss": "policy_loss",
- "Losses/Q1 Loss": "q1_loss",
- "Losses/Q2 Loss": "q2_loss",
- "Policy/Entropy Coeff": "entropy_coef",
- "Policy/Learning Rate": "learning_rate",
- }
-
- self.update_dict = {
- "value_loss": self.total_value_loss,
- "policy_loss": self.policy_loss,
- "q1_loss": self.q1_loss,
- "q2_loss": self.q2_loss,
- "entropy_coef": self.ent_coef,
- "update_batch": self.update_batch_policy,
- "update_value": self.update_batch_value,
- "update_entropy": self.update_batch_entropy,
- "learning_rate": self.learning_rate,
- }
-
- def _create_inputs_and_outputs(self) -> None:
- """
- Assign the higher-level SACModel's inputs and outputs to those of its policy or
- target network.
- """
- self.vector_in = self.policy.vector_in
- self.visual_in = self.policy.visual_in
- self.next_vector_in = self.target_network.vector_in
- self.next_visual_in = self.target_network.visual_in
- self.sequence_length_ph = self.policy.sequence_length_ph
- self.next_sequence_length_ph = self.target_network.sequence_length_ph
- if not self.policy.use_continuous_act:
- self.action_masks = self.policy_network.action_masks
- else:
- self.output_pre = self.policy_network.output_pre
-
- # Don't use value estimate during inference.
- self.value = tf.identity(
- self.policy_network.value, name="value_estimate_unused"
- )
- self.value_heads = self.policy_network.value_heads
- self.dones_holder = tf.placeholder(
- shape=[None], dtype=tf.float32, name="dones_holder"
- )
-
- if self.policy.use_recurrent:
- self.memory_in = self.policy_network.memory_in
- self.memory_out = self.policy_network.memory_out
- if not self.policy.use_continuous_act:
- self.prev_action = self.policy_network.prev_action
- self.next_memory_in = self.target_network.memory_in
-
- def _create_losses(
- self,
- q1_streams: Dict[str, tf.Tensor],
- q2_streams: Dict[str, tf.Tensor],
- lr: tf.Tensor,
- max_step: int,
- stream_names: List[str],
- discrete: bool = False,
- ) -> None:
- """
- Creates training-specific Tensorflow ops for SAC models.
- :param q1_streams: Q1 streams from policy network
- :param q2_streams: Q2 streams from policy network
- :param lr: Learning rate
- :param max_step: Total number of training steps.
- :param stream_names: List of reward stream names.
- :param discrete: Whether or not to use discrete action losses.
- """
-
- if discrete:
- self.target_entropy = [
- self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
- for i in self.act_size
- ]
- discrete_action_probs = tf.exp(self.policy.all_log_probs)
- per_action_entropy = discrete_action_probs * self.policy.all_log_probs
- else:
- self.target_entropy = (
- -1
- * self.continuous_target_entropy_scale
- * np.prod(self.act_size[0]).astype(np.float32)
- )
-
- self.rewards_holders = {}
- self.min_policy_qs = {}
-
- for name in stream_names:
- if discrete:
- _branched_mpq1 = ModelUtils.break_into_branches(
- self.policy_network.q1_pheads[name] * discrete_action_probs,
- self.act_size,
- )
- branched_mpq1 = tf.stack(
- [
- tf.reduce_sum(_br, axis=1, keep_dims=True)
- for _br in _branched_mpq1
- ]
- )
- _q1_p_mean = tf.reduce_mean(branched_mpq1, axis=0)
-
- _branched_mpq2 = ModelUtils.break_into_branches(
- self.policy_network.q2_pheads[name] * discrete_action_probs,
- self.act_size,
- )
- branched_mpq2 = tf.stack(
- [
- tf.reduce_sum(_br, axis=1, keep_dims=True)
- for _br in _branched_mpq2
- ]
- )
- _q2_p_mean = tf.reduce_mean(branched_mpq2, axis=0)
-
- self.min_policy_qs[name] = tf.minimum(_q1_p_mean, _q2_p_mean)
- else:
- self.min_policy_qs[name] = tf.minimum(
- self.policy_network.q1_pheads[name],
- self.policy_network.q2_pheads[name],
- )
-
- rewards_holder = tf.placeholder(
- shape=[None], dtype=tf.float32, name=f"{name}_rewards"
- )
- self.rewards_holders[name] = rewards_holder
-
- q1_losses = []
- q2_losses = []
- # Multiple q losses per stream
- expanded_dones = tf.expand_dims(self.dones_holder, axis=-1)
- for i, name in enumerate(stream_names):
- _expanded_rewards = tf.expand_dims(self.rewards_holders[name], axis=-1)
-
- q_backup = tf.stop_gradient(
- _expanded_rewards
- + (1.0 - self.use_dones_in_backup[name] * expanded_dones)
- * self.gammas[i]
- * self.target_network.value_heads[name]
- )
-
- if discrete:
- # We need to break up the Q functions by branch, and update them individually.
- branched_q1_stream = ModelUtils.break_into_branches(
- self.policy.selected_actions * q1_streams[name], self.act_size
- )
- branched_q2_stream = ModelUtils.break_into_branches(
- self.policy.selected_actions * q2_streams[name], self.act_size
- )
-
- # Reduce each branch into scalar
- branched_q1_stream = [
- tf.reduce_sum(_branch, axis=1, keep_dims=True)
- for _branch in branched_q1_stream
- ]
- branched_q2_stream = [
- tf.reduce_sum(_branch, axis=1, keep_dims=True)
- for _branch in branched_q2_stream
- ]
-
- q1_stream = tf.reduce_mean(branched_q1_stream, axis=0)
- q2_stream = tf.reduce_mean(branched_q2_stream, axis=0)
-
- else:
- q1_stream = q1_streams[name]
- q2_stream = q2_streams[name]
-
- _q1_loss = 0.5 * tf.reduce_mean(
- tf.to_float(self.policy.mask)
- * tf.squared_difference(q_backup, q1_stream)
- )
-
- _q2_loss = 0.5 * tf.reduce_mean(
- tf.to_float(self.policy.mask)
- * tf.squared_difference(q_backup, q2_stream)
- )
-
- q1_losses.append(_q1_loss)
- q2_losses.append(_q2_loss)
-
- self.q1_loss = tf.reduce_mean(q1_losses)
- self.q2_loss = tf.reduce_mean(q2_losses)
-
- # Learn entropy coefficient
- if discrete:
- # Create a log_ent_coef for each branch
- self.log_ent_coef = tf.get_variable(
- "log_ent_coef",
- dtype=tf.float32,
- initializer=np.log([self.init_entcoef] * len(self.act_size)).astype(
- np.float32
- ),
- trainable=True,
- )
- else:
- self.log_ent_coef = tf.get_variable(
- "log_ent_coef",
- dtype=tf.float32,
- initializer=np.log(self.init_entcoef).astype(np.float32),
- trainable=True,
- )
-
- self.ent_coef = tf.exp(self.log_ent_coef)
- if discrete:
- # We also have to do a different entropy and target_entropy per branch.
- branched_per_action_ent = ModelUtils.break_into_branches(
- per_action_entropy, self.act_size
- )
- branched_ent_sums = tf.stack(
- [
- tf.reduce_sum(_lp, axis=1, keep_dims=True) + _te
- for _lp, _te in zip(branched_per_action_ent, self.target_entropy)
- ],
- axis=1,
- )
- self.entropy_loss = -tf.reduce_mean(
- tf.to_float(self.policy.mask)
- * tf.reduce_mean(
- self.log_ent_coef
- * tf.squeeze(tf.stop_gradient(branched_ent_sums), axis=2),
- axis=1,
- )
- )
-
- # Same with policy loss, we have to do the loss per branch and average them,
- # so that larger branches don't get more weight.
- # The equivalent KL divergence from Eq 10 of Haarnoja et al. is also pi*log(pi) - Q
- branched_q_term = ModelUtils.break_into_branches(
- discrete_action_probs * self.policy_network.q1_p, self.act_size
- )
-
- branched_policy_loss = tf.stack(
- [
- tf.reduce_sum(self.ent_coef[i] * _lp - _qt, axis=1, keep_dims=True)
- for i, (_lp, _qt) in enumerate(
- zip(branched_per_action_ent, branched_q_term)
- )
- ]
- )
- self.policy_loss = tf.reduce_mean(
- tf.to_float(self.policy.mask) * tf.squeeze(branched_policy_loss)
- )
-
- # Do vbackup entropy bonus per branch as well.
- branched_ent_bonus = tf.stack(
- [
- tf.reduce_sum(self.ent_coef[i] * _lp, axis=1, keep_dims=True)
- for i, _lp in enumerate(branched_per_action_ent)
- ]
- )
- value_losses = []
- for name in stream_names:
- v_backup = tf.stop_gradient(
- self.min_policy_qs[name]
- - tf.reduce_mean(branched_ent_bonus, axis=0)
- )
- value_losses.append(
- 0.5
- * tf.reduce_mean(
- tf.to_float(self.policy.mask)
- * tf.squared_difference(
- self.policy_network.value_heads[name], v_backup
- )
- )
- )
-
- else:
- self.entropy_loss = -tf.reduce_mean(
- self.log_ent_coef
- * tf.to_float(self.policy.mask)
- * tf.stop_gradient(
- tf.reduce_sum(
- self.policy.all_log_probs + self.target_entropy,
- axis=1,
- keep_dims=True,
- )
- )
- )
- batch_policy_loss = tf.reduce_mean(
- self.ent_coef * self.policy.all_log_probs - self.policy_network.q1_p,
- axis=1,
- )
- self.policy_loss = tf.reduce_mean(
- tf.to_float(self.policy.mask) * batch_policy_loss
- )
-
- value_losses = []
- for name in stream_names:
- v_backup = tf.stop_gradient(
- self.min_policy_qs[name]
- - tf.reduce_sum(self.ent_coef * self.policy.all_log_probs, axis=1)
- )
- value_losses.append(
- 0.5
- * tf.reduce_mean(
- tf.to_float(self.policy.mask)
- * tf.squared_difference(
- self.policy_network.value_heads[name], v_backup
- )
- )
- )
- self.value_loss = tf.reduce_mean(value_losses)
-
- self.total_value_loss = self.q1_loss + self.q2_loss + self.value_loss
-
- self.entropy = self.policy_network.entropy
-
- def _create_sac_optimizer_ops(self) -> None:
- """
- Creates the Adam optimizers and update ops for SAC, including
- the policy, value, and entropy updates, as well as the target network update.
- """
- policy_optimizer = self.create_optimizer_op(
- learning_rate=self.learning_rate, name="sac_policy_opt"
- )
- entropy_optimizer = self.create_optimizer_op(
- learning_rate=self.learning_rate, name="sac_entropy_opt"
- )
- value_optimizer = self.create_optimizer_op(
- learning_rate=self.learning_rate, name="sac_value_opt"
- )
-
- self.target_update_op = [
- tf.assign(target, (1 - self.tau) * target + self.tau * source)
- for target, source in zip(
- self.target_network.value_vars, self.policy_network.value_vars
- )
- ]
- logger.debug("value_vars")
- self.print_all_vars(self.policy_network.value_vars)
- logger.debug("targvalue_vars")
- self.print_all_vars(self.target_network.value_vars)
- logger.debug("critic_vars")
- self.print_all_vars(self.policy_network.critic_vars)
- logger.debug("q_vars")
- self.print_all_vars(self.policy_network.q_vars)
- logger.debug("policy_vars")
- policy_vars = self.policy.get_trainable_variables()
- self.print_all_vars(policy_vars)
-
- self.target_init_op = [
- tf.assign(target, source)
- for target, source in zip(
- self.target_network.value_vars, self.policy_network.value_vars
- )
- ]
-
- self.update_batch_policy = policy_optimizer.minimize(
- self.policy_loss, var_list=policy_vars
- )
-
- # Make sure policy is updated first, then value, then entropy.
- with tf.control_dependencies([self.update_batch_policy]):
- self.update_batch_value = value_optimizer.minimize(
- self.total_value_loss, var_list=self.policy_network.critic_vars
- )
- # Add entropy coefficient optimization operation
- with tf.control_dependencies([self.update_batch_value]):
- self.update_batch_entropy = entropy_optimizer.minimize(
- self.entropy_loss, var_list=self.log_ent_coef
- )
-
- def print_all_vars(self, variables):
- for _var in variables:
- logger.debug(_var)
-
- @timed
- def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
- """
- Updates model using buffer.
- :param num_sequences: Number of trajectories in batch.
- :param batch: Experience mini-batch.
- :return: Output from update process.
- """
- feed_dict = self._construct_feed_dict(self.policy, batch, num_sequences)
- stats_needed = self.stats_name_to_update_name
- update_stats: Dict[str, float] = {}
- update_vals = self._execute_model(feed_dict, self.update_dict)
- for stat_name, update_name in stats_needed.items():
- update_stats[stat_name] = update_vals[update_name]
- # Update target network. By default, target update happens at every policy update.
- self.sess.run(self.target_update_op)
- return update_stats
-
- def update_reward_signals(
- self, reward_signal_minibatches: Mapping[str, AgentBuffer], num_sequences: int
- ) -> Dict[str, float]:
- """
- Only update the reward signals.
- :param reward_signal_batches: Minibatches to use for updating the reward signals,
- indexed by name. If none, don't update the reward signals.
- """
- # Collect feed dicts for all reward signals.
- feed_dict: Dict[tf.Tensor, Any] = {}
- update_dict: Dict[str, tf.Tensor] = {}
- update_stats: Dict[str, float] = {}
- stats_needed: Dict[str, str] = {}
- if reward_signal_minibatches:
- self.add_reward_signal_dicts(
- feed_dict,
- update_dict,
- stats_needed,
- reward_signal_minibatches,
- num_sequences,
- )
- update_vals = self._execute_model(feed_dict, update_dict)
- for stat_name, update_name in stats_needed.items():
- update_stats[stat_name] = update_vals[update_name]
- return update_stats
-
- def add_reward_signal_dicts(
- self,
- feed_dict: Dict[tf.Tensor, Any],
- update_dict: Dict[str, tf.Tensor],
- stats_needed: Dict[str, str],
- reward_signal_minibatches: Mapping[str, AgentBuffer],
- num_sequences: int,
- ) -> None:
- """
- Adds the items needed for reward signal updates to the feed_dict and stats_needed dict.
- :param feed_dict: Feed dict that needs updating.
- :param update_dict: Update dict that needs updating.
- :param stats_needed: Stats needed to get from the update.
- :param reward_signal_minibatches: Minibatches to use for updating the reward signals,
- indexed by name.
- """
- for name, r_batch in reward_signal_minibatches.items():
- feed_dict.update(
- self.reward_signals[name].prepare_update(
- self.policy, r_batch, num_sequences
- )
- )
- update_dict.update(self.reward_signals[name].update_dict)
- stats_needed.update(self.reward_signals[name].stats_name_to_update_name)
-
- def _construct_feed_dict(
- self, policy: TFPolicy, batch: AgentBuffer, num_sequences: int
- ) -> Dict[tf.Tensor, Any]:
- """
- Builds the feed dict for updating the SAC model.
- :param policy: The policy used to build the feed dict. May be different when, e.g. using multi-GPU.
- :param batch: Mini-batch to use to update.
- :param num_sequences: Number of LSTM sequences in batch.
- """
- # Do an optional burn-in for memories
- num_burn_in = int(self.burn_in_ratio * self.policy.sequence_length)
- burn_in_mask = np.ones((self.policy.sequence_length), dtype=np.float32)
- burn_in_mask[range(0, num_burn_in)] = 0
- burn_in_mask = np.tile(burn_in_mask, num_sequences)
- feed_dict = {
- policy.batch_size_ph: num_sequences,
- policy.sequence_length_ph: self.policy.sequence_length,
- self.next_sequence_length_ph: self.policy.sequence_length,
- self.policy.mask_input: batch["masks"] * burn_in_mask,
- }
- for name in self.reward_signals:
- feed_dict[self.rewards_holders[name]] = batch[f"{name}_rewards"]
-
- if self.policy.use_continuous_act:
- feed_dict[self.policy_network.external_action_in] = batch[
- "continuous_action"
- ]
- else:
- feed_dict[policy.output] = batch["discrete_action"]
- if self.policy.use_recurrent:
- feed_dict[policy.prev_action] = batch["prev_action"]
- feed_dict[policy.action_masks] = batch["action_mask"]
- if self.policy.use_vec_obs:
- feed_dict[policy.vector_in] = batch["vector_obs"]
- feed_dict[self.next_vector_in] = batch["next_vector_in"]
- if self.policy.vis_obs_size > 0:
- for i, _ in enumerate(policy.visual_in):
- _obs = batch["visual_obs%d" % i]
- feed_dict[policy.visual_in[i]] = _obs
- for i, _ in enumerate(self.next_visual_in):
- _obs = batch["next_visual_obs%d" % i]
- feed_dict[self.next_visual_in[i]] = _obs
- if self.policy.use_recurrent:
- feed_dict[policy.memory_in] = [
- batch["memory"][i]
- for i in range(0, len(batch["memory"]), self.policy.sequence_length)
- ]
- feed_dict[self.policy_network.memory_in] = self._make_zero_mem(
- self.m_size, batch.num_experiences
- )
- feed_dict[self.target_network.memory_in] = self._make_zero_mem(
- self.m_size // 3, batch.num_experiences
- )
- feed_dict[self.dones_holder] = batch["done"]
- return feed_dict
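
The deleted SAC optimizer chains policy, value, and entropy-coefficient updates and refreshes the target network with a Polyak (soft) update controlled by tau, learning the entropy coefficient through log_ent_coef. A hedged PyTorch sketch of those two pieces follows; the function and parameter names are assumptions, not the Torch optimizer's API.

# Illustrative sketch of the soft target update and entropy-coefficient loss above.
import torch

@torch.no_grad()
def soft_update(target_net, source_net, tau):
    # target <- (1 - tau) * target + tau * source, as in target_update_op above.
    for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
        t_param.mul_(1.0 - tau).add_(tau * s_param)

def entropy_coef_loss(log_ent_coef, log_probs, target_entropy):
    # Continuous-action form of the entropy loss above: adjust the coefficient so the
    # policy entropy is pushed toward the target entropy; the policy term is detached.
    return -(log_ent_coef * (log_probs + target_entropy).detach()).mean()
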
diff --git a/ml-agents/mlagents/trainers/sac/trainer.py b/ml-agents/mlagents/trainers/sac/trainer.py
index c9e43b9443..ad1b1461f4 100644
--- a/ml-agents/mlagents/trainers/sac/trainer.py
+++ b/ml-agents/mlagents/trainers/sac/trainer.py
@@ -18,16 +18,7 @@
from mlagents.trainers.sac.optimizer_torch import TorchSACOptimizer
from mlagents.trainers.trajectory import Trajectory, SplitObservations
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
-from mlagents.trainers.settings import TrainerSettings, SACSettings, FrameworkType
-from mlagents.trainers.torch.components.reward_providers import BaseRewardProvider
-from mlagents import tf_utils
-
-if tf_utils.is_available():
- from mlagents.trainers.policy.tf_policy import TFPolicy
- from mlagents.trainers.sac.optimizer_tf import SACOptimizer
-else:
- TFPolicy = None # type: ignore
- SACOptimizer = None # type: ignore
+from mlagents.trainers.settings import TrainerSettings, SACSettings
logger = get_logger(__name__)
@@ -149,16 +140,9 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
agent_buffer_trajectory["environment_rewards"]
)
for name, reward_signal in self.optimizer.reward_signals.items():
- # BaseRewardProvider is a PyTorch-based reward signal
- if isinstance(reward_signal, BaseRewardProvider):
- evaluate_result = (
- reward_signal.evaluate(agent_buffer_trajectory)
- * reward_signal.strength
- )
- else: # reward_signal uses TensorFlow
- evaluate_result = reward_signal.evaluate_batch(
- agent_buffer_trajectory
- ).scaled_reward
+ evaluate_result = (
+ reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength
+ )
# Report the reward signals
self.collected_rewards[name][agent_id] += np.sum(evaluate_result)
@@ -168,16 +152,10 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
agent_buffer_trajectory, trajectory.next_obs, trajectory.done_reached
)
for name, v in value_estimates.items():
- # BaseRewardProvider is a PyTorch-based reward signal
- if isinstance(self.optimizer.reward_signals[name], BaseRewardProvider):
- self._stats_reporter.add_stat(
- f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value",
- np.mean(v),
- )
- else: # TensorFlow reward signal
- self._stats_reporter.add_stat(
- self.optimizer.reward_signals[name].value_name, np.mean(v)
- )
+ self._stats_reporter.add_stat(
+ f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value",
+ np.mean(v),
+ )
# Bootstrap using the last step rather than the bootstrap step if max step is reached.
# Set last element to duplicate obs and remove dones.
@@ -236,30 +214,6 @@ def maybe_load_replay_buffer(self):
)
)
- def create_tf_policy(
- self,
- parsed_behavior_id: BehaviorIdentifiers,
- behavior_spec: BehaviorSpec,
- create_graph: bool = False,
- ) -> TFPolicy:
- """
- Creates a policy with a Tensorflow backend and SAC hyperparameters
- :param parsed_behavior_id:
- :param behavior_spec: specifications for policy construction
- :param create_graph: whether to create the Tensorflow graph on construction
- :return policy
- """
- policy = TFPolicy(
- self.seed,
- behavior_spec,
- self.trainer_settings,
- tanh_squash=True,
- reparameterize=True,
- create_tf_graph=create_graph,
- )
- self.maybe_load_replay_buffer()
- return policy
-
def create_torch_policy(
self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
) -> TorchPolicy:
@@ -304,15 +258,9 @@ def _update_sac_policy(self) -> bool:
)
# Get rewards for each reward
for name, signal in self.optimizer.reward_signals.items():
- # BaseRewardProvider is a PyTorch-based reward signal
- if isinstance(signal, BaseRewardProvider):
- sampled_minibatch[f"{name}_rewards"] = (
- signal.evaluate(sampled_minibatch) * signal.strength
- )
- else: # reward_signal is a TensorFlow-based RewardSignal class
- sampled_minibatch[f"{name}_rewards"] = signal.evaluate_batch(
- sampled_minibatch
- ).scaled_reward
+ sampled_minibatch[f"{name}_rewards"] = (
+ signal.evaluate(sampled_minibatch) * signal.strength
+ )
update_stats = self.optimizer.update(sampled_minibatch, n_sequences)
for stat_name, value in update_stats.items():
@@ -357,22 +305,13 @@ def _update_reward_signals(self) -> None:
) / self.reward_signal_update_steps > self.reward_signal_steps_per_update:
# Get minibatches for reward signal update if needed
reward_signal_minibatches = {}
- for name, signal in self.optimizer.reward_signals.items():
+ for name in self.optimizer.reward_signals.keys():
logger.debug(f"Updating {name} at step {self.step}")
- # BaseRewardProvider is a PyTorch-based reward signal
- if not isinstance(signal, BaseRewardProvider):
- # Some signals don't need a minibatch to be sampled - so we don't!
- if signal.update_dict:
- reward_signal_minibatches[name] = buffer.sample_mini_batch(
- self.hyperparameters.batch_size,
- sequence_length=self.policy.sequence_length,
- )
- else: # TensorFlow reward signal
- if name != "extrinsic":
- reward_signal_minibatches[name] = buffer.sample_mini_batch(
- self.hyperparameters.batch_size,
- sequence_length=self.policy.sequence_length,
- )
+ if name != "extrinsic":
+ reward_signal_minibatches[name] = buffer.sample_mini_batch(
+ self.hyperparameters.batch_size,
+ sequence_length=self.policy.sequence_length,
+ )
update_stats = self.optimizer.update_reward_signals(
reward_signal_minibatches, n_sequences
)
@@ -384,14 +323,9 @@ def _update_reward_signals(self) -> None:
self._stats_reporter.add_stat(stat, np.mean(stat_list))
def create_sac_optimizer(self) -> TorchSACOptimizer:
- if self.framework == FrameworkType.PYTORCH:
- return TorchSACOptimizer( # type: ignore
- cast(TorchPolicy, self.policy), self.trainer_settings # type: ignore
- ) # type: ignore
- else:
- return SACOptimizer( # type: ignore
- cast(TFPolicy, self.policy), self.trainer_settings # type: ignore
- ) # type: ignore
+ return TorchSACOptimizer( # type: ignore
+ cast(TorchPolicy, self.policy), self.trainer_settings # type: ignore
+ ) # type: ignore
def add_policy(
self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy
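
After this change, SAC samples a replay-buffer minibatch for every reward signal except the extrinsic one before calling update_reward_signals. A simplified sketch of that selection loop follows; the helper name is an assumption, and the sample_mini_batch call simply mirrors the code above.

# Simplified sketch of the reward-signal minibatch sampling above; illustrative only.
def sample_reward_signal_minibatches(reward_signal_names, buffer, batch_size, sequence_length):
    minibatches = {}
    for name in reward_signal_names:
        # The extrinsic signal has no learned model to update, so no minibatch is needed.
        if name != "extrinsic":
            minibatches[name] = buffer.sample_mini_batch(
                batch_size, sequence_length=sequence_length
            )
    return minibatches
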
diff --git a/ml-agents/mlagents/trainers/settings.py b/ml-agents/mlagents/trainers/settings.py
index 0147256c1e..abc07c54d1 100644
--- a/ml-agents/mlagents/trainers/settings.py
+++ b/ml-agents/mlagents/trainers/settings.py
@@ -592,11 +592,6 @@ def to_settings(self) -> type:
return _mapping[self]
-class FrameworkType(Enum):
- TENSORFLOW: str = "tensorflow"
- PYTORCH: str = "pytorch"
-
-
@attr.s(auto_attribs=True)
class TrainerSettings(ExportableSettings):
default_override: ClassVar[Optional["TrainerSettings"]] = None
@@ -620,7 +615,6 @@ def _set_default_hyperparameters(self):
threaded: bool = True
self_play: Optional[SelfPlaySettings] = None
behavioral_cloning: Optional[BehavioralCloningSettings] = None
- framework: FrameworkType = FrameworkType.PYTORCH
cattr.register_structure_hook(
Dict[RewardSignalType, RewardSignalSettings], RewardSignalSettings.structure
@@ -662,6 +656,10 @@ def structure(d: Mapping, t: type) -> Any:
deep_update_dict(d_copy, d)
+ if "framework" in d_copy:
+ logger.warning("The framework option has been deprecated and will be ignored.")
+ d_copy.pop("framework", None)
+
for key, val in d_copy.items():
if attr.has(type(val)):
# Don't convert already-converted attrs classes.
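
The structure hook above now warns about and drops the deprecated framework key before converting the raw configuration dict. A small standalone sketch of the same pattern for any deprecated key follows; the function name and logger setup are illustrative, not ML-Agents API.

# Illustrative sketch of dropping deprecated config keys with a warning, as done above.
import logging

logger = logging.getLogger(__name__)

def drop_deprecated_keys(config: dict, deprecated=("framework",)) -> dict:
    cleaned = dict(config)
    for key in deprecated:
        if key in cleaned:
            logger.warning(f"The {key} option has been deprecated and will be ignored.")
            cleaned.pop(key)
    return cleaned
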
diff --git a/ml-agents/mlagents/trainers/stats.py b/ml-agents/mlagents/trainers/stats.py
index b34937e356..1074d61520 100644
--- a/ml-agents/mlagents/trainers/stats.py
+++ b/ml-agents/mlagents/trainers/stats.py
@@ -10,7 +10,7 @@
from mlagents_envs.logging_util import get_logger
from mlagents_envs.timers import set_gauge
from torch.utils.tensorboard import SummaryWriter
-from mlagents.tf_utils.globals import get_rank
+from mlagents.torch_utils.globals import get_rank
logger = get_logger(__name__)
diff --git a/ml-agents/mlagents/trainers/tests/__init__.py b/ml-agents/mlagents/trainers/tests/__init__.py
index 85482cb137..19fd7ccfa7 100644
--- a/ml-agents/mlagents/trainers/tests/__init__.py
+++ b/ml-agents/mlagents/trainers/tests/__init__.py
@@ -19,7 +19,7 @@ def _check_no_float64(arr, kwargs_dtype):
# tb[-2] is the wrapper function, e.g. np_array_no_float64
# we want the calling function, so use tb[-3]
filename = tb[-3].filename
- # Only raise if this came from mlagents code, not tensorflow
+ # Only raise if this came from mlagents code
if (
"ml-agents/mlagents" in filename
or "ml-agents-envs/mlagents" in filename
diff --git a/ml-agents/mlagents/trainers/tests/tensorflow/BasicLearning.pb b/ml-agents/mlagents/trainers/tests/tensorflow/BasicLearning.pb
deleted file mode 100644
index e8bd4ad86f..0000000000
Binary files a/ml-agents/mlagents/trainers/tests/tensorflow/BasicLearning.pb and /dev/null differ
diff --git a/ml-agents/mlagents/trainers/tests/tensorflow/__init__.py b/ml-agents/mlagents/trainers/tests/tensorflow/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/ml-agents/mlagents/trainers/tests/tensorflow/test_barracuda_converter.py b/ml-agents/mlagents/trainers/tests/tensorflow/test_barracuda_converter.py
deleted file mode 100644
index 7ec1b1b403..0000000000
--- a/ml-agents/mlagents/trainers/tests/tensorflow/test_barracuda_converter.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import os
-import tempfile
-
-import mlagents.trainers.tf.tensorflow_to_barracuda as tf2bc
-
-
-def test_barracuda_converter():
- path_prefix = os.path.dirname(os.path.abspath(__file__))
- tmpfile = os.path.join(
- tempfile._get_default_tempdir(), next(tempfile._get_candidate_names()) + ".nn"
- )
-
- # make sure there are no left-over files
- if os.path.isfile(tmpfile):
- os.remove(tmpfile)
-
- tf2bc.convert(path_prefix + "/BasicLearning.pb", tmpfile)
-
- # test if file exists after conversion
- assert os.path.isfile(tmpfile)
- # currently converter produces small output file even if input file is empty
- # 100 bytes is high enough to prove that conversion was successful
- assert os.path.getsize(tmpfile) > 100
-
- # cleanup
- os.remove(tmpfile)
diff --git a/ml-agents/mlagents/trainers/tests/tensorflow/test_bcmodule.py b/ml-agents/mlagents/trainers/tests/tensorflow/test_bcmodule.py
deleted file mode 100644
index 1eceefd7d9..0000000000
--- a/ml-agents/mlagents/trainers/tests/tensorflow/test_bcmodule.py
+++ /dev/null
@@ -1,118 +0,0 @@
-import pytest
-import mlagents.trainers.tests.mock_brain as mb
-
-import numpy as np
-
-from mlagents.trainers.policy.tf_policy import TFPolicy
-from mlagents.trainers.tf.components.bc.module import BCModule
-from mlagents.trainers.settings import (
- TrainerSettings,
- BehavioralCloningSettings,
- NetworkSettings,
-)
-
-from mlagents.trainers.tests.dummy_config import (
- DISCRETE_DEMO_PATH,
- CONTINUOUS_DEMO_PATH,
-)
-
-
-def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample):
- # model_path = env.external_brain_names[0]
- trainer_config = TrainerSettings()
- trainer_config.network_settings.memory = (
- NetworkSettings.MemorySettings() if use_rnn else None
- )
- policy = TFPolicy(
- 0, mock_behavior_specs, trainer_config, tanhresample, tanhresample
- )
- with policy.graph.as_default():
- bc_module = BCModule(
- policy,
- policy_learning_rate=trainer_config.hyperparameters.learning_rate,
- default_batch_size=trainer_config.hyperparameters.batch_size,
- default_num_epoch=3,
- settings=bc_settings,
- )
- policy.initialize() # Normally the optimizer calls this after the BCModule is created
- return bc_module
-
-
-# Test default values
-def test_bcmodule_defaults():
- # See if default values match
- mock_specs = mb.create_mock_3dball_behavior_specs()
- bc_settings = BehavioralCloningSettings(demo_path=CONTINUOUS_DEMO_PATH)
- bc_module = create_bc_module(mock_specs, bc_settings, False, False)
- assert bc_module.num_epoch == 3
- assert bc_module.batch_size == TrainerSettings().hyperparameters.batch_size
- # Assign strange values and see if it overrides properly
- bc_settings = BehavioralCloningSettings(
- demo_path=CONTINUOUS_DEMO_PATH, num_epoch=100, batch_size=10000
- )
- bc_module = create_bc_module(mock_specs, bc_settings, False, False)
- assert bc_module.num_epoch == 100
- assert bc_module.batch_size == 10000
-
-
-# Test with continuous control env and vector actions
-@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
-def test_bcmodule_update(is_sac):
- mock_specs = mb.create_mock_3dball_behavior_specs()
- bc_settings = BehavioralCloningSettings(demo_path=CONTINUOUS_DEMO_PATH)
- bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
- stats = bc_module.update()
- for _, item in stats.items():
- assert isinstance(item, np.float32)
-
-
-# Test with constant pretraining learning rate
-@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
-def test_bcmodule_constant_lr_update(is_sac):
- mock_specs = mb.create_mock_3dball_behavior_specs()
- bc_settings = BehavioralCloningSettings(demo_path=CONTINUOUS_DEMO_PATH, steps=0)
- bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
- stats = bc_module.update()
- for _, item in stats.items():
- assert isinstance(item, np.float32)
- old_learning_rate = bc_module.current_lr
-
- _ = bc_module.update()
- assert old_learning_rate == bc_module.current_lr
-
-
-# Test with RNN
-@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
-def test_bcmodule_rnn_update(is_sac):
- mock_specs = mb.create_mock_3dball_behavior_specs()
- bc_settings = BehavioralCloningSettings(demo_path=CONTINUOUS_DEMO_PATH)
- bc_module = create_bc_module(mock_specs, bc_settings, True, is_sac)
- stats = bc_module.update()
- for _, item in stats.items():
- assert isinstance(item, np.float32)
-
-
-# Test with discrete control and visual observations
-@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
-def test_bcmodule_dc_visual_update(is_sac):
- mock_specs = mb.create_mock_banana_behavior_specs()
- bc_settings = BehavioralCloningSettings(demo_path=DISCRETE_DEMO_PATH)
- bc_module = create_bc_module(mock_specs, bc_settings, False, is_sac)
- stats = bc_module.update()
- for _, item in stats.items():
- assert isinstance(item, np.float32)
-
-
-# Test with discrete control, visual observations and RNN
-@pytest.mark.parametrize("is_sac", [True, False], ids=["sac", "ppo"])
-def test_bcmodule_rnn_dc_update(is_sac):
- mock_specs = mb.create_mock_banana_behavior_specs()
- bc_settings = BehavioralCloningSettings(demo_path=DISCRETE_DEMO_PATH)
- bc_module = create_bc_module(mock_specs, bc_settings, True, is_sac)
- stats = bc_module.update()
- for _, item in stats.items():
- assert isinstance(item, np.float32)
-
-
-if __name__ == "__main__":
- pytest.main()
diff --git a/ml-agents/mlagents/trainers/tests/tensorflow/test_distributions.py b/ml-agents/mlagents/trainers/tests/tensorflow/test_distributions.py
deleted file mode 100644
index c098d067f0..0000000000
--- a/ml-agents/mlagents/trainers/tests/tensorflow/test_distributions.py
+++ /dev/null
@@ -1,114 +0,0 @@
-import pytest
-
-from mlagents.tf_utils import tf
-
-from mlagents.trainers.tf.distributions import (
- GaussianDistribution,
- MultiCategoricalDistribution,
-)
-
-
-VECTOR_ACTION_SPACE = [2]
-VECTOR_OBS_SPACE = 8
-DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
-BUFFER_INIT_SAMPLES = 32
-NUM_AGENTS = 12
-
-
-def test_gaussian_distribution():
- with tf.Graph().as_default():
- logits = tf.Variable(initial_value=[[1, 1]], trainable=True, dtype=tf.float32)
- distribution = GaussianDistribution(
- logits,
- act_size=VECTOR_ACTION_SPACE,
- reparameterize=False,
- tanh_squash=False,
- )
- with tf.Session() as sess:
- init = tf.global_variables_initializer()
- sess.run(init)
- output = sess.run(distribution.sample)
- for _ in range(10):
- output = sess.run([distribution.sample, distribution.log_probs])
- for out in output:
- assert out.shape[1] == VECTOR_ACTION_SPACE[0]
- output = sess.run([distribution.total_log_probs])
- assert output[0].shape[0] == 1
- # Test entropy is correct
- log_std_tensor = tf.get_default_graph().get_tensor_by_name(
- "log_std/BiasAdd:0"
- )
- feed_dict = {log_std_tensor: [[1.0, 1.0]]}
- entropy = sess.run([distribution.entropy], feed_dict=feed_dict)
- # Entropy with log_std of 1.0 should be 2.42
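-            # For a diagonal Gaussian the per-dimension entropy is
-            # 0.5 * log(2 * pi * e) + log_std ~= 1.42 + 1.0 = 2.42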
- assert pytest.approx(entropy[0], 0.01) == 2.42
-
-
-def test_tanh_distribution():
- with tf.Graph().as_default():
- logits = tf.Variable(initial_value=[[0, 0]], trainable=True, dtype=tf.float32)
- distribution = GaussianDistribution(
- logits, act_size=VECTOR_ACTION_SPACE, reparameterize=False, tanh_squash=True
- )
- with tf.Session() as sess:
- init = tf.global_variables_initializer()
- sess.run(init)
- output = sess.run(distribution.sample)
- for _ in range(10):
- output = sess.run([distribution.sample, distribution.log_probs])
- for out in output:
- assert out.shape[1] == VECTOR_ACTION_SPACE[0]
- # Assert action never exceeds [-1,1]
- action = output[0][0]
- for act in action:
- assert act >= -1 and act <= 1
- output = sess.run([distribution.total_log_probs])
- assert output[0].shape[0] == 1
-
-
-def test_multicategorical_distribution():
- with tf.Graph().as_default():
- logits = tf.Variable(initial_value=[[0, 0]], trainable=True, dtype=tf.float32)
- action_masks = tf.Variable(
- initial_value=[[1 for _ in range(sum(DISCRETE_ACTION_SPACE))]],
- trainable=True,
- dtype=tf.float32,
- )
- distribution = MultiCategoricalDistribution(
- logits, act_size=DISCRETE_ACTION_SPACE, action_masks=action_masks
- )
- with tf.Session() as sess:
- init = tf.global_variables_initializer()
- sess.run(init)
- output = sess.run(distribution.sample)
- for _ in range(10):
- sample, log_probs, entropy = sess.run(
- [distribution.sample, distribution.log_probs, distribution.entropy]
- )
- assert len(log_probs[0]) == sum(DISCRETE_ACTION_SPACE)
-            # Assert one sampled action per discrete branch, each within its branch size
- assert len(sample[0]) == len(DISCRETE_ACTION_SPACE)
- for i, act in enumerate(sample[0]):
- assert act >= 0 and act <= DISCRETE_ACTION_SPACE[i]
- output = sess.run([distribution.total_log_probs])
- assert output[0].shape[0] == 1
- # Make sure entropy is correct
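-            # A uniform distribution over the [3, 3, 3, 2] branches has entropy
-            # 3 * ln(3) + ln(2) ~= 3.99; a freshly initialized network should be
-            # close to that, so 3.8 is a safe lower bound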
- assert entropy[0] > 3.8
-
- # Test masks
- mask = []
- for space in DISCRETE_ACTION_SPACE:
- mask.append(1)
- for _action_space in range(1, space):
- mask.append(0)
- for _ in range(10):
- sample, log_probs = sess.run(
- [distribution.sample, distribution.log_probs],
- feed_dict={action_masks: [mask]},
- )
- for act in sample[0]:
- assert act >= 0 and act <= 1
- output = sess.run([distribution.total_log_probs])
diff --git a/ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py b/ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py
deleted file mode 100644
index e784d5286b..0000000000
--- a/ml-agents/mlagents/trainers/tests/tensorflow/test_ghost.py
+++ /dev/null
@@ -1,219 +0,0 @@
-import pytest
-
-import numpy as np
-
-from mlagents.trainers.ghost.trainer import GhostTrainer
-from mlagents.trainers.ghost.controller import GhostController
-from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
-from mlagents.trainers.ppo.trainer import PPOTrainer
-from mlagents.trainers.agent_processor import AgentManagerQueue
-from mlagents.trainers.tests import mock_brain as mb
-from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
-from mlagents.trainers.settings import TrainerSettings, SelfPlaySettings
-
-
-@pytest.fixture
-def dummy_config():
- return TrainerSettings(self_play=SelfPlaySettings())
-
-
-VECTOR_ACTION_SPACE = 1
-VECTOR_OBS_SPACE = 8
-DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
-BUFFER_INIT_SAMPLES = 513
-NUM_AGENTS = 12
-
-
-@pytest.mark.parametrize("use_discrete", [True, False])
-def test_load_and_set(dummy_config, use_discrete):
- mock_specs = mb.setup_test_behavior_specs(
- use_discrete,
- False,
- vector_action_space=DISCRETE_ACTION_SPACE
- if use_discrete
- else VECTOR_ACTION_SPACE,
- vector_obs_space=VECTOR_OBS_SPACE,
- )
-
- trainer_params = dummy_config
- trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
- trainer.seed = 1
- policy = trainer.create_policy("test", mock_specs, create_graph=True)
- trainer.seed = 20 # otherwise graphs are the same
- to_load_policy = trainer.create_policy("test", mock_specs, create_graph=True)
-
- weights = policy.get_weights()
- load_weights = to_load_policy.get_weights()
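-    # The two policies were built with different seeds, so their initial weights are
-    # expected to differ; any coincidental match here is ignored.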
- try:
- for w, lw in zip(weights, load_weights):
- np.testing.assert_array_equal(w, lw)
- except AssertionError:
- pass
-
- to_load_policy.load_weights(weights)
- load_weights = to_load_policy.get_weights()
-
- for w, lw in zip(weights, load_weights):
- np.testing.assert_array_equal(w, lw)
-
-
-def test_resume(dummy_config, tmp_path):
- mock_specs = mb.setup_test_behavior_specs(
- True, False, vector_action_space=[2], vector_obs_space=1
- )
- behavior_id_team0 = "test_brain?team=0"
- behavior_id_team1 = "test_brain?team=1"
- brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
- tmp_path = tmp_path.as_posix()
- ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, tmp_path)
- controller = GhostController(100)
- trainer = GhostTrainer(
- ppo_trainer, brain_name, controller, 0, dummy_config, True, tmp_path
- )
-
- parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
- policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
- trainer.add_policy(parsed_behavior_id0, policy)
-
- parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
- policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
- trainer.add_policy(parsed_behavior_id1, policy)
-
- trainer.save_model()
-
- # Make a new trainer, check that the policies are the same
- ppo_trainer2 = PPOTrainer(brain_name, 0, dummy_config, True, True, 0, tmp_path)
- trainer2 = GhostTrainer(
- ppo_trainer2, brain_name, controller, 0, dummy_config, True, tmp_path
- )
- policy = trainer2.create_policy(parsed_behavior_id0, mock_specs)
- trainer2.add_policy(parsed_behavior_id0, policy)
-
- policy = trainer2.create_policy(parsed_behavior_id1, mock_specs)
- trainer2.add_policy(parsed_behavior_id1, policy)
-
- trainer1_policy = trainer.get_policy(parsed_behavior_id1.behavior_id)
- trainer2_policy = trainer2.get_policy(parsed_behavior_id1.behavior_id)
- weights = trainer1_policy.get_weights()
- weights2 = trainer2_policy.get_weights()
-
- for w, lw in zip(weights, weights2):
- np.testing.assert_array_equal(w, lw)
-
-
-def test_process_trajectory(dummy_config):
- mock_specs = mb.setup_test_behavior_specs(
- True, False, vector_action_space=[2], vector_obs_space=1
- )
- behavior_id_team0 = "test_brain?team=0"
- behavior_id_team1 = "test_brain?team=1"
- brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
-
- ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
- controller = GhostController(100)
- trainer = GhostTrainer(
- ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
- )
-
- # first policy encountered becomes policy trained by wrapped PPO
- parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
- policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
- trainer.add_policy(parsed_behavior_id0, policy)
- trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
- trainer.subscribe_trajectory_queue(trajectory_queue0)
-
-    # Ghost trainer should ignore this queue because it is off-policy
- parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
- policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
- trainer.add_policy(parsed_behavior_id1, policy)
- trajectory_queue1 = AgentManagerQueue(behavior_id_team1)
- trainer.subscribe_trajectory_queue(trajectory_queue1)
-
- time_horizon = 15
- trajectory = make_fake_trajectory(
- length=time_horizon,
- max_step_complete=True,
- observation_shapes=[(1,)],
- action_spec=mock_specs.action_spec,
- )
- trajectory_queue0.put(trajectory)
- trainer.advance()
-
- # Check that trainer put trajectory in update buffer
- assert trainer.trainer.update_buffer.num_experiences == 15
-
- trajectory_queue1.put(trajectory)
- trainer.advance()
-
- # Check that ghost trainer ignored off policy queue
- assert trainer.trainer.update_buffer.num_experiences == 15
- # Check that it emptied the queue
- assert trajectory_queue1.empty()
-
-
-def test_publish_queue(dummy_config):
- mock_specs = mb.setup_test_behavior_specs(
- True, False, vector_action_space=[1], vector_obs_space=8
- )
-
- behavior_id_team0 = "test_brain?team=0"
- behavior_id_team1 = "test_brain?team=1"
-
- parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
-
- brain_name = parsed_behavior_id0.brain_name
-
- ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
- controller = GhostController(100)
- trainer = GhostTrainer(
- ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
- )
-
- # First policy encountered becomes policy trained by wrapped PPO
- # This queue should remain empty after swap snapshot
- policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
- trainer.add_policy(parsed_behavior_id0, policy)
- policy_queue0 = AgentManagerQueue(behavior_id_team0)
- trainer.publish_policy_queue(policy_queue0)
-
- # Ghost trainer should use this queue for ghost policy swap
- parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
- policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
- trainer.add_policy(parsed_behavior_id1, policy)
- policy_queue1 = AgentManagerQueue(behavior_id_team1)
- trainer.publish_policy_queue(policy_queue1)
-
-    # Check that a ghost trainer snapshot swap pushes to the ghost queue, not the trainer queue
- assert policy_queue0.empty() and policy_queue1.empty()
- trainer._swap_snapshots()
- assert policy_queue0.empty() and not policy_queue1.empty()
- # clear
- policy_queue1.get_nowait()
-
- mock_specs = mb.setup_test_behavior_specs(
- False,
- False,
- vector_action_space=VECTOR_ACTION_SPACE,
- vector_obs_space=VECTOR_OBS_SPACE,
- )
-
- buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_specs)
- # Mock out reward signal eval
- buffer["extrinsic_rewards"] = buffer["environment_rewards"]
- buffer["extrinsic_returns"] = buffer["environment_rewards"]
- buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
- buffer["curiosity_rewards"] = buffer["environment_rewards"]
- buffer["curiosity_returns"] = buffer["environment_rewards"]
- buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
- buffer["advantages"] = buffer["environment_rewards"]
- trainer.trainer.update_buffer = buffer
-
-    # When the ghost trainer advances and the wrapped trainer's buffer is full,
-    # the wrapped trainer pushes the updated policy to the correct queue
- assert policy_queue0.empty() and policy_queue1.empty()
- trainer.advance()
- assert not policy_queue0.empty() and policy_queue1.empty()
-
-
-if __name__ == "__main__":
- pytest.main()
diff --git a/ml-agents/mlagents/trainers/tests/tensorflow/test_models.py b/ml-agents/mlagents/trainers/tests/tensorflow/test_models.py
deleted file mode 100644
index 81e68bcc6a..0000000000
--- a/ml-agents/mlagents/trainers/tests/tensorflow/test_models.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import pytest
-
-from mlagents.trainers.tf.models import ModelUtils
-from mlagents.tf_utils import tf
-from mlagents_envs.base_env import BehaviorSpec, ActionSpec
-
-
-def create_behavior_spec(num_visual, num_vector, vector_size):
- behavior_spec = BehaviorSpec(
- [(84, 84, 3)] * int(num_visual) + [(vector_size,)] * int(num_vector),
- ActionSpec.create_discrete((1,)),
- )
- return behavior_spec
-
-
-@pytest.mark.parametrize("num_visual", [1, 2, 4])
-@pytest.mark.parametrize("num_vector", [1, 2, 4])
-def test_create_input_placeholders(num_vector, num_visual):
- vec_size = 8
- name_prefix = "test123"
- bspec = create_behavior_spec(num_visual, num_vector, vec_size)
- vec_in, vis_in = ModelUtils.create_input_placeholders(
- bspec.observation_shapes, name_prefix=name_prefix
- )
-
- assert isinstance(vis_in, list)
- assert len(vis_in) == num_visual
- assert isinstance(vec_in, tf.Tensor)
- assert vec_in.get_shape().as_list()[1] == num_vector * 8
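-    # Vector observations are concatenated into a single placeholder, so its
-    # width is the number of vector observations times vec_size.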
-
- # Check names contain prefix and vis shapes are correct
- for _vis in vis_in:
- assert _vis.get_shape().as_list() == [None, 84, 84, 3]
- assert _vis.name.startswith(name_prefix)
- assert vec_in.name.startswith(name_prefix)
diff --git a/ml-agents/mlagents/trainers/tests/tensorflow/test_nn_policy.py b/ml-agents/mlagents/trainers/tests/tensorflow/test_nn_policy.py
deleted file mode 100644
index 25a807ce93..0000000000
--- a/ml-agents/mlagents/trainers/tests/tensorflow/test_nn_policy.py
+++ /dev/null
@@ -1,289 +0,0 @@
-import pytest
-
-import numpy as np
-from mlagents.tf_utils import tf
-
-from mlagents.trainers.policy.tf_policy import TFPolicy
-from mlagents.trainers.tf.models import ModelUtils, Tensor3DShape
-from mlagents.trainers.exception import UnityTrainerException
-from mlagents.trainers.tests import mock_brain as mb
-from mlagents.trainers.settings import TrainerSettings, NetworkSettings, EncoderType
-from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
-
-
-VECTOR_ACTION_SPACE = 2
-VECTOR_OBS_SPACE = 8
-DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
-BUFFER_INIT_SAMPLES = 32
-NUM_AGENTS = 12
-EPSILON = 1e-7
-
-
-def create_policy_mock(
- dummy_config: TrainerSettings,
- use_rnn: bool = False,
- use_discrete: bool = True,
- use_visual: bool = False,
- seed: int = 0,
-) -> TFPolicy:
- mock_spec = mb.setup_test_behavior_specs(
- use_discrete,
- use_visual,
- vector_action_space=DISCRETE_ACTION_SPACE
- if use_discrete
- else VECTOR_ACTION_SPACE,
- vector_obs_space=VECTOR_OBS_SPACE,
- )
-
- trainer_settings = dummy_config
- trainer_settings.keep_checkpoints = 3
- trainer_settings.network_settings.memory = (
- NetworkSettings.MemorySettings() if use_rnn else None
- )
- policy = TFPolicy(seed, mock_spec, trainer_settings)
- return policy
-
-
-def _compare_two_policies(policy1: TFPolicy, policy2: TFPolicy) -> None:
- """
- Make sure two policies have the same output for the same input.
- """
- decision_step, _ = mb.create_steps_from_behavior_spec(
- policy1.behavior_spec, num_agents=1
- )
- run_out1 = policy1.evaluate(decision_step, list(decision_step.agent_id))
- run_out2 = policy2.evaluate(decision_step, list(decision_step.agent_id))
-
- np.testing.assert_array_equal(run_out2["log_probs"], run_out1["log_probs"])
-
-
-@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
-@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
-@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
-def test_policy_evaluate(rnn, visual, discrete):
- # Test evaluate
- tf.reset_default_graph()
- policy = create_policy_mock(
- TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
- )
- decision_step, terminal_step = mb.create_steps_from_behavior_spec(
- policy.behavior_spec, num_agents=NUM_AGENTS
- )
-
- run_out = policy.evaluate(decision_step, list(decision_step.agent_id))
- if discrete:
-        assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
- else:
- assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE)
-
-
-def test_large_normalization():
- behavior_spec = mb.setup_test_behavior_specs(
- use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
- )
- # Taken from Walker seed 3713 which causes NaN without proper initialization
- large_obs1 = [
- 1800.00036621,
- 1799.96972656,
- 1800.01245117,
- 1800.07214355,
- 1800.02758789,
- 1799.98303223,
- 1799.88647461,
- 1799.89575195,
- 1800.03479004,
- 1800.14025879,
- 1800.17675781,
- 1800.20581055,
- 1800.33740234,
- 1800.36450195,
- 1800.43457031,
- 1800.45544434,
- 1800.44604492,
- 1800.56713867,
- 1800.73901367,
- ]
- large_obs2 = [
- 1799.99975586,
- 1799.96679688,
- 1799.92980957,
- 1799.89550781,
- 1799.93774414,
- 1799.95300293,
- 1799.94067383,
- 1799.92993164,
- 1799.84057617,
- 1799.69873047,
- 1799.70605469,
- 1799.82849121,
- 1799.85095215,
- 1799.76977539,
- 1799.78283691,
- 1799.76708984,
- 1799.67163086,
- 1799.59191895,
- 1799.5135498,
- 1799.45556641,
- 1799.3717041,
- ]
- policy = TFPolicy(
- 0,
- behavior_spec,
- TrainerSettings(network_settings=NetworkSettings(normalize=True)),
- "testdir",
- False,
- )
- time_horizon = len(large_obs1)
- trajectory = make_fake_trajectory(
- length=time_horizon,
- max_step_complete=True,
- observation_shapes=[(1,)],
- action_spec=behavior_spec.action_spec,
- )
- for i in range(time_horizon):
- trajectory.steps[i].obs[0] = np.array([large_obs1[i]], dtype=np.float32)
- trajectory_buffer = trajectory.to_agentbuffer()
- policy.update_normalization(trajectory_buffer["vector_obs"])
-
- # Check that the running mean and variance is correct
- steps, mean, variance = policy.sess.run(
- [policy.normalization_steps, policy.running_mean, policy.running_variance]
- )
- assert mean[0] == pytest.approx(np.mean(large_obs1, dtype=np.float32), abs=0.01)
- assert variance[0] / steps == pytest.approx(
- np.var(large_obs1, dtype=np.float32), abs=0.01
- )
-
- time_horizon = len(large_obs2)
- trajectory = make_fake_trajectory(
- length=time_horizon,
- max_step_complete=True,
- observation_shapes=[(1,)],
- action_spec=behavior_spec.action_spec,
- )
- for i in range(time_horizon):
- trajectory.steps[i].obs[0] = np.array([large_obs2[i]], dtype=np.float32)
-
- trajectory_buffer = trajectory.to_agentbuffer()
- policy.update_normalization(trajectory_buffer["vector_obs"])
-
- steps, mean, variance = policy.sess.run(
- [policy.normalization_steps, policy.running_mean, policy.running_variance]
- )
-
- assert mean[0] == pytest.approx(
- np.mean(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
- )
- assert variance[0] / steps == pytest.approx(
- np.var(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
- )
-
-
-def test_normalization():
- behavior_spec = mb.setup_test_behavior_specs(
- use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
- )
- time_horizon = 6
- trajectory = make_fake_trajectory(
- length=time_horizon,
- max_step_complete=True,
- observation_shapes=[(1,)],
- action_spec=behavior_spec.action_spec,
- )
- # Change half of the obs to 0
- for i in range(3):
- trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
- policy = TFPolicy(
- 0,
- behavior_spec,
- TrainerSettings(network_settings=NetworkSettings(normalize=True)),
- "testdir",
- False,
- )
-
- trajectory_buffer = trajectory.to_agentbuffer()
- policy.update_normalization(trajectory_buffer["vector_obs"])
-
- # Check that the running mean and variance is correct
- steps, mean, variance = policy.sess.run(
- [policy.normalization_steps, policy.running_mean, policy.running_variance]
- )
-
- assert steps == 6
- assert mean[0] == 0.5
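-    # Half of the six observations were zeroed above and the rest are 1.0, so the
-    # expected mean is 0.5 and the expected variance is 0.25.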
-    # Note: variance is initialized to the variance of the initial trajectory + EPSILON
-    # (to avoid divide by 0) and multiplied by the number of steps. The correct answer is 0.25
- assert variance[0] / steps == pytest.approx(0.25, abs=0.01)
- # Make another update, this time with all 1's
- time_horizon = 10
- trajectory = make_fake_trajectory(
- length=time_horizon,
- max_step_complete=True,
- observation_shapes=[(1,)],
- action_spec=behavior_spec.action_spec,
- )
- trajectory_buffer = trajectory.to_agentbuffer()
- policy.update_normalization(trajectory_buffer["vector_obs"])
-
- # Check that the running mean and variance is correct
- steps, mean, variance = policy.sess.run(
- [policy.normalization_steps, policy.running_mean, policy.running_variance]
- )
-
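-    # The 16 observations are now three 0's and thirteen 1's: mean = 13/16 = 0.8125,
-    # variance = (3 * 0.8125**2 + 13 * 0.1875**2) / 16 ~= 0.152.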
- assert steps == 16
- assert mean[0] == 0.8125
- assert variance[0] / steps == pytest.approx(0.152, abs=0.01)
-
-
-def test_min_visual_size():
-    # Make sure each EncoderType has an entry in MIN_RESOLUTION_FOR_ENCODER
- assert set(ModelUtils.MIN_RESOLUTION_FOR_ENCODER.keys()) == set(EncoderType)
-
- for encoder_type in EncoderType:
- with tf.Graph().as_default():
- good_size = ModelUtils.MIN_RESOLUTION_FOR_ENCODER[encoder_type]
- good_res = Tensor3DShape(width=good_size, height=good_size, num_channels=3)
- vis_input = ModelUtils.create_visual_input(good_res, "test_min_visual_size")
- ModelUtils._check_resolution_for_encoder(vis_input, encoder_type)
- enc_func = ModelUtils.get_encoder_for_type(encoder_type)
- enc_func(vis_input, 32, ModelUtils.swish, 1, "test", False)
-
- # Anything under the min size should raise an exception. If not, decrease the min size!
- with pytest.raises(Exception):
- with tf.Graph().as_default():
- bad_size = ModelUtils.MIN_RESOLUTION_FOR_ENCODER[encoder_type] - 1
- bad_res = Tensor3DShape(width=bad_size, height=bad_size, num_channels=3)
- vis_input = ModelUtils.create_visual_input(
- bad_res, "test_min_visual_size"
- )
-
- with pytest.raises(UnityTrainerException):
- # Make sure we'd hit a friendly error during model setup time.
- ModelUtils._check_resolution_for_encoder(vis_input, encoder_type)
-
- enc_func = ModelUtils.get_encoder_for_type(encoder_type)
- enc_func(vis_input, 32, ModelUtils.swish, 1, "test", False)
-
-
-def test_step_overflow():
- behavior_spec = mb.setup_test_behavior_specs(
- use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
- )
-
- policy = TFPolicy(
- 0,
- behavior_spec,
- TrainerSettings(network_settings=NetworkSettings(normalize=True)),
- create_tf_graph=False,
- )
- policy.create_input_placeholders()
- policy.initialize()
-
- policy.set_step(2 ** 31 - 1)
- assert policy.get_current_step() == 2 ** 31 - 1
- policy.increment_step(3)
- assert policy.get_current_step() == 2 ** 31 + 2
-
-
-if __name__ == "__main__":
- pytest.main()
diff --git a/ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py b/ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py
deleted file mode 100644
index 2cda566cdf..0000000000
--- a/ml-agents/mlagents/trainers/tests/tensorflow/test_ppo.py
+++ /dev/null
@@ -1,393 +0,0 @@
-from unittest import mock
-import pytest
-
-import numpy as np
-from mlagents.tf_utils import tf
-import attr
-from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
-
-from mlagents.trainers.trainer.rl_trainer import RLTrainer
-from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
-from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
-from mlagents.trainers.policy.tf_policy import TFPolicy
-from mlagents.trainers.agent_processor import AgentManagerQueue
-from mlagents.trainers.tests import mock_brain as mb
-from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
-from mlagents.trainers.settings import NetworkSettings, FrameworkType
-from mlagents.trainers.tests.dummy_config import ( # noqa: F401; pylint: disable=unused-variable
- curiosity_dummy_config,
- gail_dummy_config,
- ppo_dummy_config,
-)
-
-from mlagents_envs.base_env import ActionSpec
-
-
-@pytest.fixture
-def dummy_config():
- return attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW)
-
-
-VECTOR_ACTION_SPACE = 2
-VECTOR_OBS_SPACE = 8
-DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
-BUFFER_INIT_SAMPLES = 64
-NUM_AGENTS = 12
-
-CONTINUOUS_ACTION_SPEC = ActionSpec.create_continuous(VECTOR_ACTION_SPACE)
-DISCRETE_ACTION_SPEC = ActionSpec.create_discrete(tuple(DISCRETE_ACTION_SPACE))
-
-
-def _create_ppo_optimizer_ops_mock(dummy_config, use_rnn, use_discrete, use_visual):
- mock_specs = mb.setup_test_behavior_specs(
- use_discrete,
- use_visual,
- vector_action_space=DISCRETE_ACTION_SPACE
- if use_discrete
- else VECTOR_ACTION_SPACE,
- vector_obs_space=VECTOR_OBS_SPACE,
- )
-
- trainer_settings = attr.evolve(dummy_config, framework=FrameworkType.TENSORFLOW)
- trainer_settings.network_settings.memory = (
- NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
- if use_rnn
- else None
- )
- policy = TFPolicy(
- 0, mock_specs, trainer_settings, "test", False, create_tf_graph=False
- )
- optimizer = PPOOptimizer(policy, trainer_settings)
- policy.initialize()
- return optimizer
-
-
-@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
-@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
-@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
-def test_ppo_optimizer_update(dummy_config, rnn, visual, discrete):
- # Test evaluate
- tf.reset_default_graph()
- optimizer = _create_ppo_optimizer_ops_mock(
- dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
- )
- # Test update
- behavior_spec = optimizer.policy.behavior_spec
- update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
- # Mock out reward signal eval
- update_buffer["advantages"] = update_buffer["environment_rewards"]
- update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
- update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
- # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
- if discrete:
- n_agents = len(update_buffer["discrete_log_probs"])
- update_buffer["discrete_log_probs"] = np.ones(
- (n_agents, int(sum(behavior_spec.action_spec.discrete_branches))),
- dtype=np.float32,
- )
- else:
- n_agents = len(update_buffer["continuous_log_probs"])
- update_buffer["continuous_log_probs"] = np.ones(
- (n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
- )
-
- optimizer.update(
- update_buffer,
- num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
- )
-
-
-@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
-@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
-@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
-# We need to test this separately from test_reward_signals.py to ensure no interactions
-def test_ppo_optimizer_update_curiosity(
- dummy_config, curiosity_dummy_config, rnn, visual, discrete # noqa: F811
-):
- # Test evaluate
- tf.reset_default_graph()
- dummy_config.reward_signals = curiosity_dummy_config
- optimizer = _create_ppo_optimizer_ops_mock(
- dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
- )
- # Test update
- behavior_spec = optimizer.policy.behavior_spec
- update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
- # Mock out reward signal eval
- update_buffer["advantages"] = update_buffer["environment_rewards"]
- update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
- update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
- update_buffer["curiosity_returns"] = update_buffer["environment_rewards"]
- update_buffer["curiosity_value_estimates"] = update_buffer["environment_rewards"]
- # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
- if discrete:
- n_agents = len(update_buffer["discrete_log_probs"])
- update_buffer["discrete_log_probs"] = np.ones(
- (n_agents, int(sum(behavior_spec.action_spec.discrete_branches))),
- dtype=np.float32,
- )
- else:
- n_agents = len(update_buffer["continuous_log_probs"])
- update_buffer["continuous_log_probs"] = np.ones(
- (n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
- )
- optimizer.update(
- update_buffer,
- num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
- )
-
-
-# We need to test this separately from test_reward_signals.py to ensure no interactions
-def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config): # noqa: F811
- # Test evaluate
- tf.reset_default_graph()
- dummy_config.reward_signals = gail_dummy_config
- optimizer = _create_ppo_optimizer_ops_mock(
- attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW),
- use_rnn=False,
- use_discrete=False,
- use_visual=False,
- )
- # Test update
- behavior_spec = optimizer.policy.behavior_spec
- update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
- # Mock out reward signal eval
- update_buffer["advantages"] = update_buffer["environment_rewards"]
- update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
- update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
- update_buffer["gail_returns"] = update_buffer["environment_rewards"]
- update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
- # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
- n_agents = len(update_buffer["continuous_log_probs"])
- update_buffer["continuous_log_probs"] = np.ones(
- (n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
- )
- optimizer.update(
- update_buffer,
- num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
- )
-
-    # Also run an update with a much larger buffer to make sure it is handled
- update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
- # Mock out reward signal eval
- update_buffer["advantages"] = update_buffer["environment_rewards"]
- update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
- update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
- update_buffer["gail_returns"] = update_buffer["environment_rewards"]
- update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
- optimizer.update(
- update_buffer,
- num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
- )
-
-
-@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
-@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
-@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
-def test_ppo_get_value_estimates(dummy_config, rnn, visual, discrete):
- tf.reset_default_graph()
-
- optimizer = _create_ppo_optimizer_ops_mock(
- dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
- )
- time_horizon = 15
- trajectory = make_fake_trajectory(
- length=time_horizon,
- observation_shapes=optimizer.policy.behavior_spec.observation_shapes,
- max_step_complete=True,
- action_spec=DISCRETE_ACTION_SPEC if discrete else CONTINUOUS_ACTION_SPEC,
- )
- run_out, final_value_out = optimizer.get_trajectory_value_estimates(
- trajectory.to_agentbuffer(), trajectory.next_obs, done=False
- )
- for key, val in run_out.items():
- assert type(key) is str
- assert len(val) == 15
-
- run_out, final_value_out = optimizer.get_trajectory_value_estimates(
- trajectory.to_agentbuffer(), trajectory.next_obs, done=True
- )
- for key, val in final_value_out.items():
- assert type(key) is str
- assert val == 0.0
-
- # Check if we ignore terminal states properly
- optimizer.reward_signals["extrinsic"].use_terminal_states = False
- run_out, final_value_out = optimizer.get_trajectory_value_estimates(
- trajectory.to_agentbuffer(), trajectory.next_obs, done=False
- )
- for key, val in final_value_out.items():
- assert type(key) is str
- assert val != 0.0
-
-
-def test_rl_functions():
- rewards = np.array([0.0, 0.0, 0.0, 1.0], dtype=np.float32)
- gamma = 0.9
- returns = discount_rewards(rewards, gamma, 0.0)
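-    # With gamma = 0.9 and a single terminal reward of 1.0, the discounted returns
-    # are 0.9**3, 0.9**2, 0.9, 1.0 = 0.729, 0.81, 0.9, 1.0.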
- np.testing.assert_array_almost_equal(
- returns, np.array([0.729, 0.81, 0.9, 1.0], dtype=np.float32)
- )
-
-
-@mock.patch.object(RLTrainer, "create_model_saver")
-@mock.patch("mlagents.trainers.ppo.trainer.PPOOptimizer")
-def test_trainer_increment_step(ppo_optimizer, mock_create_model_saver):
-    trainer_params = attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW)
- mock_optimizer = mock.Mock()
- mock_optimizer.reward_signals = {}
- ppo_optimizer.return_value = mock_optimizer
-
- trainer = PPOTrainer("test_brain", 0, trainer_params, True, False, 0, "0")
- policy_mock = mock.Mock(spec=TFPolicy)
- policy_mock.get_current_step.return_value = 0
-    step_count = 5  # formerly 10; hard-coded since this function is no longer called through the trainer
- policy_mock.increment_step = mock.Mock(return_value=step_count)
- behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
- trainer.add_policy(behavior_id, policy_mock)
-
- trainer._increment_step(5, trainer.brain_name)
- policy_mock.increment_step.assert_called_with(5)
- assert trainer.step == step_count
-
-
-@pytest.mark.parametrize("use_discrete", [True, False])
-def test_trainer_update_policy(
- dummy_config, curiosity_dummy_config, use_discrete # noqa: F811
-):
- mock_behavior_spec = mb.setup_test_behavior_specs(
- use_discrete,
- False,
- vector_action_space=DISCRETE_ACTION_SPACE
- if use_discrete
- else VECTOR_ACTION_SPACE,
- vector_obs_space=VECTOR_OBS_SPACE,
- )
-
- trainer_params = dummy_config
- trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
- memory_size=10, sequence_length=16
- )
-
- # Test curiosity reward signal
- trainer_params.reward_signals = curiosity_dummy_config
- mock_brain_name = "MockBrain"
- behavior_id = BehaviorIdentifiers.from_name_behavior_id(mock_brain_name)
- trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
- policy = trainer.create_policy(behavior_id, mock_behavior_spec)
- trainer.add_policy(behavior_id, policy)
- # Test update with sequence length smaller than batch size
- buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_behavior_spec)
- # Mock out reward signal eval
- buffer["extrinsic_rewards"] = buffer["environment_rewards"]
- buffer["extrinsic_returns"] = buffer["environment_rewards"]
- buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
- buffer["curiosity_rewards"] = buffer["environment_rewards"]
- buffer["curiosity_returns"] = buffer["environment_rewards"]
- buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
- buffer["advantages"] = buffer["environment_rewards"]
- # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
- if use_discrete:
- n_agents = len(buffer["discrete_log_probs"])
- buffer["discrete_log_probs"].reset_field()
- for _ in range(n_agents):
- buffer["discrete_log_probs"].append(
- np.ones(
- int(sum(mock_behavior_spec.action_spec.discrete_branches)),
- dtype=np.float32,
- )
- )
- else:
- n_agents = len(buffer["continuous_log_probs"])
- buffer["continuous_log_probs"].reset_field()
- for _ in range(n_agents):
- buffer["continuous_log_probs"].append(
- np.ones(
- mock_behavior_spec.action_spec.continuous_size, dtype=np.float32
- )
- )
- trainer.update_buffer = buffer
- trainer._update_policy()
-
-
-def test_process_trajectory(dummy_config):
- behavior_spec = mb.setup_test_behavior_specs(
- True,
- False,
- vector_action_space=DISCRETE_ACTION_SPACE,
- vector_obs_space=VECTOR_OBS_SPACE,
- )
- mock_brain_name = "MockBrain"
- behavior_id = BehaviorIdentifiers.from_name_behavior_id(mock_brain_name)
- trainer = PPOTrainer("test_brain", 0, dummy_config, True, False, 0, "0")
- policy = trainer.create_policy(behavior_id, behavior_spec)
- trainer.add_policy(behavior_id, policy)
- trajectory_queue = AgentManagerQueue("testbrain")
- trainer.subscribe_trajectory_queue(trajectory_queue)
- time_horizon = 15
- trajectory = make_fake_trajectory(
- length=time_horizon,
- observation_shapes=behavior_spec.observation_shapes,
- max_step_complete=True,
- action_spec=behavior_spec.action_spec,
- )
- trajectory_queue.put(trajectory)
- trainer.advance()
-
- # Check that trainer put trajectory in update buffer
- assert trainer.update_buffer.num_experiences == 15
-
- # Check that GAE worked
- assert (
- "advantages" in trainer.update_buffer
- and "discounted_returns" in trainer.update_buffer
- )
-
- # Check that the stats are being collected as episode isn't complete
- for reward in trainer.collected_rewards.values():
- for agent in reward.values():
- assert agent > 0
-
- # Add a terminal trajectory
- trajectory = make_fake_trajectory(
- length=time_horizon + 1,
- max_step_complete=False,
- observation_shapes=behavior_spec.observation_shapes,
- action_spec=behavior_spec.action_spec,
- )
- trajectory_queue.put(trajectory)
- trainer.advance()
-
- # Check that the stats are reset as episode is finished
- for reward in trainer.collected_rewards.values():
- for agent in reward.values():
- assert agent == 0
- assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0
-
-
-@mock.patch.object(RLTrainer, "create_model_saver")
-@mock.patch("mlagents.trainers.ppo.trainer.PPOOptimizer")
-def test_add_get_policy(ppo_optimizer, mock_create_model_saver, dummy_config):
- mock_optimizer = mock.Mock()
- mock_optimizer.reward_signals = {}
- ppo_optimizer.return_value = mock_optimizer
-
- trainer = PPOTrainer("test_policy", 0, dummy_config, True, False, 0, "0")
- policy = mock.Mock(spec=TFPolicy)
- policy.get_current_step.return_value = 2000
-
- behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
- trainer.add_policy(behavior_id, policy)
- assert trainer.get_policy("test_policy") == policy
-
- # Make sure the summary steps were loaded properly
- assert trainer.get_step == 2000
-
-
-if __name__ == "__main__":
- pytest.main()
diff --git a/ml-agents/mlagents/trainers/tests/tensorflow/test_reward_signals.py b/ml-agents/mlagents/trainers/tests/tensorflow/test_reward_signals.py
deleted file mode 100644
index 4b12ee5f4c..0000000000
--- a/ml-agents/mlagents/trainers/tests/tensorflow/test_reward_signals.py
+++ /dev/null
@@ -1,176 +0,0 @@
-import pytest
-
-import mlagents.trainers.tests.mock_brain as mb
-from mlagents.trainers.policy.tf_policy import TFPolicy
-from mlagents.trainers.sac.optimizer_tf import SACOptimizer
-from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
-from mlagents.trainers.tests.dummy_config import ( # noqa: F401; pylint: disable=unused-variable
- ppo_dummy_config,
- sac_dummy_config,
- gail_dummy_config,
- curiosity_dummy_config,
- extrinsic_dummy_config,
- DISCRETE_DEMO_PATH,
- CONTINUOUS_DEMO_PATH,
-)
-from mlagents.trainers.settings import (
- GAILSettings,
- BehavioralCloningSettings,
- NetworkSettings,
- TrainerType,
- RewardSignalType,
-)
-
-
-VECTOR_ACTION_SPACE = 2
-VECTOR_OBS_SPACE = 8
-DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
-BUFFER_INIT_SAMPLES = 20
-BATCH_SIZE = 12
-NUM_AGENTS = 12
-
-
-def create_optimizer_mock(
- trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual
-):
- mock_specs = mb.setup_test_behavior_specs(
- use_discrete,
- use_visual,
- vector_action_space=DISCRETE_ACTION_SPACE
- if use_discrete
- else VECTOR_ACTION_SPACE,
- vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
- )
- trainer_settings = trainer_config
- trainer_settings.reward_signals = reward_signal_config
- trainer_settings.network_settings.memory = (
- NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
- if use_rnn
- else None
- )
- policy = TFPolicy(
- 0, mock_specs, trainer_settings, "test", False, create_tf_graph=False
- )
- if trainer_settings.trainer_type == TrainerType.SAC:
- optimizer = SACOptimizer(policy, trainer_settings)
- else:
- optimizer = PPOOptimizer(policy, trainer_settings)
- optimizer.policy.initialize()
- return optimizer
-
-
-def reward_signal_eval(optimizer, reward_signal_name):
- buffer = mb.simulate_rollout(BATCH_SIZE, optimizer.policy.behavior_spec)
- # Test evaluate
- rsig_result = optimizer.reward_signals[reward_signal_name].evaluate_batch(buffer)
- assert rsig_result.scaled_reward.shape == (BATCH_SIZE,)
- assert rsig_result.unscaled_reward.shape == (BATCH_SIZE,)
-
-
-def reward_signal_update(optimizer, reward_signal_name):
- buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec)
- feed_dict = optimizer.reward_signals[reward_signal_name].prepare_update(
- optimizer.policy, buffer.make_mini_batch(0, 10), 2
- )
- out = optimizer.policy._execute_model(
- feed_dict, optimizer.reward_signals[reward_signal_name].update_dict
- )
- assert type(out) is dict
-
-
-@pytest.mark.parametrize(
- "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
-)
-def test_gail_cc(trainer_config, gail_dummy_config): # noqa: F811
- trainer_config.behavioral_cloning = BehavioralCloningSettings(
- demo_path=CONTINUOUS_DEMO_PATH
- )
- optimizer = create_optimizer_mock(
- trainer_config, gail_dummy_config, False, False, False
- )
- reward_signal_eval(optimizer, "gail")
- reward_signal_update(optimizer, "gail")
-
-
-@pytest.mark.parametrize(
- "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
-)
-def test_gail_dc_visual(trainer_config, gail_dummy_config): # noqa: F811
- gail_dummy_config_discrete = {
- RewardSignalType.GAIL: GAILSettings(demo_path=DISCRETE_DEMO_PATH)
- }
- optimizer = create_optimizer_mock(
- trainer_config, gail_dummy_config_discrete, False, True, True
- )
- reward_signal_eval(optimizer, "gail")
- reward_signal_update(optimizer, "gail")
-
-
-@pytest.mark.parametrize(
- "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
-)
-def test_gail_rnn(trainer_config, gail_dummy_config): # noqa: F811
- policy = create_optimizer_mock(
- trainer_config, gail_dummy_config, True, False, False
- )
- reward_signal_eval(policy, "gail")
- reward_signal_update(policy, "gail")
-
-
-@pytest.mark.parametrize(
- "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
-)
-def test_curiosity_cc(trainer_config, curiosity_dummy_config): # noqa: F811
- policy = create_optimizer_mock(
- trainer_config, curiosity_dummy_config, False, False, False
- )
- reward_signal_eval(policy, "curiosity")
- reward_signal_update(policy, "curiosity")
-
-
-@pytest.mark.parametrize(
- "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
-)
-def test_curiosity_dc(trainer_config, curiosity_dummy_config): # noqa: F811
- policy = create_optimizer_mock(
- trainer_config, curiosity_dummy_config, False, True, False
- )
- reward_signal_eval(policy, "curiosity")
- reward_signal_update(policy, "curiosity")
-
-
-@pytest.mark.parametrize(
- "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
-)
-def test_curiosity_visual(trainer_config, curiosity_dummy_config): # noqa: F811
- policy = create_optimizer_mock(
- trainer_config, curiosity_dummy_config, False, False, True
- )
- reward_signal_eval(policy, "curiosity")
- reward_signal_update(policy, "curiosity")
-
-
-@pytest.mark.parametrize(
- "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
-)
-def test_curiosity_rnn(trainer_config, curiosity_dummy_config): # noqa: F811
- policy = create_optimizer_mock(
- trainer_config, curiosity_dummy_config, True, False, False
- )
- reward_signal_eval(policy, "curiosity")
- reward_signal_update(policy, "curiosity")
-
-
-@pytest.mark.parametrize(
- "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
-)
-def test_extrinsic(trainer_config, extrinsic_dummy_config): # noqa: F811
- policy = create_optimizer_mock(
- trainer_config, extrinsic_dummy_config, False, False, False
- )
- reward_signal_eval(policy, "extrinsic")
- reward_signal_update(policy, "extrinsic")
-
-
-if __name__ == "__main__":
- pytest.main()
diff --git a/ml-agents/mlagents/trainers/tests/tensorflow/test_sac.py b/ml-agents/mlagents/trainers/tests/tensorflow/test_sac.py
deleted file mode 100644
index 82a8a2eca6..0000000000
--- a/ml-agents/mlagents/trainers/tests/tensorflow/test_sac.py
+++ /dev/null
@@ -1,239 +0,0 @@
-import pytest
-from unittest import mock
-import attr
-
-from mlagents.tf_utils import tf
-from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
-
-from mlagents.trainers.trainer.rl_trainer import RLTrainer
-from mlagents.trainers.sac.trainer import SACTrainer
-from mlagents.trainers.sac.optimizer_tf import SACOptimizer
-from mlagents.trainers.policy.tf_policy import TFPolicy
-from mlagents.trainers.agent_processor import AgentManagerQueue
-from mlagents.trainers.tests import mock_brain as mb
-from mlagents.trainers.tests.mock_brain import setup_test_behavior_specs
-from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
-from mlagents.trainers.settings import NetworkSettings, FrameworkType
-from mlagents.trainers.tests.dummy_config import ( # noqa: F401; pylint: disable=unused-variable
- curiosity_dummy_config,
- sac_dummy_config,
-)
-
-
-@pytest.fixture
-def dummy_config():
- return attr.evolve(sac_dummy_config(), framework=FrameworkType.TENSORFLOW)
-
-
-VECTOR_ACTION_SPACE = 2
-VECTOR_OBS_SPACE = 8
-DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
-BUFFER_INIT_SAMPLES = 64
-NUM_AGENTS = 12
-
-
-def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
- mock_brain = mb.setup_test_behavior_specs(
- use_discrete,
- use_visual,
- vector_action_space=DISCRETE_ACTION_SPACE
- if use_discrete
- else VECTOR_ACTION_SPACE,
- vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
- )
- trainer_settings = dummy_config
- trainer_settings.network_settings.memory = (
- NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
- if use_rnn
- else None
- )
- policy = TFPolicy(
- 0, mock_brain, trainer_settings, "test", False, create_tf_graph=False
- )
- optimizer = SACOptimizer(policy, trainer_settings)
- optimizer.policy.initialize()
- return optimizer
-
-
-@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
-@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
-@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
-def test_sac_optimizer_update(dummy_config, rnn, visual, discrete):
- # Test evaluate
- tf.reset_default_graph()
- optimizer = create_sac_optimizer_mock(
- dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
- )
- # Test update
- update_buffer = mb.simulate_rollout(
- BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
- )
- # Mock out reward signal eval
- update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]
- optimizer.update(
- update_buffer,
- num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
- )
-
-
-@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
-def test_sac_update_reward_signals(
- dummy_config, curiosity_dummy_config, discrete # noqa: F811
-):
- # Test evaluate
- tf.reset_default_graph()
- # Add a Curiosity module
- dummy_config.reward_signals = curiosity_dummy_config
- optimizer = create_sac_optimizer_mock(
- dummy_config, use_rnn=False, use_discrete=discrete, use_visual=False
- )
-
- # Test update, while removing PPO-specific buffer elements.
- update_buffer = mb.simulate_rollout(
- BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
- )
-
- # Mock out reward signal eval
- update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]
- update_buffer["curiosity_rewards"] = update_buffer["environment_rewards"]
- optimizer.update_reward_signals(
- {"curiosity": update_buffer}, num_sequences=update_buffer.num_experiences
- )
-
-
-def test_sac_save_load_buffer(tmpdir, dummy_config):
- mock_specs = mb.setup_test_behavior_specs(
- False,
- False,
- vector_action_space=VECTOR_ACTION_SPACE,
- vector_obs_space=VECTOR_OBS_SPACE,
- )
- trainer_params = dummy_config
- trainer_params.hyperparameters.save_replay_buffer = True
- trainer = SACTrainer("test", 1, trainer_params, True, False, 0, "testdir")
- behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
- policy = trainer.create_policy(behavior_id, mock_specs)
- trainer.add_policy(behavior_id, policy)
-
- trainer.update_buffer = mb.simulate_rollout(
- BUFFER_INIT_SAMPLES, policy.behavior_spec
- )
- buffer_len = trainer.update_buffer.num_experiences
- trainer.save_model()
-
- # Wipe Trainer and try to load
- trainer2 = SACTrainer("test", 1, trainer_params, True, True, 0, "testdir")
-
- policy = trainer2.create_policy(behavior_id, mock_specs)
- trainer2.add_policy(behavior_id, policy)
- assert trainer2.update_buffer.num_experiences == buffer_len
-
-
-@mock.patch.object(RLTrainer, "create_model_saver")
-@mock.patch("mlagents.trainers.sac.trainer.SACOptimizer")
-def test_add_get_policy(sac_optimizer, mock_create_model_saver, dummy_config):
- mock_optimizer = mock.Mock()
- mock_optimizer.reward_signals = {}
- sac_optimizer.return_value = mock_optimizer
-
- trainer = SACTrainer("test", 0, dummy_config, True, False, 0, "0")
- policy = mock.Mock(spec=TFPolicy)
- policy.get_current_step.return_value = 2000
- behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
- trainer.add_policy(behavior_id, policy)
- assert trainer.get_policy(behavior_id.behavior_id) == policy
-
- # Make sure the summary steps were loaded properly
- assert trainer.get_step == 2000
-
-
-def test_advance(dummy_config):
- specs = setup_test_behavior_specs(
- use_discrete=False, use_visual=False, vector_action_space=2
- )
- dummy_config.hyperparameters.steps_per_update = 20
- dummy_config.hyperparameters.reward_signal_steps_per_update = 20
- dummy_config.hyperparameters.buffer_init_steps = 0
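-    # With steps_per_update=20 and no buffer_init_steps, a policy update (and a push
-    # to the policy queue) is expected roughly every 20 collected steps.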
- trainer = SACTrainer("test", 0, dummy_config, True, False, 0, "0")
- behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
- policy = trainer.create_policy(behavior_id, specs)
- trainer.add_policy(behavior_id, policy)
-
- trajectory_queue = AgentManagerQueue("testbrain")
- policy_queue = AgentManagerQueue("testbrain")
- trainer.subscribe_trajectory_queue(trajectory_queue)
- trainer.publish_policy_queue(policy_queue)
-
- trajectory = make_fake_trajectory(
- length=15,
- observation_shapes=specs.observation_shapes,
- max_step_complete=True,
- action_spec=specs.action_spec,
- )
- trajectory_queue.put(trajectory)
- trainer.advance()
-
- # Check that trainer put trajectory in update buffer
- assert trainer.update_buffer.num_experiences == 15
-
- # Check that the stats are being collected as episode isn't complete
- for reward in trainer.collected_rewards.values():
- for agent in reward.values():
- assert agent > 0
-
- # Add a terminal trajectory
- trajectory = make_fake_trajectory(
- length=6,
- observation_shapes=specs.observation_shapes,
- max_step_complete=False,
- action_spec=specs.action_spec,
- )
- trajectory_queue.put(trajectory)
- trainer.advance()
-
- # Check that the stats are reset as episode is finished
- for reward in trainer.collected_rewards.values():
- for agent in reward.values():
- assert agent == 0
- assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0
- # Assert we're not just using the default values
- assert (
- trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").mean > 0
- )
-
- # Make sure there is a policy on the queue
- policy_queue.get_nowait()
-
-    # Add another trajectory. The total is still short of the steps needed for a
-    # second update, so there should NOT be a policy on the queue.
- trajectory = make_fake_trajectory(
- length=5,
- observation_shapes=specs.observation_shapes,
- action_spec=specs.action_spec,
- max_step_complete=False,
- )
- trajectory_queue.put(trajectory)
- trainer.advance()
- with pytest.raises(AgentManagerQueue.Empty):
- policy_queue.get_nowait()
-
- # Call add_policy and check that we update the correct number of times.
- # This is to emulate a load from checkpoint.
- behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
- policy = trainer.create_policy(behavior_id, specs)
- policy.get_current_step = lambda: 200
- trainer.add_policy(behavior_id, policy)
- trainer.optimizer.update = mock.Mock()
- trainer.model_saver.initialize_or_load(policy)
- trainer.optimizer.update_reward_signals = mock.Mock()
- trainer.optimizer.update_reward_signals.return_value = {}
- trainer.optimizer.update.return_value = {}
- trajectory_queue.put(trajectory)
- trainer.advance()
- # Make sure we did exactly 1 update
- assert trainer.optimizer.update.call_count == 1
- assert trainer.optimizer.update_reward_signals.call_count == 1
-
-
-if __name__ == "__main__":
- pytest.main()
diff --git a/ml-agents/mlagents/trainers/tests/tensorflow/test_saver.py b/ml-agents/mlagents/trainers/tests/tensorflow/test_saver.py
deleted file mode 100644
index b020d2fc69..0000000000
--- a/ml-agents/mlagents/trainers/tests/tensorflow/test_saver.py
+++ /dev/null
@@ -1,179 +0,0 @@
-import pytest
-from unittest import mock
-import os
-import unittest
-import tempfile
-
-import numpy as np
-from mlagents.tf_utils import tf
-from mlagents.trainers.model_saver.tf_model_saver import TFModelSaver
-from mlagents.trainers import __version__
-from mlagents.trainers.settings import TrainerSettings, NetworkSettings
-from mlagents.trainers.policy.tf_policy import TFPolicy
-from mlagents.trainers.tests import mock_brain as mb
-from mlagents.trainers.tests.tensorflow.test_nn_policy import create_policy_mock
-from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
-from mlagents.trainers.ppo.optimizer_tf import PPOOptimizer
-
-
-def test_register(tmp_path):
- trainer_params = TrainerSettings()
- model_saver = TFModelSaver(trainer_params, tmp_path)
-
- opt = mock.Mock(spec=PPOOptimizer)
- model_saver.register(opt)
- assert model_saver.policy is None
-
- trainer_params = TrainerSettings()
- policy = create_policy_mock(trainer_params)
- model_saver.register(policy)
- assert model_saver.policy is not None
-
-
-class ModelVersionTest(unittest.TestCase):
- def test_version_compare(self):
- # Test write_stats
- with self.assertLogs("mlagents.trainers", level="WARNING") as cm:
- trainer_params = TrainerSettings()
- mock_path = tempfile.mkdtemp()
- policy = create_policy_mock(trainer_params)
- model_saver = TFModelSaver(trainer_params, mock_path)
- model_saver.register(policy)
-
- model_saver._check_model_version(
- "0.0.0"
-            )  # Deliberately not the current version
- # Assert that 1 warning has been thrown with incorrect version
- assert len(cm.output) == 1
- model_saver._check_model_version(
- __version__
- ) # This should be the right version
-            # Assert that no additional warnings have been thrown with the correct version
- assert len(cm.output) == 1
-
-
-def test_load_save(tmp_path):
- path1 = os.path.join(tmp_path, "runid1")
- path2 = os.path.join(tmp_path, "runid2")
- trainer_params = TrainerSettings()
- policy = create_policy_mock(trainer_params)
- model_saver = TFModelSaver(trainer_params, path1)
- model_saver.register(policy)
- model_saver.initialize_or_load(policy)
- policy.set_step(2000)
-
- mock_brain_name = "MockBrain"
- model_saver.save_checkpoint(mock_brain_name, 2000)
- assert len(os.listdir(tmp_path)) > 0
-
- # Try load from this path
- model_saver = TFModelSaver(trainer_params, path1, load=True)
- policy2 = create_policy_mock(trainer_params)
- model_saver.register(policy2)
- model_saver.initialize_or_load(policy2)
- _compare_two_policies(policy, policy2)
- assert policy2.get_current_step() == 2000
-
- # Try initialize from path 1
- trainer_params.init_path = path1
- model_saver = TFModelSaver(trainer_params, path2)
- policy3 = create_policy_mock(trainer_params)
- model_saver.register(policy3)
- model_saver.initialize_or_load(policy3)
-
- _compare_two_policies(policy2, policy3)
- # Assert that the steps are 0.
- assert policy3.get_current_step() == 0
-
-
-def _compare_two_policies(policy1: TFPolicy, policy2: TFPolicy) -> None:
- """
- Make sure two policies have the same output for the same input.
- """
- decision_step, _ = mb.create_steps_from_behavior_spec(
- policy1.behavior_spec, num_agents=1
- )
- run_out1 = policy1.evaluate(decision_step, list(decision_step.agent_id))
- run_out2 = policy2.evaluate(decision_step, list(decision_step.agent_id))
-
- np.testing.assert_array_equal(run_out2["log_probs"], run_out1["log_probs"])
-
-
-@pytest.mark.parametrize("discrete", [True, False], ids=["discrete", "continuous"])
-@pytest.mark.parametrize("visual", [True, False], ids=["visual", "vector"])
-@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
-def test_checkpoint_conversion(tmpdir, rnn, visual, discrete):
- tf.reset_default_graph()
- dummy_config = TrainerSettings()
- model_path = os.path.join(tmpdir, "Mock_Brain")
- policy = create_policy_mock(
- dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
- )
- trainer_params = TrainerSettings()
- model_saver = TFModelSaver(trainer_params, model_path)
- model_saver.register(policy)
- model_saver.save_checkpoint("Mock_Brain", 100)
- assert os.path.isfile(model_path + "/Mock_Brain-100.nn")
-
-
-# This is the normalizer test from test_nn_policy.py but with a load
-def test_normalizer_after_load(tmp_path):
- behavior_spec = mb.setup_test_behavior_specs(
- use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
- )
- time_horizon = 6
- trajectory = make_fake_trajectory(
- length=time_horizon,
- max_step_complete=True,
- observation_shapes=[(1,)],
- action_spec=behavior_spec.action_spec,
- )
- # Change half of the obs to 0
- for i in range(3):
- trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
-
- trainer_params = TrainerSettings(network_settings=NetworkSettings(normalize=True))
- policy = TFPolicy(0, behavior_spec, trainer_params)
-
- trajectory_buffer = trajectory.to_agentbuffer()
- policy.update_normalization(trajectory_buffer["vector_obs"])
-
-    # Check that the running mean and variance are correct
- steps, mean, variance = policy.sess.run(
- [policy.normalization_steps, policy.running_mean, policy.running_variance]
- )
-
- assert steps == 6
- assert mean[0] == 0.5
- assert variance[0] / steps == pytest.approx(0.25, abs=0.01)
- # Save ckpt and load into another policy
- path1 = os.path.join(tmp_path, "runid1")
- model_saver = TFModelSaver(trainer_params, path1)
- model_saver.register(policy)
- mock_brain_name = "MockBrain"
- model_saver.save_checkpoint(mock_brain_name, 6)
- assert len(os.listdir(tmp_path)) > 0
- policy1 = TFPolicy(0, behavior_spec, trainer_params)
- model_saver = TFModelSaver(trainer_params, path1, load=True)
- model_saver.register(policy1)
- model_saver.initialize_or_load(policy1)
-
- # Make another update to new policy, this time with all 1's
- time_horizon = 10
- trajectory = make_fake_trajectory(
- length=time_horizon,
- max_step_complete=True,
- observation_shapes=[(1,)],
- action_spec=behavior_spec.action_spec,
- )
- trajectory_buffer = trajectory.to_agentbuffer()
- policy1.update_normalization(trajectory_buffer["vector_obs"])
-
-    # Check that the running mean and variance are correct
- steps, mean, variance = policy1.sess.run(
- [policy1.normalization_steps, policy1.running_mean, policy1.running_variance]
- )
-
- assert steps == 16
- assert mean[0] == 0.8125
- assert variance[0] / steps == pytest.approx(0.152, abs=0.01)
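The deleted normalizer test asserts specific running statistics (mean 0.5 then 0.8125, variance/steps roughly 0.25 then 0.152). Those constants can be sanity-checked with a few lines of plain NumPy, shown here as an illustrative sketch rather than repository code; the test's variance[0] / steps quantity corresponds to the population variance computed below.

import numpy as np

# First trajectory from the deleted test: three observations of 0 and three of 1.
obs = np.array([0, 0, 0, 1, 1, 1], dtype=np.float32)
assert obs.mean() == 0.5
assert np.isclose(obs.var(), 0.25, atol=0.01)   # matches variance[0] / steps ~= 0.25

# Second trajectory: ten more observations, all 1, for 16 samples in total.
obs = np.concatenate([obs, np.ones(10, dtype=np.float32)])
assert obs.mean() == 0.8125
assert np.isclose(obs.var(), 0.152, atol=0.01)  # matches variance[0] / steps ~= 0.152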
diff --git a/ml-agents/mlagents/trainers/tests/tensorflow/test_simple_rl.py b/ml-agents/mlagents/trainers/tests/tensorflow/test_simple_rl.py
deleted file mode 100644
index 8ba24c50ba..0000000000
--- a/ml-agents/mlagents/trainers/tests/tensorflow/test_simple_rl.py
+++ /dev/null
@@ -1,522 +0,0 @@
-import math
-import tempfile
-import pytest
-import numpy as np
-import attr
-from typing import Dict
-
-from mlagents.trainers.tests.simple_test_envs import (
- SimpleEnvironment,
- MemoryEnvironment,
- RecordEnvironment,
-)
-from mlagents.trainers.trainer_controller import TrainerController
-from mlagents.trainers.trainer import TrainerFactory
-from mlagents.trainers.simple_env_manager import SimpleEnvManager
-from mlagents.trainers.demo_loader import write_demo
-from mlagents.trainers.stats import StatsReporter, StatsWriter, StatsSummary
-from mlagents.trainers.settings import (
- NetworkSettings,
- SelfPlaySettings,
- BehavioralCloningSettings,
- GAILSettings,
- RewardSignalType,
- EncoderType,
- FrameworkType,
-)
-from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
-from mlagents_envs.side_channel.environment_parameters_channel import (
- EnvironmentParametersChannel,
-)
-from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
- DemonstrationMetaProto,
-)
-from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
-from mlagents_envs.communicator_objects.space_type_pb2 import discrete, continuous
-
-from mlagents.trainers.tests.dummy_config import ppo_dummy_config, sac_dummy_config
-
-PPO_TF_CONFIG = attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW)
-SAC_TF_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.TENSORFLOW)
-
-BRAIN_NAME = "1D"
-
-
-# The reward processor is passed as an argument to _check_environment_trains.
-# It is applied to the list of all final rewards for each brain individually.
-# This is so that we can process all final rewards in different ways for different algorithms.
-# Custom reward processors should be built within the test function and passed to _check_environment_trains
-# Default is average over the last 5 final rewards
-def default_reward_processor(rewards, last_n_rewards=5):
- rewards_to_use = rewards[-last_n_rewards:]
- # For debugging tests
- print(f"Last {last_n_rewards} rewards:", rewards_to_use)
- return np.array(rewards[-last_n_rewards:], dtype=np.float32).mean()
-
-
-class DebugWriter(StatsWriter):
- """
- Print to stdout so stats can be viewed in pytest
- """
-
- def __init__(self):
- self._last_reward_summary: Dict[str, float] = {}
-
- def get_last_rewards(self):
- return self._last_reward_summary
-
- def write_stats(
- self, category: str, values: Dict[str, StatsSummary], step: int
- ) -> None:
- for val, stats_summary in values.items():
- if val == "Environment/Cumulative Reward":
- print(step, val, stats_summary.mean)
- self._last_reward_summary[category] = stats_summary.mean
-
-
-def _check_environment_trains(
- env,
- trainer_config,
- reward_processor=default_reward_processor,
- env_parameter_manager=None,
- success_threshold=0.9,
- env_manager=None,
-):
- if env_parameter_manager is None:
- env_parameter_manager = EnvironmentParameterManager()
- # Create controller and begin training.
- with tempfile.TemporaryDirectory() as dir:
- run_id = "id"
- seed = 1337
- StatsReporter.writers.clear() # Clear StatsReporters so we don't write to file
- debug_writer = DebugWriter()
- StatsReporter.add_writer(debug_writer)
- if env_manager is None:
- env_manager = SimpleEnvManager(env, EnvironmentParametersChannel())
- trainer_factory = TrainerFactory(
- trainer_config=trainer_config,
- output_path=dir,
- train_model=True,
- load_model=False,
- seed=seed,
- param_manager=env_parameter_manager,
- multi_gpu=False,
- )
-
- tc = TrainerController(
- trainer_factory=trainer_factory,
- output_path=dir,
- run_id=run_id,
- param_manager=env_parameter_manager,
- train=True,
- training_seed=seed,
- )
-
- # Begin training
- tc.start_learning(env_manager)
- if (
- success_threshold is not None
- ): # For tests where we are just checking setup and not reward
- processed_rewards = [
- reward_processor(rewards) for rewards in env.final_rewards.values()
- ]
- assert all(not math.isnan(reward) for reward in processed_rewards)
- assert all(reward > success_threshold for reward in processed_rewards)
-
-
-@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
-def test_simple_ppo(action_sizes):
- env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes)
- config = attr.evolve(PPO_TF_CONFIG, framework=FrameworkType.TENSORFLOW)
- _check_environment_trains(env, {BRAIN_NAME: config})
-
-
-@pytest.mark.parametrize("action_sizes", [(0, 2), (2, 0)])
-def test_2d_ppo(action_sizes):
- env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.8)
- new_hyperparams = attr.evolve(
- PPO_TF_CONFIG.hyperparameters, batch_size=64, buffer_size=640
- )
- config = attr.evolve(
- PPO_TF_CONFIG,
- hyperparameters=new_hyperparams,
- max_steps=10000,
- framework=FrameworkType.TENSORFLOW,
- )
- _check_environment_trains(env, {BRAIN_NAME: config})
-
-
-@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
-@pytest.mark.parametrize("num_visual", [1, 2])
-def test_visual_ppo(num_visual, action_sizes):
- env = SimpleEnvironment(
- [BRAIN_NAME],
- action_sizes=action_sizes,
- num_visual=num_visual,
- num_vector=0,
- step_size=0.2,
- )
- new_hyperparams = attr.evolve(PPO_TF_CONFIG.hyperparameters, learning_rate=3.0e-4)
- config = attr.evolve(
- PPO_TF_CONFIG,
- hyperparameters=new_hyperparams,
- framework=FrameworkType.TENSORFLOW,
- )
- _check_environment_trains(env, {BRAIN_NAME: config})
-
-
-@pytest.mark.parametrize("num_visual", [1, 2])
-@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
-def test_visual_advanced_ppo(vis_encode_type, num_visual):
- env = SimpleEnvironment(
- [BRAIN_NAME],
- action_sizes=(0, 1),
- num_visual=num_visual,
- num_vector=0,
- step_size=0.5,
- vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
- )
- new_networksettings = attr.evolve(
- SAC_TF_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
- )
- new_hyperparams = attr.evolve(PPO_TF_CONFIG.hyperparameters, learning_rate=3.0e-4)
- config = attr.evolve(
- PPO_TF_CONFIG,
- hyperparameters=new_hyperparams,
- network_settings=new_networksettings,
- max_steps=400,
- summary_freq=100,
- framework=FrameworkType.TENSORFLOW,
- )
- # The number of steps is pretty small for these encoders
- _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
-
-
-@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
-def test_recurrent_ppo(action_sizes):
- env = MemoryEnvironment([BRAIN_NAME], action_sizes=action_sizes)
- new_network_settings = attr.evolve(
- PPO_TF_CONFIG.network_settings,
- memory=NetworkSettings.MemorySettings(memory_size=16),
- )
- new_hyperparams = attr.evolve(
- PPO_TF_CONFIG.hyperparameters,
- learning_rate=1.0e-3,
- batch_size=64,
- buffer_size=128,
- )
- config = attr.evolve(
- PPO_TF_CONFIG,
- hyperparameters=new_hyperparams,
- network_settings=new_network_settings,
- max_steps=5000,
- framework=FrameworkType.TENSORFLOW,
- )
- _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
-
-
-@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
-def test_simple_sac(action_sizes):
- env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes)
- config = attr.evolve(
- SAC_TF_CONFIG, framework=FrameworkType.TENSORFLOW, max_steps=900
- )
- _check_environment_trains(env, {BRAIN_NAME: config})
-
-
-@pytest.mark.parametrize("action_sizes", [(0, 2), (2, 0)])
-def test_2d_sac(action_sizes):
- env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.8)
- new_hyperparams = attr.evolve(SAC_TF_CONFIG.hyperparameters, buffer_init_steps=2000)
- config = attr.evolve(
- SAC_TF_CONFIG,
- hyperparameters=new_hyperparams,
- max_steps=10000,
- framework=FrameworkType.TENSORFLOW,
- )
- _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)
-
-
-@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
-@pytest.mark.parametrize("num_visual", [1, 2])
-def test_visual_sac(num_visual, action_sizes):
- env = SimpleEnvironment(
- [BRAIN_NAME],
- action_sizes=action_sizes,
- num_visual=num_visual,
- num_vector=0,
- step_size=0.2,
- )
- new_hyperparams = attr.evolve(
- SAC_TF_CONFIG.hyperparameters, batch_size=16, learning_rate=3e-4
- )
- config = attr.evolve(
- SAC_TF_CONFIG,
- hyperparameters=new_hyperparams,
- framework=FrameworkType.TENSORFLOW,
- )
- _check_environment_trains(env, {BRAIN_NAME: config})
-
-
-@pytest.mark.parametrize("num_visual", [1, 2])
-@pytest.mark.parametrize("vis_encode_type", ["resnet", "nature_cnn", "match3"])
-def test_visual_advanced_sac(vis_encode_type, num_visual):
- env = SimpleEnvironment(
- [BRAIN_NAME],
- action_sizes=(0, 1),
- num_visual=num_visual,
- num_vector=0,
- step_size=0.5,
- vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
- )
- new_networksettings = attr.evolve(
- SAC_TF_CONFIG.network_settings, vis_encode_type=EncoderType(vis_encode_type)
- )
- new_hyperparams = attr.evolve(
- SAC_TF_CONFIG.hyperparameters,
- batch_size=16,
- learning_rate=3e-4,
- buffer_init_steps=0,
- )
- config = attr.evolve(
- SAC_TF_CONFIG,
- hyperparameters=new_hyperparams,
- network_settings=new_networksettings,
- max_steps=200,
- framework=FrameworkType.TENSORFLOW,
- )
- # The number of steps is pretty small for these encoders
- _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)
-
-
-@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
-def test_recurrent_sac(action_sizes):
- step_size = 0.2 if action_sizes == (0, 1) else 0.5
- env = MemoryEnvironment(
- [BRAIN_NAME], action_sizes=action_sizes, step_size=step_size
- )
- new_networksettings = attr.evolve(
- SAC_TF_CONFIG.network_settings,
- memory=NetworkSettings.MemorySettings(memory_size=16),
- )
- new_hyperparams = attr.evolve(
- SAC_TF_CONFIG.hyperparameters,
- batch_size=128,
- learning_rate=1e-3,
- buffer_init_steps=1000,
- steps_per_update=2,
- )
- config = attr.evolve(
- SAC_TF_CONFIG,
- hyperparameters=new_hyperparams,
- network_settings=new_networksettings,
- max_steps=4000,
- framework=FrameworkType.TENSORFLOW,
- )
- _check_environment_trains(env, {BRAIN_NAME: config})
-
-
-@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
-def test_simple_ghost(action_sizes):
- env = SimpleEnvironment(
- [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], action_sizes=action_sizes
- )
- self_play_settings = SelfPlaySettings(
- play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=2000
- )
- config = attr.evolve(
- PPO_TF_CONFIG,
- self_play=self_play_settings,
- max_steps=2500,
- framework=FrameworkType.TENSORFLOW,
- )
- _check_environment_trains(env, {BRAIN_NAME: config})
-
-
-@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
-def test_simple_ghost_fails(action_sizes):
- env = SimpleEnvironment(
- [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], action_sizes=action_sizes
- )
- # This config should fail because the ghosted policy is never swapped with a competent policy.
- # Swap occurs after max step is reached.
- self_play_settings = SelfPlaySettings(
- play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=4000
- )
- config = attr.evolve(
- PPO_TF_CONFIG,
- self_play=self_play_settings,
- max_steps=2500,
- framework=FrameworkType.TENSORFLOW,
- )
- _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=None)
- processed_rewards = [
- default_reward_processor(rewards) for rewards in env.final_rewards.values()
- ]
- success_threshold = 0.9
- assert any(reward > success_threshold for reward in processed_rewards) and any(
- reward < success_threshold for reward in processed_rewards
- )
-
-
-@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
-def test_simple_asymm_ghost(action_sizes):
- # Make opponent for asymmetric case
- brain_name_opp = BRAIN_NAME + "Opp"
- env = SimpleEnvironment(
- [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], action_sizes=action_sizes
- )
- self_play_settings = SelfPlaySettings(
- play_against_latest_model_ratio=1.0,
- save_steps=10000,
- swap_steps=10000,
- team_change=400,
- )
- config = attr.evolve(
- PPO_TF_CONFIG,
- self_play=self_play_settings,
- max_steps=4000,
- framework=FrameworkType.TENSORFLOW,
- )
- _check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})
-
-
-@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
-def test_simple_asymm_ghost_fails(action_sizes):
- # Make opponent for asymmetric case
- brain_name_opp = BRAIN_NAME + "Opp"
- env = SimpleEnvironment(
- [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], action_sizes=action_sizes
- )
-    # This config should fail because the team that is not learning when both have reached
-    # max step should be executing the initial, untrained policy.
- self_play_settings = SelfPlaySettings(
- play_against_latest_model_ratio=0.0,
- save_steps=5000,
- swap_steps=5000,
- team_change=2000,
- )
- config = attr.evolve(
- PPO_TF_CONFIG,
- self_play=self_play_settings,
- max_steps=3000,
- framework=FrameworkType.TENSORFLOW,
- )
- _check_environment_trains(
- env, {BRAIN_NAME: config, brain_name_opp: config}, success_threshold=None
- )
- processed_rewards = [
- default_reward_processor(rewards) for rewards in env.final_rewards.values()
- ]
- success_threshold = 0.9
- assert any(reward > success_threshold for reward in processed_rewards) and any(
- reward < success_threshold for reward in processed_rewards
- )
-
-
-@pytest.fixture(scope="session")
-def simple_record(tmpdir_factory):
- def record_demo(action_sizes, num_visual=0, num_vector=1):
- env = RecordEnvironment(
- [BRAIN_NAME],
- action_sizes=action_sizes,
- num_visual=num_visual,
- num_vector=num_vector,
- n_demos=100,
- )
- # If we want to use true demos, we can solve the env in the usual way
- # Otherwise, we can just call solve to execute the optimal policy
- env.solve()
- continuous_size, discrete_size = action_sizes
- use_discrete = True if discrete_size > 0 else False
- agent_info_protos = env.demonstration_protos[BRAIN_NAME]
- meta_data_proto = DemonstrationMetaProto()
- brain_param_proto = BrainParametersProto(
- vector_action_size_deprecated=[2] if use_discrete else [1],
- vector_action_descriptions_deprecated=[""],
- vector_action_space_type_deprecated=discrete
- if use_discrete
- else continuous,
- brain_name=BRAIN_NAME,
- is_training=True,
- )
- action_type = "Discrete" if use_discrete else "Continuous"
- demo_path_name = "1DTest" + action_type + ".demo"
- demo_path = str(tmpdir_factory.mktemp("tmp_demo").join(demo_path_name))
- write_demo(demo_path, meta_data_proto, brain_param_proto, agent_info_protos)
- return demo_path
-
- return record_demo
-
-
-@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
-@pytest.mark.parametrize("trainer_config", [PPO_TF_CONFIG, SAC_TF_CONFIG])
-def test_gail(simple_record, action_sizes, trainer_config):
- demo_path = simple_record(action_sizes)
- env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.2)
- bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
- reward_signals = {
- RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
- }
- config = attr.evolve(
- trainer_config,
- reward_signals=reward_signals,
- behavioral_cloning=bc_settings,
- max_steps=500,
- framework=FrameworkType.TENSORFLOW,
- )
- _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
-
-
-@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
-def test_gail_visual_ppo(simple_record, action_sizes):
- demo_path = simple_record(action_sizes, num_visual=1, num_vector=0)
- env = SimpleEnvironment(
- [BRAIN_NAME],
- num_visual=1,
- num_vector=0,
- action_sizes=action_sizes,
- step_size=0.2,
- )
- bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1500)
- reward_signals = {
- RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
- }
- hyperparams = attr.evolve(PPO_TF_CONFIG.hyperparameters, learning_rate=3e-4)
- config = attr.evolve(
- PPO_TF_CONFIG,
- reward_signals=reward_signals,
- hyperparameters=hyperparams,
- behavioral_cloning=bc_settings,
- max_steps=1000,
- framework=FrameworkType.TENSORFLOW,
- )
- _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
-
-
-@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
-def test_gail_visual_sac(simple_record, action_sizes):
- demo_path = simple_record(action_sizes, num_visual=1, num_vector=0)
- env = SimpleEnvironment(
- [BRAIN_NAME],
- num_visual=1,
- num_vector=0,
- action_sizes=action_sizes,
- step_size=0.2,
- )
- bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
- reward_signals = {
- RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
- }
- hyperparams = attr.evolve(
- SAC_TF_CONFIG.hyperparameters, learning_rate=3e-4, batch_size=16
- )
- config = attr.evolve(
- SAC_TF_CONFIG,
- reward_signals=reward_signals,
- hyperparameters=hyperparams,
- behavioral_cloning=bc_settings,
- max_steps=500,
- framework=FrameworkType.TENSORFLOW,
- )
- _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
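The deleted harness above reduces each brain's final rewards with default_reward_processor (mean of the last five) and requires every processed value to clear success_threshold. A minimal standalone sketch of that acceptance check, with hypothetical sample rewards and none of the trainer machinery:

import math
import numpy as np

def default_reward_processor(rewards, last_n_rewards=5):
    # Mean of the last N final rewards, as in the deleted test helper.
    return float(np.array(rewards[-last_n_rewards:], dtype=np.float32).mean())

def environment_trained(final_rewards, success_threshold=0.9):
    # final_rewards maps brain name -> list of final episode rewards (sample data below is made up).
    processed = [default_reward_processor(r) for r in final_rewards.values()]
    return all(not math.isnan(r) for r in processed) and all(
        r > success_threshold for r in processed
    )

print(environment_trained({"1D": [0.2, 0.5, 0.95, 0.97, 0.99, 1.0, 0.98]}))  # True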
diff --git a/ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py b/ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py
deleted file mode 100644
index acd3058f35..0000000000
--- a/ml-agents/mlagents/trainers/tests/tensorflow/test_tf_policy.py
+++ /dev/null
@@ -1,77 +0,0 @@
-from mlagents.trainers.policy.tf_policy import TFPolicy
-from mlagents_envs.base_env import DecisionSteps, BehaviorSpec
-from mlagents.trainers.action_info import ActionInfo
-from unittest.mock import MagicMock
-from mlagents.trainers.settings import TrainerSettings
-import numpy as np
-from mlagents_envs.base_env import ActionSpec
-
-
-def basic_behavior_spec():
- dummy_actionspec = ActionSpec.create_continuous(1)
- dummy_groupspec = BehaviorSpec([(1,)], dummy_actionspec)
- return dummy_groupspec
-
-
-class FakePolicy(TFPolicy):
- def create_tf_graph(self):
- pass
-
- def get_trainable_variables(self):
- return []
-
-
-def test_take_action_returns_empty_with_no_agents():
- test_seed = 3
- behavior_spec = basic_behavior_spec()
- policy = FakePolicy(test_seed, behavior_spec, TrainerSettings(), "output")
- no_agent_step = DecisionSteps.empty(behavior_spec)
- result = policy.get_action(no_agent_step)
- assert result == ActionInfo.empty()
-
-
-def test_take_action_returns_nones_on_missing_values():
- test_seed = 3
- behavior_spec = basic_behavior_spec()
- policy = FakePolicy(test_seed, behavior_spec, TrainerSettings(), "output")
- policy.evaluate = MagicMock(return_value={})
- policy.save_memories = MagicMock()
- step_with_agents = DecisionSteps(
- [], np.array([], dtype=np.float32), np.array([0]), None
- )
- result = policy.get_action(step_with_agents, worker_id=0)
- assert result == ActionInfo(None, None, None, {}, [0])
-
-
-def test_take_action_returns_action_info_when_available():
- test_seed = 3
- behavior_spec = basic_behavior_spec()
- policy = FakePolicy(test_seed, behavior_spec, TrainerSettings(), "output")
- policy_eval_out = {
- "action": np.array([[1.0]], dtype=np.float32),
- "pre_action": np.array([[1.0]], dtype=np.float32),
- "memory_out": np.array([[2.5]], dtype=np.float32),
- "value": np.array([1.1], dtype=np.float32),
- }
- policy.evaluate = MagicMock(return_value=policy_eval_out)
- step_with_agents = DecisionSteps(
- [], np.array([], dtype=np.float32), np.array([0]), None
- )
- result = policy.get_action(step_with_agents)
- print(result)
- expected = ActionInfo(
- policy_eval_out["action"],
- policy_eval_out["env_action"],
- policy_eval_out["value"],
- policy_eval_out,
- [0],
- )
- assert result == expected
-
-
-def test_convert_version_string():
- result = TFPolicy._convert_version_string("200.300.100")
- assert result == (200, 300, 100)
- # Test dev versions
- result = TFPolicy._convert_version_string("200.300.100.dev0")
- assert result == (200, 300, 100)
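The last deleted test pins down the behavior of TFPolicy._convert_version_string: leading numeric components are parsed and a trailing dev suffix is dropped. A standalone re-implementation with the same behavior (a sketch, not the repository's code):

from typing import Tuple

def convert_version_string(version: str) -> Tuple[int, ...]:
    # Keep only the leading numeric components, so "200.300.100.dev0" -> (200, 300, 100).
    parts = []
    for piece in version.split("."):
        if not piece.isdigit():
            break
        parts.append(int(piece))
    return tuple(parts)

assert convert_version_string("200.300.100") == (200, 300, 100)
assert convert_version_string("200.300.100.dev0") == (200, 300, 100)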
diff --git a/ml-agents/mlagents/trainers/tests/test_rl_trainer.py b/ml-agents/mlagents/trainers/tests/test_rl_trainer.py
index a5beec5a2d..918a9d04e4 100644
--- a/ml-agents/mlagents/trainers/tests/test_rl_trainer.py
+++ b/ml-agents/mlagents/trainers/tests/test_rl_trainer.py
@@ -6,7 +6,7 @@
from mlagents.trainers.trainer.rl_trainer import RLTrainer
from mlagents.trainers.tests.test_buffer import construct_fake_buffer
from mlagents.trainers.agent_processor import AgentManagerQueue
-from mlagents.trainers.settings import TrainerSettings, FrameworkType
+from mlagents.trainers.settings import TrainerSettings
from mlagents_envs.base_env import ActionSpec
@@ -45,12 +45,10 @@ def _process_trajectory(self, trajectory):
super()._process_trajectory(trajectory)
-def create_rl_trainer(framework=FrameworkType.TENSORFLOW):
+def create_rl_trainer():
trainer = FakeTrainer(
"test_trainer",
- TrainerSettings(
- max_steps=100, checkpoint_interval=10, summary_freq=20, framework=framework
- ),
+ TrainerSettings(max_steps=100, checkpoint_interval=10, summary_freq=20),
True,
False,
"mock_model_path",
@@ -124,15 +122,12 @@ def test_advance(mocked_clear_update_buffer, mocked_save_model):
assert mocked_save_model.call_count == 0
-@pytest.mark.parametrize(
- "framework", [FrameworkType.TENSORFLOW, FrameworkType.PYTORCH], ids=["tf", "torch"]
-)
@mock.patch("mlagents.trainers.trainer.trainer.StatsReporter.write_stats")
@mock.patch(
"mlagents.trainers.trainer.rl_trainer.ModelCheckpointManager.add_checkpoint"
)
-def test_summary_checkpoint(mock_add_checkpoint, mock_write_summary, framework):
- trainer = create_rl_trainer(framework)
+def test_summary_checkpoint(mock_add_checkpoint, mock_write_summary):
+ trainer = create_rl_trainer()
mock_policy = mock.Mock()
trainer.add_policy("TestBrain", mock_policy)
trajectory_queue = AgentManagerQueue("testbrain")
@@ -169,7 +164,7 @@ def test_summary_checkpoint(mock_add_checkpoint, mock_write_summary, framework):
calls = [mock.call(trainer.brain_name, step) for step in checkpoint_range]
trainer.model_saver.save_checkpoint.assert_has_calls(calls, any_order=True)
- export_ext = "nn" if trainer.framework == FrameworkType.TENSORFLOW else "onnx"
+ export_ext = "onnx"
add_checkpoint_calls = [
mock.call(
diff --git a/ml-agents/mlagents/trainers/tests/test_trainer_controller.py b/ml-agents/mlagents/trainers/tests/test_trainer_controller.py
index 1ff0fe7a8c..578b852f59 100644
--- a/ml-agents/mlagents/trainers/tests/test_trainer_controller.py
+++ b/ml-agents/mlagents/trainers/tests/test_trainer_controller.py
@@ -1,7 +1,7 @@
from unittest.mock import MagicMock, patch
import pytest
+from mlagents.torch_utils import torch
-from mlagents.tf_utils import tf
from mlagents.trainers.trainer_controller import TrainerController
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
from mlagents.trainers.ghost.controller import GhostController
@@ -22,8 +22,8 @@ def basic_trainer_controller():
@patch("numpy.random.seed")
-@patch.object(tf, "set_random_seed")
-def test_initialization_seed(numpy_random_seed, tensorflow_set_seed):
+@patch.object(torch, "manual_seed")
+def test_initialization_seed(numpy_random_seed, torch_set_seed):
seed = 27
trainer_factory_mock = MagicMock()
trainer_factory_mock.ghost_controller = GhostController()
@@ -36,7 +36,7 @@ def test_initialization_seed(numpy_random_seed, tensorflow_set_seed):
training_seed=seed,
)
numpy_random_seed.assert_called_with(seed)
- tensorflow_set_seed.assert_called_with(seed)
+ torch_set_seed.assert_called_with(seed)
@pytest.fixture
@@ -70,33 +70,27 @@ def take_step_sideeffect(env):
return tc, trainer_mock
-@patch.object(tf, "reset_default_graph")
def test_start_learning_trains_forever_if_no_train_model(
- tf_reset_graph, trainer_controller_with_start_learning_mocks
+ trainer_controller_with_start_learning_mocks
):
tc, trainer_mock = trainer_controller_with_start_learning_mocks
tc.train_model = False
- tf_reset_graph.return_value = None
-
env_mock = MagicMock()
env_mock.close = MagicMock()
env_mock.reset = MagicMock()
env_mock.training_behaviors = MagicMock()
tc.start_learning(env_mock)
- tf_reset_graph.assert_called_once()
env_mock.reset.assert_called_once()
assert tc.advance.call_count == 11
tc._save_models.assert_not_called()
-@patch.object(tf, "reset_default_graph")
def test_start_learning_trains_until_max_steps_then_saves(
- tf_reset_graph, trainer_controller_with_start_learning_mocks
+ trainer_controller_with_start_learning_mocks
):
tc, trainer_mock = trainer_controller_with_start_learning_mocks
- tf_reset_graph.return_value = None
brain_info_mock = MagicMock()
env_mock = MagicMock()
@@ -105,7 +99,6 @@ def test_start_learning_trains_until_max_steps_then_saves(
env_mock.training_behaviors = MagicMock()
tc.start_learning(env_mock)
- tf_reset_graph.assert_called_once()
env_mock.reset.assert_called_once()
assert tc.advance.call_count == trainer_mock.get_max_steps + 1
tc._save_models.assert_called_once()
diff --git a/ml-agents/mlagents/trainers/tests/test_training_status.py b/ml-agents/mlagents/trainers/tests/test_training_status.py
index db9992fe1c..32028befdb 100644
--- a/ml-agents/mlagents/trainers/tests/test_training_status.py
+++ b/ml-agents/mlagents/trainers/tests/test_training_status.py
@@ -117,8 +117,8 @@ def test_metadata_compare(self):
version_statsmetadata = StatusMetaData(mlagents_version="test")
default_metadata.check_compatibility(version_statsmetadata)
- tf_version_statsmetadata = StatusMetaData(tensorflow_version="test")
- default_metadata.check_compatibility(tf_version_statsmetadata)
+ torch_version_statsmetadata = StatusMetaData(torch_version="test")
+ default_metadata.check_compatibility(torch_version_statsmetadata)
# Assert that 2 warnings have been thrown
assert len(cm.output) == 2
diff --git a/ml-agents/mlagents/trainers/tests/torch/test_ghost.py b/ml-agents/mlagents/trainers/tests/torch/test_ghost.py
index f80a25095b..8a37bca613 100644
--- a/ml-agents/mlagents/trainers/tests/torch/test_ghost.py
+++ b/ml-agents/mlagents/trainers/tests/torch/test_ghost.py
@@ -9,14 +9,12 @@
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
-from mlagents.trainers.settings import TrainerSettings, SelfPlaySettings, FrameworkType
+from mlagents.trainers.settings import TrainerSettings, SelfPlaySettings
@pytest.fixture
def dummy_config():
- return TrainerSettings(
- self_play=SelfPlaySettings(), framework=FrameworkType.PYTORCH
- )
+ return TrainerSettings(self_play=SelfPlaySettings())
VECTOR_ACTION_SPACE = 1
diff --git a/ml-agents/mlagents/trainers/tests/torch/test_hybrid.py b/ml-agents/mlagents/trainers/tests/torch/test_hybrid.py
index b7a1ee4f6d..9ae20034c9 100644
--- a/ml-agents/mlagents/trainers/tests/torch/test_hybrid.py
+++ b/ml-agents/mlagents/trainers/tests/torch/test_hybrid.py
@@ -7,15 +7,15 @@
MemoryEnvironment,
)
-from mlagents.trainers.settings import NetworkSettings, FrameworkType
+from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.tests.dummy_config import ppo_dummy_config, sac_dummy_config
from mlagents.trainers.tests.check_env_trains import check_environment_trains
BRAIN_NAME = "1D"
-PPO_TORCH_CONFIG = attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
-SAC_TORCH_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.PYTORCH)
+PPO_TORCH_CONFIG = ppo_dummy_config()
+SAC_TORCH_CONFIG = sac_dummy_config()
@pytest.mark.parametrize("action_size", [(1, 1), (2, 2), (1, 2), (2, 1)])
diff --git a/ml-agents/mlagents/trainers/tests/torch/test_ppo.py b/ml-agents/mlagents/trainers/tests/torch/test_ppo.py
index cda6e61aad..a3b196364a 100644
--- a/ml-agents/mlagents/trainers/tests/torch/test_ppo.py
+++ b/ml-agents/mlagents/trainers/tests/torch/test_ppo.py
@@ -1,14 +1,13 @@
import pytest
import numpy as np
-from mlagents.tf_utils import tf
import attr
from mlagents.trainers.ppo.optimizer_torch import TorchPPOOptimizer
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.tests import mock_brain as mb
from mlagents.trainers.tests.test_trajectory import make_fake_trajectory
-from mlagents.trainers.settings import NetworkSettings, FrameworkType
+from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.tests.dummy_config import ( # noqa: F401; pylint: disable=unused-variable
ppo_dummy_config,
curiosity_dummy_config,
@@ -20,7 +19,7 @@
@pytest.fixture
def dummy_config():
- return attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
+ return ppo_dummy_config()
VECTOR_ACTION_SPACE = 2
@@ -59,7 +58,6 @@ def create_test_ppo_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
@pytest.mark.parametrize("rnn", [True, False], ids=["rnn", "no_rnn"])
def test_ppo_optimizer_update(dummy_config, rnn, visual, discrete):
# Test evaluate
- tf.reset_default_graph()
optimizer = create_test_ppo_optimizer(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
)
@@ -98,7 +96,6 @@ def test_ppo_optimizer_update_curiosity(
dummy_config, curiosity_dummy_config, rnn, visual, discrete # noqa: F811
):
# Test evaluate
- tf.reset_default_graph()
dummy_config.reward_signals = curiosity_dummy_config
optimizer = create_test_ppo_optimizer(
dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
@@ -125,7 +122,7 @@ def test_ppo_optimizer_update_curiosity(
def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config): # noqa: F811
# Test evaluate
dummy_config.reward_signals = gail_dummy_config
- config = attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
+ config = ppo_dummy_config()
optimizer = create_test_ppo_optimizer(
config, use_rnn=False, use_discrete=False, use_visual=False
)
diff --git a/ml-agents/mlagents/trainers/tests/torch/test_sac.py b/ml-agents/mlagents/trainers/tests/torch/test_sac.py
index 84fda15238..7f50514428 100644
--- a/ml-agents/mlagents/trainers/tests/torch/test_sac.py
+++ b/ml-agents/mlagents/trainers/tests/torch/test_sac.py
@@ -1,11 +1,10 @@
import pytest
from mlagents.torch_utils import torch
-import attr
from mlagents.trainers.sac.optimizer_torch import TorchSACOptimizer
from mlagents.trainers.policy.torch_policy import TorchPolicy
from mlagents.trainers.tests import mock_brain as mb
-from mlagents.trainers.settings import NetworkSettings, FrameworkType
+from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.tests.dummy_config import ( # noqa: F401; pylint: disable=unused-variable
sac_dummy_config,
curiosity_dummy_config,
@@ -14,7 +13,7 @@
@pytest.fixture
def dummy_config():
- return attr.evolve(sac_dummy_config(), framework=FrameworkType.PYTORCH)
+ return sac_dummy_config()
VECTOR_ACTION_SPACE = 2
diff --git a/ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py b/ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py
index 3ba26acc79..0d1751dd08 100644
--- a/ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py
+++ b/ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py
@@ -17,7 +17,6 @@
GAILSettings,
RewardSignalType,
EncoderType,
- FrameworkType,
)
from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
@@ -36,8 +35,8 @@
BRAIN_NAME = "1D"
-PPO_TORCH_CONFIG = attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
-SAC_TORCH_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.PYTORCH)
+PPO_TORCH_CONFIG = ppo_dummy_config()
+SAC_TORCH_CONFIG = sac_dummy_config()
@pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
diff --git a/ml-agents/mlagents/trainers/tf/__init__.py b/ml-agents/mlagents/trainers/tf/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/ml-agents/mlagents/trainers/tf/components/__init__.py b/ml-agents/mlagents/trainers/tf/components/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/ml-agents/mlagents/trainers/tf/components/bc/__init__.py b/ml-agents/mlagents/trainers/tf/components/bc/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/ml-agents/mlagents/trainers/tf/components/bc/model.py b/ml-agents/mlagents/trainers/tf/components/bc/model.py
deleted file mode 100644
index 501841ed55..0000000000
--- a/ml-agents/mlagents/trainers/tf/components/bc/model.py
+++ /dev/null
@@ -1,75 +0,0 @@
-from mlagents.tf_utils import tf
-
-from mlagents.trainers.policy.tf_policy import TFPolicy
-
-
-class BCModel:
- def __init__(
- self, policy: TFPolicy, learning_rate: float = 3e-4, anneal_steps: int = 0
- ):
- """
- Tensorflow operations to perform Behavioral Cloning on a Policy model
- :param policy: The policy of the learning algorithm
-        :param learning_rate: The initial learning rate for behavioral cloning
- :param anneal_steps: Number of steps over which to anneal BC training
- """
- self.policy = policy
- self.expert_visual_in = self.policy.visual_in
- self.obs_in_expert = self.policy.vector_in
- self.make_inputs()
- self.create_loss(learning_rate, anneal_steps)
-
- def make_inputs(self) -> None:
- """
- Creates the input layers for the discriminator
- """
- self.done_expert = tf.placeholder(shape=[None, 1], dtype=tf.float32)
- self.done_policy = tf.placeholder(shape=[None, 1], dtype=tf.float32)
-
- if self.policy.behavior_spec.action_spec.is_continuous():
- action_length = self.policy.act_size[0]
- self.action_in_expert = tf.placeholder(
- shape=[None, action_length], dtype=tf.float32
- )
- self.expert_action = tf.identity(self.action_in_expert)
- else:
- action_length = len(self.policy.act_size)
- self.action_in_expert = tf.placeholder(
- shape=[None, action_length], dtype=tf.int32
- )
- self.expert_action = tf.concat(
- [
- tf.one_hot(self.action_in_expert[:, i], act_size)
- for i, act_size in enumerate(self.policy.act_size)
- ],
- axis=1,
- )
-
- def create_loss(self, learning_rate: float, anneal_steps: int) -> None:
- """
- Creates the loss and update nodes for the BC module
- :param learning_rate: The learning rate for the optimizer
- :param anneal_steps: Number of steps over which to anneal the learning_rate
- """
- selected_action = self.policy.output
- if self.policy.use_continuous_act:
- self.loss = tf.reduce_mean(
- tf.squared_difference(selected_action, self.expert_action)
- )
- else:
- log_probs = self.policy.all_log_probs
- self.loss = tf.reduce_mean(
- -tf.log(tf.nn.softmax(log_probs) + 1e-7) * self.expert_action
- )
-
- if anneal_steps > 0:
- self.annealed_learning_rate = tf.train.polynomial_decay(
- learning_rate, self.policy.global_step, anneal_steps, 0.0, power=1.0
- )
- else:
- self.annealed_learning_rate = tf.Variable(learning_rate)
-
- optimizer = tf.train.AdamOptimizer(
- learning_rate=self.annealed_learning_rate, name="bc_adam"
- )
- self.update_batch = optimizer.minimize(self.loss)
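The removed BCModel uses a mean squared difference between the policy's action and the expert action for continuous control, and a softmax cross-entropy against the expert one-hot actions for discrete control. A rough PyTorch rendering of those two loss terms, for illustration only and not the Torch trainers' actual implementation (the sample batch at the end is hypothetical):

import torch
import torch.nn.functional as F

def bc_loss_continuous(selected_action: torch.Tensor, expert_action: torch.Tensor) -> torch.Tensor:
    # Mirrors tf.reduce_mean(tf.squared_difference(selected_action, expert_action)).
    return torch.mean((selected_action - expert_action) ** 2)

def bc_loss_discrete(log_probs: torch.Tensor, expert_one_hot: torch.Tensor) -> torch.Tensor:
    # Mirrors tf.reduce_mean(-tf.log(tf.nn.softmax(log_probs) + 1e-7) * expert_action).
    probs = torch.softmax(log_probs, dim=1)
    return torch.mean(-torch.log(probs + 1e-7) * expert_one_hot)

# Hypothetical batch of 4 agents with a 3-way discrete action branch.
logits = torch.randn(4, 3)
expert = F.one_hot(torch.tensor([0, 2, 1, 0]), num_classes=3).float()
print(bc_loss_discrete(logits, expert))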
diff --git a/ml-agents/mlagents/trainers/tf/components/bc/module.py b/ml-agents/mlagents/trainers/tf/components/bc/module.py
deleted file mode 100644
index ef829c73b1..0000000000
--- a/ml-agents/mlagents/trainers/tf/components/bc/module.py
+++ /dev/null
@@ -1,136 +0,0 @@
-from typing import Dict, Any
-import numpy as np
-
-from mlagents.trainers.policy.tf_policy import TFPolicy
-from .model import BCModel
-from mlagents.trainers.demo_loader import demo_to_buffer
-from mlagents.trainers.settings import BehavioralCloningSettings
-
-
-class BCModule:
- def __init__(
- self,
- policy: TFPolicy,
- settings: BehavioralCloningSettings,
- policy_learning_rate: float,
- default_batch_size: int,
- default_num_epoch: int,
- ):
- """
- A BC trainer that can be used inline with RL.
- :param policy: The policy of the learning model
- :param policy_learning_rate: The initial Learning Rate of the policy. Used to set an appropriate learning rate
- for the pretrainer.
- :param default_batch_size: The default batch size to use if batch_size isn't provided.
- :param default_num_epoch: The default num_epoch to use if num_epoch isn't provided.
- :param strength: The proportion of learning rate used to update through BC.
- :param steps: The number of steps to anneal BC training over. 0 for continuous training.
- :param demo_path: The path to the demonstration file.
- :param batch_size: The batch size to use during BC training.
- :param num_epoch: Number of epochs to train for during each update.
- :param samples_per_update: Maximum number of samples to train on during each BC update.
- """
- self.policy = policy
- self.current_lr = policy_learning_rate * settings.strength
- self.model = BCModel(policy, self.current_lr, settings.steps)
- _, self.demonstration_buffer = demo_to_buffer(
- settings.demo_path, policy.sequence_length, policy.behavior_spec
- )
-
- self.batch_size = (
- settings.batch_size if settings.batch_size else default_batch_size
- )
- self.num_epoch = settings.num_epoch if settings.num_epoch else default_num_epoch
- self.n_sequences = max(
- min(self.batch_size, self.demonstration_buffer.num_experiences)
- // policy.sequence_length,
- 1,
- )
-
- self.has_updated = False
- self.use_recurrent = self.policy.use_recurrent
- self.samples_per_update = settings.samples_per_update
- self.out_dict = {
- "loss": self.model.loss,
- "update": self.model.update_batch,
- "learning_rate": self.model.annealed_learning_rate,
- }
-
- def update(self) -> Dict[str, Any]:
- """
- Updates model using buffer.
- :param max_batches: The maximum number of batches to use per update.
- :return: The loss of the update.
- """
- # Don't continue training if the learning rate has reached 0, to reduce training time.
- if self.current_lr <= 0:
- return {"Losses/Pretraining Loss": 0}
-
- batch_losses = []
- possible_demo_batches = (
- self.demonstration_buffer.num_experiences // self.n_sequences
- )
- possible_batches = possible_demo_batches
-
- max_batches = self.samples_per_update // self.n_sequences
-
- n_epoch = self.num_epoch
- for _ in range(n_epoch):
- self.demonstration_buffer.shuffle(
- sequence_length=self.policy.sequence_length
- )
- if max_batches == 0:
- num_batches = possible_batches
- else:
- num_batches = min(possible_batches, max_batches)
- for i in range(num_batches // self.policy.sequence_length):
- demo_update_buffer = self.demonstration_buffer
- start = i * self.n_sequences * self.policy.sequence_length
- end = (i + 1) * self.n_sequences * self.policy.sequence_length
- mini_batch_demo = demo_update_buffer.make_mini_batch(start, end)
- run_out = self._update_batch(mini_batch_demo, self.n_sequences)
- loss = run_out["loss"]
- self.current_lr = run_out["learning_rate"]
- batch_losses.append(loss)
- self.has_updated = True
- update_stats = {"Losses/Pretraining Loss": np.mean(batch_losses)}
- return update_stats
-
- def _update_batch(
- self, mini_batch_demo: Dict[str, Any], n_sequences: int
- ) -> Dict[str, Any]:
- """
- Helper function for update_batch.
- """
- feed_dict = {
- self.policy.batch_size_ph: n_sequences,
- self.policy.sequence_length_ph: self.policy.sequence_length,
- }
- if self.policy.behavior_spec.action_spec.is_discrete():
- feed_dict[self.model.action_in_expert] = mini_batch_demo["discrete_action"]
- feed_dict[self.policy.action_masks] = np.ones(
- (
- self.n_sequences * self.policy.sequence_length,
- sum(self.policy.behavior_spec.action_spec.discrete_branches),
- ),
- dtype=np.float32,
- )
- else:
- feed_dict[self.model.action_in_expert] = mini_batch_demo[
- "continuous_action"
- ]
- if self.policy.vec_obs_size > 0:
- feed_dict[self.policy.vector_in] = mini_batch_demo["vector_obs"]
- for i, _ in enumerate(self.policy.visual_in):
- feed_dict[self.policy.visual_in[i]] = mini_batch_demo["visual_obs%d" % i]
- if self.use_recurrent:
- feed_dict[self.policy.memory_in] = np.zeros(
- [self.n_sequences, self.policy.m_size], dtype=np.float32
- )
- if not self.policy.use_continuous_act:
- feed_dict[self.policy.prev_action] = mini_batch_demo["prev_action"]
- network_out = self.policy.sess.run(
- list(self.out_dict.values()), feed_dict=feed_dict
- )
- run_out = dict(zip(list(self.out_dict.keys()), network_out))
- return run_out
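BCModule anneals its learning rate to zero over settings.steps via tf.train.polynomial_decay with an end value of 0.0 and power 1.0, and update() stops early once the rate reaches zero. With power 1.0 this is a plain linear ramp; a small standalone sketch of that schedule (an assumed helper, not repository code):

def annealed_lr(initial_lr: float, step: int, anneal_steps: int) -> float:
    # Linear decay to 0 over anneal_steps; a constant rate when annealing is disabled.
    if anneal_steps <= 0:
        return initial_lr
    remaining = max(0.0, 1.0 - min(step, anneal_steps) / anneal_steps)
    return initial_lr * remaining

assert annealed_lr(3e-4, 0, 1000) == 3e-4
assert annealed_lr(3e-4, 500, 1000) == 1.5e-4
assert annealed_lr(3e-4, 1000, 1000) == 0.0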
diff --git a/ml-agents/mlagents/trainers/tf/components/reward_signals/__init__.py b/ml-agents/mlagents/trainers/tf/components/reward_signals/__init__.py
deleted file mode 100644
index e35b03a038..0000000000
--- a/ml-agents/mlagents/trainers/tf/components/reward_signals/__init__.py
+++ /dev/null
@@ -1,67 +0,0 @@
-from typing import Any, Dict
-from collections import namedtuple
-import numpy as np
-import abc
-
-from mlagents.tf_utils import tf
-
-from mlagents_envs.logging_util import get_logger
-from mlagents.trainers.policy.tf_policy import TFPolicy
-from mlagents.trainers.buffer import AgentBuffer
-from mlagents.trainers.settings import RewardSignalSettings
-
-
-logger = get_logger(__name__)
-
-RewardSignalResult = namedtuple(
- "RewardSignalResult", ["scaled_reward", "unscaled_reward"]
-)
-
-
-class RewardSignal(abc.ABC):
- def __init__(self, policy: TFPolicy, settings: RewardSignalSettings):
- """
- Initializes a reward signal. At minimum, you must pass in the policy it is being applied to,
-        the reward strength, and the gamma (discount factor).
- :param policy: The Policy object (e.g. TFPolicy) that this Reward Signal will apply to.
- :param settings: Settings parameters for this Reward Signal, including gamma and strength.
- :return: A RewardSignal object.
- """
- class_name = self.__class__.__name__
- short_name = class_name.replace("RewardSignal", "")
- self.stat_name = f"Policy/{short_name} Reward"
- self.value_name = f"Policy/{short_name} Value Estimate"
- # Terminate discounted reward computation at Done. Can disable to mitigate positive bias in rewards with
- # no natural end, e.g. GAIL or Curiosity
- self.use_terminal_states = True
- self.update_dict: Dict[str, tf.Tensor] = {}
- self.gamma = settings.gamma
- self.policy = policy
- self.strength = settings.strength
- self.stats_name_to_update_name: Dict[str, str] = {}
-
- def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
- """
- Evaluates the reward for the data present in the Dict mini_batch. Use this when evaluating a reward
- function drawn straight from a Buffer.
- :param mini_batch: A Dict of numpy arrays (the format used by our Buffer)
- when drawing from the update buffer.
- :return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
- """
- mini_batch_len = len(next(iter(mini_batch.values())))
- return RewardSignalResult(
- self.strength * np.zeros(mini_batch_len, dtype=np.float32),
- np.zeros(mini_batch_len, dtype=np.float32),
- )
-
- def prepare_update(
- self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int
- ) -> Dict[tf.Tensor, Any]:
- """
- If the reward signal has an internal model (e.g. GAIL or Curiosity), get the feed_dict
-        needed to update the buffer.
-        :param mini_batch: An AgentBuffer that contains the live data from which to update.
-        :param num_sequences: The number of sequences in the training buffer.
- :return: A dict that corresponds to the feed_dict needed for the update.
- """
- return {}
diff --git a/ml-agents/mlagents/trainers/tf/components/reward_signals/curiosity/__init__.py b/ml-agents/mlagents/trainers/tf/components/reward_signals/curiosity/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/ml-agents/mlagents/trainers/tf/components/reward_signals/curiosity/model.py b/ml-agents/mlagents/trainers/tf/components/reward_signals/curiosity/model.py
deleted file mode 100644
index d6c50f7a98..0000000000
--- a/ml-agents/mlagents/trainers/tf/components/reward_signals/curiosity/model.py
+++ /dev/null
@@ -1,169 +0,0 @@
-from typing import List, Tuple
-from mlagents.tf_utils import tf
-
-from mlagents.trainers.tf.models import ModelUtils
-from mlagents.trainers.policy.tf_policy import TFPolicy
-
-
-class CuriosityModel:
- def __init__(
- self, policy: TFPolicy, encoding_size: int = 128, learning_rate: float = 3e-4
- ):
- """
- Creates the curiosity model for the Curiosity reward Generator
- :param policy: The policy being trained
- :param encoding_size: The size of the encoding for the Curiosity module
- :param learning_rate: The learning rate for the curiosity module
- """
- self.encoding_size = encoding_size
- self.policy = policy
- self.next_visual_in: List[tf.Tensor] = []
- encoded_state, encoded_next_state = self.create_curiosity_encoders()
- self.create_inverse_model(encoded_state, encoded_next_state)
- self.create_forward_model(encoded_state, encoded_next_state)
- self.create_loss(learning_rate)
-
- def create_curiosity_encoders(self) -> Tuple[tf.Tensor, tf.Tensor]:
- """
- Creates state encoders for current and future observations.
- Used for implementation of Curiosity-driven Exploration by Self-supervised Prediction
- See https://arxiv.org/abs/1705.05363 for more details.
- :return: current and future state encoder tensors.
- """
- encoded_state_list = []
- encoded_next_state_list = []
-
- # Create input ops for next (t+1) visual observations.
- self.next_vector_in, self.next_visual_in = ModelUtils.create_input_placeholders(
- self.policy.behavior_spec.observation_shapes, name_prefix="curiosity_next_"
- )
-
- if self.next_visual_in:
- visual_encoders = []
- next_visual_encoders = []
- for i, (vis_in, next_vis_in) in enumerate(
- zip(self.policy.visual_in, self.next_visual_in)
- ):
- # Create the encoder ops for current and next visual input.
- # Note that these encoders are siamese.
- encoded_visual = ModelUtils.create_visual_observation_encoder(
- vis_in,
- self.encoding_size,
- ModelUtils.swish,
- 1,
- f"curiosity_stream_{i}_visual_obs_encoder",
- False,
- )
-
- encoded_next_visual = ModelUtils.create_visual_observation_encoder(
- next_vis_in,
- self.encoding_size,
- ModelUtils.swish,
- 1,
- f"curiosity_stream_{i}_visual_obs_encoder",
- True,
- )
- visual_encoders.append(encoded_visual)
- next_visual_encoders.append(encoded_next_visual)
-
- hidden_visual = tf.concat(visual_encoders, axis=1)
- hidden_next_visual = tf.concat(next_visual_encoders, axis=1)
- encoded_state_list.append(hidden_visual)
- encoded_next_state_list.append(hidden_next_visual)
-
- if self.policy.vec_obs_size > 0:
- encoded_vector_obs = ModelUtils.create_vector_observation_encoder(
- self.policy.vector_in,
- self.encoding_size,
- ModelUtils.swish,
- 2,
- "curiosity_vector_obs_encoder",
- False,
- )
- encoded_next_vector_obs = ModelUtils.create_vector_observation_encoder(
- self.next_vector_in,
- self.encoding_size,
- ModelUtils.swish,
- 2,
- "curiosity_vector_obs_encoder",
- True,
- )
- encoded_state_list.append(encoded_vector_obs)
- encoded_next_state_list.append(encoded_next_vector_obs)
- encoded_state = tf.concat(encoded_state_list, axis=1)
- encoded_next_state = tf.concat(encoded_next_state_list, axis=1)
- return encoded_state, encoded_next_state
-
- def create_inverse_model(
- self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
- ) -> None:
- """
- Creates inverse model TensorFlow ops for Curiosity module.
- Predicts action taken given current and future encoded states.
- :param encoded_state: Tensor corresponding to encoded current state.
- :param encoded_next_state: Tensor corresponding to encoded next state.
- """
- combined_input = tf.concat([encoded_state, encoded_next_state], axis=1)
- hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
- if self.policy.behavior_spec.action_spec.is_continuous():
- pred_action = tf.layers.dense(
- hidden, self.policy.act_size[0], activation=None
- )
- squared_difference = tf.reduce_sum(
- tf.squared_difference(pred_action, self.policy.selected_actions), axis=1
- )
- self.inverse_loss = tf.reduce_mean(
- tf.dynamic_partition(squared_difference, self.policy.mask, 2)[1]
- )
- else:
- pred_action = tf.concat(
- [
- tf.layers.dense(
- hidden, self.policy.act_size[i], activation=tf.nn.softmax
- )
- for i in range(len(self.policy.act_size))
- ],
- axis=1,
- )
- cross_entropy = tf.reduce_sum(
- -tf.log(pred_action + 1e-10) * self.policy.selected_actions, axis=1
- )
- self.inverse_loss = tf.reduce_mean(
- tf.dynamic_partition(cross_entropy, self.policy.mask, 2)[1]
- )
-
- def create_forward_model(
- self, encoded_state: tf.Tensor, encoded_next_state: tf.Tensor
- ) -> None:
- """
- Creates forward model TensorFlow ops for Curiosity module.
- Predicts encoded future state based on encoded current state and given action.
- :param encoded_state: Tensor corresponding to encoded current state.
- :param encoded_next_state: Tensor corresponding to encoded next state.
- """
- combined_input = tf.concat(
- [encoded_state, self.policy.selected_actions], axis=1
- )
- hidden = tf.layers.dense(combined_input, 256, activation=ModelUtils.swish)
- pred_next_state = tf.layers.dense(
- hidden,
- self.encoding_size
- * (self.policy.vis_obs_size + int(self.policy.vec_obs_size > 0)),
- activation=None,
- )
- squared_difference = 0.5 * tf.reduce_sum(
- tf.squared_difference(pred_next_state, encoded_next_state), axis=1
- )
- self.intrinsic_reward = squared_difference
- self.forward_loss = tf.reduce_mean(
- tf.dynamic_partition(squared_difference, self.policy.mask, 2)[1]
- )
-
- def create_loss(self, learning_rate: float) -> None:
- """
- Creates the loss node of the model as well as the update_batch optimizer to update the model.
- :param learning_rate: The learning rate for the optimizer.
- """
- self.loss = 10 * (0.2 * self.forward_loss + 0.8 * self.inverse_loss)
- optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
- self.update_batch = optimizer.minimize(self.loss)
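The removed curiosity model follows Pathak et al. (https://arxiv.org/abs/1705.05363): a forward model predicts the next encoded state from the current encoding and the selected action, half the squared prediction error serves as the intrinsic reward, and the training loss combines the forward and inverse terms as 10 * (0.2 * forward + 0.8 * inverse). A rough PyTorch sketch of that arithmetic (sequence masking omitted; illustration only, not the Torch trainers' implementation):

import torch

def curiosity_terms(pred_next_enc: torch.Tensor,
                    next_enc: torch.Tensor,
                    inverse_loss: torch.Tensor):
    # Intrinsic reward: half the squared forward-prediction error, per sample.
    intrinsic_reward = 0.5 * torch.sum((pred_next_enc - next_enc) ** 2, dim=1)
    forward_loss = torch.mean(intrinsic_reward)
    # Combined loss uses the same 0.2 / 0.8 weighting and overall factor of 10.
    total_loss = 10.0 * (0.2 * forward_loss + 0.8 * inverse_loss)
    return intrinsic_reward, total_loss

# Hypothetical encodings for a batch of 4 with encoding size 128.
pred, actual = torch.randn(4, 128), torch.randn(4, 128)
reward, loss = curiosity_terms(pred, actual, inverse_loss=torch.tensor(0.3))
print(reward.shape, loss.item())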
diff --git a/ml-agents/mlagents/trainers/tf/components/reward_signals/curiosity/signal.py b/ml-agents/mlagents/trainers/tf/components/reward_signals/curiosity/signal.py
deleted file mode 100644
index 48a01d0f34..0000000000
--- a/ml-agents/mlagents/trainers/tf/components/reward_signals/curiosity/signal.py
+++ /dev/null
@@ -1,98 +0,0 @@
-from typing import Any, Dict
-import numpy as np
-from mlagents.tf_utils import tf
-
-from mlagents.trainers.tf.components.reward_signals import (
- RewardSignal,
- RewardSignalResult,
-)
-from mlagents.trainers.tf.components.reward_signals.curiosity.model import (
- CuriosityModel,
-)
-from mlagents.trainers.policy.tf_policy import TFPolicy
-from mlagents.trainers.buffer import AgentBuffer
-from mlagents.trainers.settings import CuriositySettings
-
-
-class CuriosityRewardSignal(RewardSignal):
- def __init__(self, policy: TFPolicy, settings: CuriositySettings):
- """
- Creates the Curiosity reward generator
- :param policy: The Learning Policy
- :param settings: CuriositySettings object that contains the parameters
- (including encoding size and learning rate) for this CuriosityRewardSignal.
- """
- super().__init__(policy, settings)
- self.model = CuriosityModel(
- policy,
- encoding_size=settings.encoding_size,
- learning_rate=settings.learning_rate,
- )
- self.use_terminal_states = False
- self.update_dict = {
- "curiosity_forward_loss": self.model.forward_loss,
- "curiosity_inverse_loss": self.model.inverse_loss,
- "curiosity_update": self.model.update_batch,
- }
- self.stats_name_to_update_name = {
- "Losses/Curiosity Forward Loss": "curiosity_forward_loss",
- "Losses/Curiosity Inverse Loss": "curiosity_inverse_loss",
- }
- self.has_updated = False
-
- def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
- feed_dict: Dict[tf.Tensor, Any] = {
- self.policy.batch_size_ph: len(mini_batch["vector_obs"]),
- self.policy.sequence_length_ph: self.policy.sequence_length,
- }
- if self.policy.use_vec_obs:
- feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]
- feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"]
- if self.policy.vis_obs_size > 0:
- for i in range(len(self.policy.visual_in)):
- _obs = mini_batch["visual_obs%d" % i]
- _next_obs = mini_batch["next_visual_obs%d" % i]
- feed_dict[self.policy.visual_in[i]] = _obs
- feed_dict[self.model.next_visual_in[i]] = _next_obs
-
- if self.policy.use_continuous_act:
- feed_dict[self.policy.selected_actions] = mini_batch["continuous_action"]
- else:
- feed_dict[self.policy.output] = mini_batch["discrete_action"]
- unscaled_reward = self.policy.sess.run(
- self.model.intrinsic_reward, feed_dict=feed_dict
- )
- scaled_reward = np.clip(
- unscaled_reward * float(self.has_updated) * self.strength, 0, 1
- )
- return RewardSignalResult(scaled_reward, unscaled_reward)
-
- def prepare_update(
- self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int
- ) -> Dict[tf.Tensor, Any]:
- """
- Prepare for update and get feed_dict.
- :param num_sequences: Number of trajectories in batch.
- :param mini_batch: Experience batch.
- :return: Feed_dict needed for update.
- """
- feed_dict = {
- policy.batch_size_ph: num_sequences,
- policy.sequence_length_ph: self.policy.sequence_length,
- policy.mask_input: mini_batch["masks"],
- }
- if self.policy.use_continuous_act:
- feed_dict[policy.selected_actions] = mini_batch["continuous_action"]
- else:
- feed_dict[policy.output] = mini_batch["discrete_action"]
- if self.policy.use_vec_obs:
- feed_dict[policy.vector_in] = mini_batch["vector_obs"]
- feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"]
- if policy.vis_obs_size > 0:
- for i, vis_in in enumerate(policy.visual_in):
- feed_dict[vis_in] = mini_batch["visual_obs%d" % i]
- for i, next_vis_in in enumerate(self.model.next_visual_in):
- feed_dict[next_vis_in] = mini_batch["next_visual_obs%d" % i]
-
- self.has_updated = True
- return feed_dict
diff --git a/ml-agents/mlagents/trainers/tf/components/reward_signals/extrinsic/__init__.py b/ml-agents/mlagents/trainers/tf/components/reward_signals/extrinsic/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/ml-agents/mlagents/trainers/tf/components/reward_signals/extrinsic/signal.py b/ml-agents/mlagents/trainers/tf/components/reward_signals/extrinsic/signal.py
deleted file mode 100644
index f975c36919..0000000000
--- a/ml-agents/mlagents/trainers/tf/components/reward_signals/extrinsic/signal.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import numpy as np
-
-from mlagents.trainers.tf.components.reward_signals import (
- RewardSignal,
- RewardSignalResult,
-)
-from mlagents.trainers.buffer import AgentBuffer
-
-
-class ExtrinsicRewardSignal(RewardSignal):
- def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
- env_rews = np.array(mini_batch["environment_rewards"], dtype=np.float32)
- return RewardSignalResult(self.strength * env_rews, env_rews)
diff --git a/ml-agents/mlagents/trainers/tf/components/reward_signals/gail/__init__.py b/ml-agents/mlagents/trainers/tf/components/reward_signals/gail/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/ml-agents/mlagents/trainers/tf/components/reward_signals/gail/model.py b/ml-agents/mlagents/trainers/tf/components/reward_signals/gail/model.py
deleted file mode 100644
index 94c3fe9b86..0000000000
--- a/ml-agents/mlagents/trainers/tf/components/reward_signals/gail/model.py
+++ /dev/null
@@ -1,310 +0,0 @@
-from typing import Optional, Tuple
-
-from mlagents.tf_utils import tf
-
-from mlagents.trainers.policy.tf_policy import TFPolicy
-from mlagents.trainers.tf.models import ModelUtils
-
-EPSILON = 1e-7
-
-
-class GAILModel:
- def __init__(
- self,
- policy: TFPolicy,
- h_size: int = 128,
- learning_rate: float = 3e-4,
- encoding_size: int = 64,
- use_actions: bool = False,
- use_vail: bool = False,
- gradient_penalty_weight: float = 10.0,
- ):
- """
- The initializer for the GAIL reward generator.
- https://arxiv.org/abs/1606.03476
- :param policy: The policy of the learning algorithm
- :param h_size: Size of the hidden layer for the discriminator
- :param learning_rate: The learning Rate for the discriminator
- :param encoding_size: The encoding size for the encoder
- :param use_actions: Whether or not to use actions to discriminate
- :param use_vail: Whether or not to use a variational bottleneck for the
- discriminator. See https://arxiv.org/abs/1810.00821.
- """
- self.h_size = h_size
- self.z_size = 128
- self.alpha = 0.0005
- self.mutual_information = 0.5
- self.policy = policy
- self.encoding_size = encoding_size
- self.gradient_penalty_weight = gradient_penalty_weight
- self.use_vail = use_vail
- self.use_actions = use_actions # whether the discriminator conditions on actions as well as observations
-
- self.noise: Optional[tf.Tensor] = None
- self.z: Optional[tf.Tensor] = None
-
- self.make_inputs()
- self.create_network()
- self.create_loss(learning_rate)
- if self.use_vail:
- self.make_beta_update()
-
- def make_beta_update(self) -> None:
- """
- Creates the beta parameter and its updater for GAIL
- """
-
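- # Adjust beta so the KL term tracks the target mutual information: increase it when the constraint is violated, decrease it otherwise (floored at EPSILON).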
- new_beta = tf.maximum(
- self.beta + self.alpha * (self.kl_loss - self.mutual_information), EPSILON
- )
- with tf.control_dependencies([self.update_batch]):
- self.update_beta = tf.assign(self.beta, new_beta)
-
- def make_inputs(self) -> None:
- """
- Creates the input layers for the discriminator
- """
- self.done_expert_holder = tf.placeholder(shape=[None], dtype=tf.float32)
- self.done_policy_holder = tf.placeholder(shape=[None], dtype=tf.float32)
- self.done_expert = tf.expand_dims(self.done_expert_holder, -1)
- self.done_policy = tf.expand_dims(self.done_policy_holder, -1)
-
- if self.policy.behavior_spec.action_spec.is_continuous():
- action_length = self.policy.act_size[0]
- self.action_in_expert = tf.placeholder(
- shape=[None, action_length], dtype=tf.float32
- )
- self.expert_action = tf.identity(self.action_in_expert)
- else:
- action_length = len(self.policy.act_size)
- self.action_in_expert = tf.placeholder(
- shape=[None, action_length], dtype=tf.int32
- )
- self.expert_action = tf.concat(
- [
- tf.one_hot(self.action_in_expert[:, i], act_size)
- for i, act_size in enumerate(self.policy.act_size)
- ],
- axis=1,
- )
-
- encoded_policy_list = []
- encoded_expert_list = []
-
- (
- self.obs_in_expert,
- self.expert_visual_in,
- ) = ModelUtils.create_input_placeholders(
- self.policy.behavior_spec.observation_shapes, "gail_"
- )
-
- if self.policy.vec_obs_size > 0:
- if self.policy.normalize:
- encoded_expert_list.append(
- ModelUtils.normalize_vector_obs(
- self.obs_in_expert,
- self.policy.running_mean,
- self.policy.running_variance,
- self.policy.normalization_steps,
- )
- )
- encoded_policy_list.append(self.policy.processed_vector_in)
- else:
- encoded_expert_list.append(self.obs_in_expert)
- encoded_policy_list.append(self.policy.vector_in)
-
- if self.expert_visual_in:
- visual_policy_encoders = []
- visual_expert_encoders = []
- for i, (vis_in, exp_vis_in) in enumerate(
- zip(self.policy.visual_in, self.expert_visual_in)
- ):
- encoded_policy_visual = ModelUtils.create_visual_observation_encoder(
- vis_in,
- self.encoding_size,
- ModelUtils.swish,
- 1,
- f"gail_stream_{i}_visual_obs_encoder",
- False,
- )
-
- encoded_expert_visual = ModelUtils.create_visual_observation_encoder(
- exp_vis_in,
- self.encoding_size,
- ModelUtils.swish,
- 1,
- f"gail_stream_{i}_visual_obs_encoder",
- True,
- )
- visual_policy_encoders.append(encoded_policy_visual)
- visual_expert_encoders.append(encoded_expert_visual)
- hidden_policy_visual = tf.concat(visual_policy_encoders, axis=1)
- hidden_expert_visual = tf.concat(visual_expert_encoders, axis=1)
- encoded_policy_list.append(hidden_policy_visual)
- encoded_expert_list.append(hidden_expert_visual)
-
- self.encoded_expert = tf.concat(encoded_expert_list, axis=1)
- self.encoded_policy = tf.concat(encoded_policy_list, axis=1)
-
- def create_encoder(
- self, state_in: tf.Tensor, action_in: tf.Tensor, done_in: tf.Tensor, reuse: bool
- ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
- """
- Creates the encoder for the discriminator
- :param state_in: The encoded observation input
- :param action_in: The action input
- :param done_in: The done flags input
- :param reuse: If true, the weights will be shared with the previous encoder created
- """
- with tf.variable_scope("GAIL_model"):
- if self.use_actions:
- concat_input = tf.concat([state_in, action_in, done_in], axis=1)
- else:
- concat_input = state_in
-
- hidden_1 = tf.layers.dense(
- concat_input,
- self.h_size,
- activation=ModelUtils.swish,
- name="gail_d_hidden_1",
- reuse=reuse,
- )
-
- hidden_2 = tf.layers.dense(
- hidden_1,
- self.h_size,
- activation=ModelUtils.swish,
- name="gail_d_hidden_2",
- reuse=reuse,
- )
-
- z_mean = None
- if self.use_vail:
- # Latent representation
- z_mean = tf.layers.dense(
- hidden_2,
- self.z_size,
- reuse=reuse,
- name="gail_z_mean",
- kernel_initializer=ModelUtils.scaled_init(0.01),
- )
-
- self.noise = tf.random_normal(tf.shape(z_mean), dtype=tf.float32)
-
- # Sampled latent code
- self.z = z_mean + self.z_sigma * self.noise * self.use_noise
- estimate_input = self.z
- else:
- estimate_input = hidden_2
-
- estimate = tf.layers.dense(
- estimate_input,
- 1,
- activation=tf.nn.sigmoid,
- name="gail_d_estimate",
- reuse=reuse,
- )
- return estimate, z_mean, concat_input
-
- def create_network(self) -> None:
- """
- Helper for creating the intrinsic reward nodes
- """
- if self.use_vail:
- self.z_sigma = tf.get_variable(
- "gail_sigma_vail",
- self.z_size,
- dtype=tf.float32,
- initializer=tf.ones_initializer(),
- )
- self.z_sigma_sq = self.z_sigma * self.z_sigma
- self.z_log_sigma_sq = tf.log(self.z_sigma_sq + EPSILON)
- self.use_noise = tf.placeholder(
- shape=[1], dtype=tf.float32, name="gail_NoiseLevel"
- )
- self.expert_estimate, self.z_mean_expert, _ = self.create_encoder(
- self.encoded_expert, self.expert_action, self.done_expert, reuse=False
- )
- self.policy_estimate, self.z_mean_policy, _ = self.create_encoder(
- self.encoded_policy,
- self.policy.selected_actions,
- self.done_policy,
- reuse=True,
- )
- self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)
- self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)
- self.discriminator_score = tf.reshape(
- self.policy_estimate, [-1], name="gail_reward"
- )
- self.intrinsic_reward = -tf.log(1.0 - self.discriminator_score + EPSILON)
-
- def create_gradient_magnitude(self) -> tf.Tensor:
- """
- Gradient penalty from https://arxiv.org/pdf/1704.00028. Adds stability, especially
- for off-policy training. Computes gradients w.r.t. randomly interpolated input.
- """
- expert = [self.encoded_expert, self.expert_action, self.done_expert]
- policy = [self.encoded_policy, self.policy.selected_actions, self.done_policy]
- interp = []
- for _expert_in, _policy_in in zip(expert, policy):
- alpha = tf.random_uniform(tf.shape(_expert_in))
- interp.append(alpha * _expert_in + (1 - alpha) * _policy_in)
-
- grad_estimate, _, grad_input = self.create_encoder(
- interp[0], interp[1], interp[2], reuse=True
- )
-
- grad = tf.gradients(grad_estimate, [grad_input])[0]
-
- # Norm's gradient could be NaN at 0. Use our own safe_norm
- safe_norm = tf.sqrt(tf.reduce_sum(grad ** 2, axis=-1) + EPSILON)
- gradient_mag = tf.reduce_mean(tf.pow(safe_norm - 1, 2))
-
- return gradient_mag
-
- def create_loss(self, learning_rate: float) -> None:
- """
- Creates the loss and update nodes for the GAIL reward generator
- :param learning_rate: The learning rate for the optimizer
- """
- self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)
- self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)
-
- if self.use_vail:
- self.beta = tf.get_variable(
- "gail_beta",
- [],
- trainable=False,
- dtype=tf.float32,
- initializer=tf.ones_initializer(),
- )
-
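- # Binary cross-entropy discriminator loss: push expert estimates toward 1 and policy estimates toward 0.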
- self.discriminator_loss = -tf.reduce_mean(
- tf.log(self.expert_estimate + EPSILON)
- + tf.log(1.0 - self.policy_estimate + EPSILON)
- )
-
- if self.use_vail:
- # KL divergence loss (encourage latent representation to be normal)
- self.kl_loss = tf.reduce_mean(
- -tf.reduce_sum(
- 1
- + self.z_log_sigma_sq
- - 0.5 * tf.square(self.z_mean_expert)
- - 0.5 * tf.square(self.z_mean_policy)
- - tf.exp(self.z_log_sigma_sq),
- 1,
- )
- )
- self.loss = (
- self.beta * (self.kl_loss - self.mutual_information)
- + self.discriminator_loss
- )
- else:
- self.loss = self.discriminator_loss
-
- if self.gradient_penalty_weight > 0.0:
- self.loss += self.gradient_penalty_weight * self.create_gradient_magnitude()
-
- optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
- self.update_batch = optimizer.minimize(self.loss)
diff --git a/ml-agents/mlagents/trainers/tf/components/reward_signals/gail/signal.py b/ml-agents/mlagents/trainers/tf/components/reward_signals/gail/signal.py
deleted file mode 100644
index d2b585314e..0000000000
--- a/ml-agents/mlagents/trainers/tf/components/reward_signals/gail/signal.py
+++ /dev/null
@@ -1,130 +0,0 @@
-from typing import Any, Dict
-import numpy as np
-from mlagents.tf_utils import tf
-
-from mlagents.trainers.tf.components.reward_signals import (
- RewardSignal,
- RewardSignalResult,
-)
-from mlagents.trainers.policy.tf_policy import TFPolicy
-from mlagents.trainers.tf.components.reward_signals.gail.model import GAILModel
-from mlagents.trainers.demo_loader import demo_to_buffer
-from mlagents.trainers.buffer import AgentBuffer
-from mlagents.trainers.settings import GAILSettings
-
-
-class GAILRewardSignal(RewardSignal):
- def __init__(self, policy: TFPolicy, settings: GAILSettings):
- """
- The GAIL Reward signal generator. https://arxiv.org/abs/1606.03476
- :param policy: The policy of the learning model
- :param settings: The settings for this GAILRewardSignal.
- See https://arxiv.org/abs/1810.00821.
- """
- super().__init__(policy, settings)
- self.use_terminal_states = False
-
- self.model = GAILModel(
- policy,
- 128,
- settings.learning_rate,
- settings.encoding_size,
- settings.use_actions,
- settings.use_vail,
- )
- _, self.demonstration_buffer = demo_to_buffer(
- settings.demo_path, policy.sequence_length, policy.behavior_spec
- )
- self.has_updated = False
- self.update_dict: Dict[str, tf.Tensor] = {
- "gail_loss": self.model.loss,
- "gail_update_batch": self.model.update_batch,
- "gail_policy_estimate": self.model.mean_policy_estimate,
- "gail_expert_estimate": self.model.mean_expert_estimate,
- }
- if self.model.use_vail:
- self.update_dict["kl_loss"] = self.model.kl_loss
- self.update_dict["z_log_sigma_sq"] = self.model.z_log_sigma_sq
- self.update_dict["z_mean_expert"] = self.model.z_mean_expert
- self.update_dict["z_mean_policy"] = self.model.z_mean_policy
- self.update_dict["beta_update"] = self.model.update_beta
-
- self.stats_name_to_update_name = {
- "Losses/GAIL Loss": "gail_loss",
- "Policy/GAIL Policy Estimate": "gail_policy_estimate",
- "Policy/GAIL Expert Estimate": "gail_expert_estimate",
- }
-
- def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
- feed_dict: Dict[tf.Tensor, Any] = {
- self.policy.batch_size_ph: len(mini_batch["vector_obs"]),
- self.policy.sequence_length_ph: self.policy.sequence_length,
- }
- if self.model.use_vail:
- feed_dict[self.model.use_noise] = [0]
-
- if self.policy.use_vec_obs:
- feed_dict[self.policy.vector_in] = mini_batch["vector_obs"]
- if self.policy.vis_obs_size > 0:
- for i in range(len(self.policy.visual_in)):
- _obs = mini_batch["visual_obs%d" % i]
- feed_dict[self.policy.visual_in[i]] = _obs
-
- if self.policy.use_continuous_act:
- feed_dict[self.policy.selected_actions] = mini_batch["continuous_action"]
- else:
- feed_dict[self.policy.output] = mini_batch["discrete_action"]
- feed_dict[self.model.done_policy_holder] = np.array(
- mini_batch["done"]
- ).flatten()
- unscaled_reward = self.policy.sess.run(
- self.model.intrinsic_reward, feed_dict=feed_dict
- )
- scaled_reward = unscaled_reward * float(self.has_updated) * self.strength
- return RewardSignalResult(scaled_reward, unscaled_reward)
-
- def prepare_update(
- self, policy: TFPolicy, mini_batch: AgentBuffer, num_sequences: int
- ) -> Dict[tf.Tensor, Any]:
- """
- Prepare inputs for update.
- :param policy: The policy learning from GAIL signal
- :param mini_batch: A mini batch from trajectories sampled from the current policy
- :param num_sequences: Number of samples in batch
- :return: Feed_dict for update process.
- """
- # Get batch from demo buffer. Even if demo buffer is smaller, we sample with replacement
- mini_batch_demo = self.demonstration_buffer.sample_mini_batch(
- mini_batch.num_experiences, 1
- )
-
- feed_dict: Dict[tf.Tensor, Any] = {
- self.model.done_expert_holder: mini_batch_demo["done"],
- self.model.done_policy_holder: mini_batch["done"],
- }
-
- if self.model.use_vail:
- feed_dict[self.model.use_noise] = [1]
-
- if self.policy.use_continuous_act:
- feed_dict[policy.selected_actions] = mini_batch["continuous_action"]
- feed_dict[self.model.action_in_expert] = np.array(
- mini_batch_demo["continuous_action"]
- )
- else:
- feed_dict[policy.output] = mini_batch["discrete_action"]
- feed_dict[self.model.action_in_expert] = np.array(
- mini_batch_demo["discrete_action"]
- )
-
- if self.policy.use_vis_obs > 0:
- for i in range(len(policy.visual_in)):
- feed_dict[policy.visual_in[i]] = mini_batch["visual_obs%d" % i]
- feed_dict[self.model.expert_visual_in[i]] = mini_batch_demo[
- "visual_obs%d" % i
- ]
- if self.policy.use_vec_obs:
- feed_dict[policy.vector_in] = mini_batch["vector_obs"]
- feed_dict[self.model.obs_in_expert] = mini_batch_demo["vector_obs"]
- self.has_updated = True
- return feed_dict
diff --git a/ml-agents/mlagents/trainers/tf/components/reward_signals/reward_signal_factory.py b/ml-agents/mlagents/trainers/tf/components/reward_signals/reward_signal_factory.py
deleted file mode 100644
index 02fa63a9bc..0000000000
--- a/ml-agents/mlagents/trainers/tf/components/reward_signals/reward_signal_factory.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from typing import Dict, Type
-from mlagents.trainers.exception import UnityTrainerException
-from mlagents.trainers.tf.components.reward_signals import RewardSignal
-from mlagents.trainers.tf.components.reward_signals.extrinsic.signal import (
- ExtrinsicRewardSignal,
-)
-from mlagents.trainers.tf.components.reward_signals.gail.signal import GAILRewardSignal
-from mlagents.trainers.tf.components.reward_signals.curiosity.signal import (
- CuriosityRewardSignal,
-)
-from mlagents.trainers.policy.tf_policy import TFPolicy
-from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType
-
-
-NAME_TO_CLASS: Dict[RewardSignalType, Type[RewardSignal]] = {
- RewardSignalType.EXTRINSIC: ExtrinsicRewardSignal,
- RewardSignalType.CURIOSITY: CuriosityRewardSignal,
- RewardSignalType.GAIL: GAILRewardSignal,
-}
-
-
-def create_reward_signal(
- policy: TFPolicy, name: RewardSignalType, settings: RewardSignalSettings
-) -> RewardSignal:
- """
- Creates a reward signal instance based on the name and settings provided.
- :param policy: The policy the reward signal will be applied to.
- :param name: The name of the reward signal
- :param settings: The settings for that reward signal
- :return: The instantiated reward signal
- """
- rcls = NAME_TO_CLASS.get(name)
- if not rcls:
- raise UnityTrainerException(f"Unknown reward signal type {name}")
-
- class_inst = rcls(policy, settings)
- return class_inst
diff --git a/ml-agents/mlagents/trainers/tf/distributions.py b/ml-agents/mlagents/trainers/tf/distributions.py
deleted file mode 100644
index 1d641d86a7..0000000000
--- a/ml-agents/mlagents/trainers/tf/distributions.py
+++ /dev/null
@@ -1,319 +0,0 @@
-import abc
-from typing import NamedTuple, List, Tuple
-import numpy as np
-
-from mlagents.tf_utils import tf
-from mlagents.trainers.tf.models import ModelUtils
-
-EPSILON = 1e-6 # Small value to avoid divide by zero
-
-
-class OutputDistribution(abc.ABC):
- @abc.abstractproperty
- def log_probs(self) -> tf.Tensor:
- """
- Returns a Tensor that when evaluated, produces the per-action log probabilities of this distribution.
- The shape of this Tensor should be equivalent to (batch_size x the number of actions) produced in sample.
- """
- pass
-
- @abc.abstractproperty
- def total_log_probs(self) -> tf.Tensor:
- """
- Returns a Tensor that when evaluated, produces the total log probability for a single sample.
- The shape of this Tensor should be equivalent to (batch_size x 1) produced in sample.
- """
- pass
-
- @abc.abstractproperty
- def sample(self) -> tf.Tensor:
- """
- Returns a Tensor that when evaluated, produces a sample of this OutputDistribution.
- """
- pass
-
- @abc.abstractproperty
- def entropy(self) -> tf.Tensor:
- """
- Returns a Tensor that when evaluated, produces the entropy of this distribution.
- """
- pass
-
-
-class DiscreteOutputDistribution(OutputDistribution):
- @abc.abstractproperty
- def sample_onehot(self) -> tf.Tensor:
- """
- Returns a one-hot version of the output.
- """
-
-
-class GaussianDistribution(OutputDistribution):
- """
- A Gaussian output distribution for continuous actions.
- """
-
- class MuSigmaTensors(NamedTuple):
- mu: tf.Tensor
- log_sigma: tf.Tensor
- sigma: tf.Tensor
-
- def __init__(
- self,
- logits: tf.Tensor,
- act_size: List[int],
- reparameterize: bool = False,
- tanh_squash: bool = False,
- condition_sigma: bool = True,
- log_sigma_min: float = -20,
- log_sigma_max: float = 2,
- ):
- """
- A Gaussian output distribution for continuous actions.
- :param logits: Hidden layer to use as the input to the Gaussian distribution.
- :param act_size: List containing the number of continuous actions.
- :param reparameterize: Whether or not to use the reparameterization trick (if False, gradients
- are stopped before the log probability calculation so they do not flow through the sample).
- :param tanh_squash: Squash the output using tanh, constraining it between -1 and 1.
- From: Haarnoja et. al, https://arxiv.org/abs/1801.01290
- :param log_sigma_min: Minimum log standard deviation to clip by.
- :param log_sigma_max: Maximum log standard deviation to clip by.
- """
- encoded = self._create_mu_log_sigma(
- logits,
- act_size,
- log_sigma_min,
- log_sigma_max,
- condition_sigma=condition_sigma,
- )
- self._sampled_policy = self._create_sampled_policy(encoded)
- if not reparameterize:
- _sampled_policy_probs = tf.stop_gradient(self._sampled_policy)
- else:
- _sampled_policy_probs = self._sampled_policy
- self._all_probs = self._create_log_probs(_sampled_policy_probs, encoded)
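- # Optionally squash samples to [-1, 1] with tanh and apply the change-of-variables correction to the log-probs.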
- if tanh_squash:
- self._sampled_policy = tf.tanh(self._sampled_policy)
- self._all_probs = self._do_squash_correction_for_tanh(
- self._all_probs, self._sampled_policy
- )
- self._total_prob = tf.reduce_sum(self._all_probs, axis=1, keepdims=True)
- self._entropy = self._create_entropy(encoded)
-
- def _create_mu_log_sigma(
- self,
- logits: tf.Tensor,
- act_size: List[int],
- log_sigma_min: float,
- log_sigma_max: float,
- condition_sigma: bool,
- ) -> "GaussianDistribution.MuSigmaTensors":
-
- mu = tf.layers.dense(
- logits,
- act_size[0],
- activation=None,
- name="mu",
- kernel_initializer=ModelUtils.scaled_init(0.01),
- reuse=tf.AUTO_REUSE,
- )
-
- if condition_sigma:
- # Policy-dependent log_sigma_sq
- log_sigma = tf.layers.dense(
- logits,
- act_size[0],
- activation=None,
- name="log_std",
- kernel_initializer=ModelUtils.scaled_init(0.01),
- )
- else:
- log_sigma = tf.get_variable(
- "log_std",
- [act_size[0]],
- dtype=tf.float32,
- initializer=tf.zeros_initializer(),
- )
- log_sigma = tf.clip_by_value(log_sigma, log_sigma_min, log_sigma_max)
- sigma = tf.exp(log_sigma)
- return self.MuSigmaTensors(mu, log_sigma, sigma)
-
- def _create_sampled_policy(
- self, encoded: "GaussianDistribution.MuSigmaTensors"
- ) -> tf.Tensor:
- epsilon = tf.random_normal(tf.shape(encoded.mu))
- sampled_policy = encoded.mu + encoded.sigma * epsilon
-
- return sampled_policy
-
- def _create_log_probs(
- self, sampled_policy: tf.Tensor, encoded: "GaussianDistribution.MuSigmaTensors"
- ) -> tf.Tensor:
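- # Per-dimension log density of a diagonal Gaussian evaluated at the sampled action.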
- _gauss_pre = -0.5 * (
- ((sampled_policy - encoded.mu) / (encoded.sigma + EPSILON)) ** 2
- + 2 * encoded.log_sigma
- + np.log(2 * np.pi)
- )
- return _gauss_pre
-
- def _create_entropy(
- self, encoded: "GaussianDistribution.MuSigmaTensors"
- ) -> tf.Tensor:
- single_dim_entropy = 0.5 * tf.reduce_mean(
- tf.log(2 * np.pi * np.e) + 2 * encoded.log_sigma
- )
- # Make entropy the right shape
- return tf.ones_like(tf.reshape(encoded.mu[:, 0], [-1])) * single_dim_entropy
-
- def _do_squash_correction_for_tanh(self, probs, squashed_policy):
- """
- Adjust probabilities for squashed sample before output
- """
- adjusted_probs = probs - tf.log(1 - squashed_policy ** 2 + EPSILON)
- return adjusted_probs
-
- @property
- def total_log_probs(self) -> tf.Tensor:
- return self._total_prob
-
- @property
- def log_probs(self) -> tf.Tensor:
- return self._all_probs
-
- @property
- def sample(self) -> tf.Tensor:
- return self._sampled_policy
-
- @property
- def entropy(self) -> tf.Tensor:
- return self._entropy
-
-
-class MultiCategoricalDistribution(DiscreteOutputDistribution):
- """
- A categorical distribution for multi-branched discrete actions. Also supports action masking.
- """
-
- def __init__(self, logits: tf.Tensor, act_size: List[int], action_masks: tf.Tensor):
- """
- A categorical distribution for multi-branched discrete actions.
- :param logits: Hidden layer to use as the input to the categorical distribution.
- :param act_size: List containing the number of discrete actions per branch.
- :param action_masks: Tensor representing action masks. Should be of length sum(act_size), and 0 for masked
- and 1 for unmasked.
- """
- unmasked_log_probs = self._create_policy_branches(logits, act_size)
- (
- self._sampled_policy,
- self._all_probs,
- action_index,
- ) = self._get_masked_actions_probs(unmasked_log_probs, act_size, action_masks)
- self._sampled_onehot = self._action_onehot(self._sampled_policy, act_size)
- self._entropy = self._create_entropy(self._all_probs, action_index, act_size)
- self._total_prob = self._get_log_probs(
- self._sampled_onehot, self._all_probs, action_index, act_size
- )
-
- def _create_policy_branches(
- self, logits: tf.Tensor, act_size: List[int]
- ) -> List[tf.Tensor]:
- policy_branches = []
- for size in act_size:
- policy_branches.append(
- tf.layers.dense(
- logits,
- size,
- activation=None,
- use_bias=False,
- kernel_initializer=ModelUtils.scaled_init(0.01),
- )
- )
- return policy_branches
-
- def _get_masked_actions_probs(
- self,
- unmasked_log_probs: List[tf.Tensor],
- act_size: List[int],
- action_masks: tf.Tensor,
- ) -> Tuple[tf.Tensor, tf.Tensor, np.ndarray]:
- output, _, all_log_probs = ModelUtils.create_discrete_action_masking_layer(
- unmasked_log_probs, action_masks, act_size
- )
-
- action_idx = [0] + list(np.cumsum(act_size))
- return output, all_log_probs, action_idx
-
- def _action_onehot(self, sample: tf.Tensor, act_size: List[int]) -> tf.Tensor:
- action_oh = tf.concat(
- [tf.one_hot(sample[:, i], act_size[i]) for i in range(len(act_size))],
- axis=1,
- )
- return action_oh
-
- def _get_log_probs(
- self,
- sample_onehot: tf.Tensor,
- all_log_probs: tf.Tensor,
- action_idx: List[int],
- act_size: List[int],
- ) -> tf.Tensor:
- log_probs = tf.reduce_sum(
- (
- tf.stack(
- [
- -tf.nn.softmax_cross_entropy_with_logits_v2(
- labels=sample_onehot[:, action_idx[i] : action_idx[i + 1]],
- logits=all_log_probs[:, action_idx[i] : action_idx[i + 1]],
- )
- for i in range(len(act_size))
- ],
- axis=1,
- )
- ),
- axis=1,
- keepdims=True,
- )
- return log_probs
-
- def _create_entropy(
- self, all_log_probs: tf.Tensor, action_idx: List[int], act_size: List[int]
- ) -> tf.Tensor:
- entropy = tf.reduce_sum(
- (
- tf.stack(
- [
- tf.nn.softmax_cross_entropy_with_logits_v2(
- labels=tf.nn.softmax(
- all_log_probs[:, action_idx[i] : action_idx[i + 1]]
- ),
- logits=all_log_probs[:, action_idx[i] : action_idx[i + 1]],
- )
- for i in range(len(act_size))
- ],
- axis=1,
- )
- ),
- axis=1,
- )
-
- return entropy
-
- @property
- def log_probs(self) -> tf.Tensor:
- return self._all_probs
-
- @property
- def total_log_probs(self) -> tf.Tensor:
- return self._total_prob
-
- @property
- def sample(self) -> tf.Tensor:
- return self._sampled_policy
-
- @property
- def sample_onehot(self) -> tf.Tensor:
- return self._sampled_onehot
-
- @property
- def entropy(self) -> tf.Tensor:
- return self._entropy
diff --git a/ml-agents/mlagents/trainers/tf/model_serialization.py b/ml-agents/mlagents/trainers/tf/model_serialization.py
deleted file mode 100644
index aba884ed2c..0000000000
--- a/ml-agents/mlagents/trainers/tf/model_serialization.py
+++ /dev/null
@@ -1,219 +0,0 @@
-from distutils.util import strtobool
-import os
-from typing import Any, List, Set
-from distutils.version import LooseVersion
-
-try:
- from tf2onnx.tfonnx import process_tf_graph, tf_optimize
- from tf2onnx import optimizer
-
- ONNX_EXPORT_ENABLED = True
-except ImportError:
- # Either onnx and tf2onnx are not installed, or they're not compatible with the version of tensorflow
- ONNX_EXPORT_ENABLED = False
- pass
-
-from mlagents.tf_utils import tf
-
-from tensorflow.python.platform import gfile
-from tensorflow.python.framework import graph_util
-
-from mlagents_envs.logging_util import get_logger
-from mlagents.trainers.settings import SerializationSettings
-from mlagents.trainers.tf import tensorflow_to_barracuda as tf2bc
-
-if LooseVersion(tf.__version__) < LooseVersion("1.12.0"):
- # ONNX is only tested on 1.12.0 and later
- ONNX_EXPORT_ENABLED = False
-
-logger = get_logger(__name__)
-
-
-POSSIBLE_INPUT_NODES = frozenset(
- [
- "action_masks",
- "epsilon",
- "prev_action",
- "recurrent_in",
- "sequence_length",
- "vector_observation",
- ]
-)
-
-POSSIBLE_OUTPUT_NODES = frozenset(["action", "recurrent_out", "value_estimate"])
-
-MODEL_CONSTANTS = frozenset(
- [
- "action_output_shape",
- "is_continuous_control",
- "memory_size",
- "version_number",
- "trainer_major_version",
- "trainer_minor_version",
- "trainer_patch_version",
- ]
-)
-VISUAL_OBSERVATION_PREFIX = "visual_observation_"
-
-
-def export_policy_model(
- model_path: str,
- output_filepath: str,
- behavior_name: str,
- graph: tf.Graph,
- sess: tf.Session,
-) -> None:
- """
- Exports a TF graph for a Policy to .nn and/or .onnx format for Unity embedding.
-
- :param model_path: directory in which the frozen graph definition is saved
- :param output_filepath: file path to output the model (without file suffix)
- :param behavior_name: behavior name of the trained model
- :param graph: Tensorflow Graph for the policy
- :param sess: Tensorflow session for the policy
- """
- frozen_graph_def = _make_frozen_graph(behavior_name, graph, sess)
- if not os.path.exists(output_filepath):
- os.makedirs(output_filepath)
- # Save frozen graph
- frozen_graph_def_path = model_path + "/frozen_graph_def.pb"
- with gfile.GFile(frozen_graph_def_path, "wb") as f:
- f.write(frozen_graph_def.SerializeToString())
-
- # Convert to barracuda
- if SerializationSettings.convert_to_barracuda:
- tf2bc.convert(frozen_graph_def_path, f"{output_filepath}.nn")
- logger.info(f"Exported {output_filepath}.nn")
-
- # Save to onnx too (if we were able to import it)
- if ONNX_EXPORT_ENABLED:
- if SerializationSettings.convert_to_onnx:
- try:
- onnx_graph = convert_frozen_to_onnx(behavior_name, frozen_graph_def)
- onnx_output_path = f"{output_filepath}.onnx"
- with open(onnx_output_path, "wb") as f:
- f.write(onnx_graph.SerializeToString())
- logger.info(f"Converting to {onnx_output_path}")
- except Exception:
- # Make conversion errors fatal depending on environment variables (only done during CI)
- if _enforce_onnx_conversion():
- raise
- logger.exception(
- "Exception trying to save ONNX graph. Please report this error on "
- "https://github.com/Unity-Technologies/ml-agents/issues and "
- "attach a copy of frozen_graph_def.pb"
- )
-
- else:
- if _enforce_onnx_conversion():
- raise RuntimeError(
- "ONNX conversion enforced, but couldn't import dependencies."
- )
-
-
-def _make_frozen_graph(
- behavior_name: str, graph: tf.Graph, sess: tf.Session
-) -> tf.GraphDef:
- with graph.as_default():
- target_nodes = ",".join(_process_graph(behavior_name, graph))
- graph_def = graph.as_graph_def()
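- # Convert all variables to constants so the exported graph is self-contained.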
- output_graph_def = graph_util.convert_variables_to_constants(
- sess, graph_def, target_nodes.replace(" ", "").split(",")
- )
- return output_graph_def
-
-
-def convert_frozen_to_onnx(behavior_name: str, frozen_graph_def: tf.GraphDef) -> Any:
- # This is basically https://github.com/onnx/tensorflow-onnx/blob/master/tf2onnx/convert.py
-
- inputs = _get_input_node_names(frozen_graph_def)
- outputs = _get_output_node_names(frozen_graph_def)
- logger.info(f"onnx export - inputs:{inputs} outputs:{outputs}")
-
- frozen_graph_def = tf_optimize(
- inputs, outputs, frozen_graph_def, fold_constant=True
- )
-
- with tf.Graph().as_default() as tf_graph:
- tf.import_graph_def(frozen_graph_def, name="")
- with tf.Session(graph=tf_graph):
- g = process_tf_graph(
- tf_graph,
- input_names=inputs,
- output_names=outputs,
- opset=SerializationSettings.onnx_opset,
- )
-
- onnx_graph = optimizer.optimize_graph(g)
- model_proto = onnx_graph.make_model(behavior_name)
-
- return model_proto
-
-
-def _get_input_node_names(frozen_graph_def: Any) -> List[str]:
- """
- Get the list of input node names from the graph.
- Names are suffixed with ":0"
- """
- node_names = _get_frozen_graph_node_names(frozen_graph_def)
- input_names = node_names & POSSIBLE_INPUT_NODES
-
- # Check visual inputs sequentially, and exit as soon as we don't find one
- vis_index = 0
- while True:
- vis_node_name = f"{VISUAL_OBSERVATION_PREFIX}{vis_index}"
- if vis_node_name in node_names:
- input_names.add(vis_node_name)
- else:
- break
- vis_index += 1
- # Append the port
- return [f"{n}:0" for n in input_names]
-
-
-def _get_output_node_names(frozen_graph_def: Any) -> List[str]:
- """
- Get the list of output node names from the graph.
- Also include constants, so that they will be readable by the
- onnx importer.
- Names are suffixed with ":0"
- """
- node_names = _get_frozen_graph_node_names(frozen_graph_def)
- output_names = node_names & (POSSIBLE_OUTPUT_NODES | MODEL_CONSTANTS)
- # Append the port
- return [f"{n}:0" for n in output_names]
-
-
-def _get_frozen_graph_node_names(frozen_graph_def: Any) -> Set[str]:
- """
- Get all the node names from the graph.
- """
- names = set()
- for node in frozen_graph_def.node:
- names.add(node.name)
- return names
-
-
-def _process_graph(behavior_name: str, graph: tf.Graph) -> List[str]:
- """
- Gets the list of the output nodes present in the graph for inference
- :return: list of node names
- """
- all_nodes = [x.name for x in graph.as_graph_def().node]
- nodes = [x for x in all_nodes if x in POSSIBLE_OUTPUT_NODES | MODEL_CONSTANTS]
- logger.info("List of nodes to export for behavior :" + behavior_name)
- for n in nodes:
- logger.info("\t" + n)
- return nodes
-
-
-def _enforce_onnx_conversion() -> bool:
- env_var_name = "TEST_ENFORCE_ONNX_CONVERSION"
- if env_var_name not in os.environ:
- return False
-
- val = os.environ[env_var_name]
- try:
- # This handles e.g. "false" converting reasonably to False
- return strtobool(val)
- except Exception:
- return False
diff --git a/ml-agents/mlagents/trainers/tf/models.py b/ml-agents/mlagents/trainers/tf/models.py
deleted file mode 100644
index 8cbf08b866..0000000000
--- a/ml-agents/mlagents/trainers/tf/models.py
+++ /dev/null
@@ -1,719 +0,0 @@
-from typing import Callable, Dict, List, Tuple, NamedTuple
-
-import numpy as np
-from mlagents.tf_utils import tf
-from mlagents.trainers.settings import EncoderType, ScheduleType
-
-from mlagents.trainers.exception import UnityTrainerException
-
-ActivationFunction = Callable[[tf.Tensor], tf.Tensor]
-EncoderFunction = Callable[
- [tf.Tensor, int, ActivationFunction, int, str, bool], tf.Tensor
-]
-
-EPSILON = 1e-7
-
-
-class Tensor3DShape(NamedTuple):
- height: int
- width: int
- num_channels: int
-
-
-class NormalizerTensors(NamedTuple):
- init_op: tf.Operation
- update_op: tf.Operation
- steps: tf.Tensor
- running_mean: tf.Tensor
- running_variance: tf.Tensor
-
-
-class ModelUtils:
- # Minimum supported side for each encoder type. If refactoring an encoder, please
- # adjust these also.
- MIN_RESOLUTION_FOR_ENCODER = {
- EncoderType.MATCH3: 5,
- EncoderType.SIMPLE: 20,
- EncoderType.NATURE_CNN: 36,
- EncoderType.RESNET: 15,
- }
-
- @staticmethod
- def create_global_steps():
- """Creates TF ops to track and increment global training step."""
- global_step = tf.Variable(
- 0, name="global_step", trainable=False, dtype=tf.int64
- )
- steps_to_increment = tf.placeholder(
- shape=[], dtype=tf.int64, name="steps_to_increment"
- )
- increment_step = tf.assign(global_step, tf.add(global_step, steps_to_increment))
- return global_step, increment_step, steps_to_increment
-
- @staticmethod
- def create_schedule(
- schedule: ScheduleType,
- parameter: float,
- global_step: tf.Tensor,
- max_step: int,
- min_value: float,
- ) -> tf.Tensor:
- """
- Create a tensor for a parameter (e.g. a learning rate) that follows a schedule.
- :param schedule: Type of schedule (constant or linear decay).
- :param parameter: Base value of the parameter.
- :param global_step: A TF Tensor representing the total global step.
- :param max_step: The maximum number of steps in the training run.
- :param min_value: The minimum value the parameter may decay to.
- :return: A Tensor containing the scheduled parameter.
- """
- if schedule == ScheduleType.CONSTANT:
- parameter_rate = tf.Variable(parameter, trainable=False)
- elif schedule == ScheduleType.LINEAR:
- parameter_rate = tf.train.polynomial_decay(
- parameter, global_step, max_step, min_value, power=1.0
- )
- else:
- raise UnityTrainerException(f"The schedule {schedule} is invalid.")
- return parameter_rate
-
- @staticmethod
- def scaled_init(scale):
- return tf.initializers.variance_scaling(scale)
-
- @staticmethod
- def swish(input_activation: tf.Tensor) -> tf.Tensor:
- """Swish activation function. For more info: https://arxiv.org/abs/1710.05941"""
- return tf.multiply(input_activation, tf.nn.sigmoid(input_activation))
-
- @staticmethod
- def create_visual_input(camera_parameters: Tensor3DShape, name: str) -> tf.Tensor:
- """
- Creates image input op.
- :param camera_parameters: Parameters for visual observation.
- :param name: Desired name of input op.
- :return: input op.
- """
- o_size_h = camera_parameters.height
- o_size_w = camera_parameters.width
- c_channels = camera_parameters.num_channels
-
- visual_in = tf.placeholder(
- shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32, name=name
- )
- return visual_in
-
- @staticmethod
- def create_input_placeholders(
- observation_shapes: List[Tuple], name_prefix: str = ""
- ) -> Tuple[tf.Tensor, List[tf.Tensor]]:
- """
- Creates input placeholders for visual inputs.
- :param observation_shapes: A List of tuples that specify the resolutions
- of the input observations. Tuples for now are restricted to 1D (vector) or 3D (Tensor)
- :param name_prefix: A name prefix to add to the placeholder names. This is used so that there
- is no conflict when creating multiple placeholder sets.
- :returns: A vector observation placeholder and a List of visual placeholders where the input observations should be fed.
- """
- visual_in: List[tf.Tensor] = []
- vector_in_size = 0
- for i, dimension in enumerate(observation_shapes):
- if len(dimension) == 3:
- _res = Tensor3DShape(
- height=dimension[0], width=dimension[1], num_channels=dimension[2]
- )
- visual_input = ModelUtils.create_visual_input(
- _res, name=name_prefix + "visual_observation_" + str(i)
- )
- visual_in.append(visual_input)
- elif len(dimension) == 1:
- vector_in_size += dimension[0]
- else:
- raise UnityTrainerException(
- f"Unsupported shape of {dimension} for observation {i}"
- )
- vector_in = tf.placeholder(
- shape=[None, vector_in_size],
- dtype=tf.float32,
- name=name_prefix + "vector_observation",
- )
- return vector_in, visual_in
-
- @staticmethod
- def create_vector_input(
- vec_obs_size: int, name: str = "vector_observation"
- ) -> tf.Tensor:
- """
- Creates ops for vector observation input.
- :param vec_obs_size: Size of stacked vector observation.
- :param name: Name of the placeholder op.
- :return: Placeholder for vector observations.
- """
- vector_in = tf.placeholder(
- shape=[None, vec_obs_size], dtype=tf.float32, name=name
- )
- return vector_in
-
- @staticmethod
- def normalize_vector_obs(
- vector_obs: tf.Tensor,
- running_mean: tf.Tensor,
- running_variance: tf.Tensor,
- normalization_steps: tf.Tensor,
- ) -> tf.Tensor:
- """
- Create a normalized version of an input tensor.
- :param vector_obs: Input vector observation tensor.
- :param running_mean: Tensorflow tensor representing the current running mean.
- :param running_variance: Tensorflow tensor representing the current running variance.
- :param normalization_steps: Tensorflow tensor representing the current number of normalization_steps.
- :return: A normalized version of vector_obs.
- """
- normalized_state = tf.clip_by_value(
- (vector_obs - running_mean)
- / tf.sqrt(
- running_variance / (tf.cast(normalization_steps, tf.float32) + 1)
- ),
- -5,
- 5,
- name="normalized_state",
- )
- return normalized_state
-
- @staticmethod
- def create_normalizer(vector_obs: tf.Tensor) -> NormalizerTensors:
- """
- Creates the normalizer and the variables required to store its state.
- :param vector_obs: A Tensor representing the next value to normalize. When the
- update operation is called, it will use vector_obs to update the running mean
- and variance.
- :return: A NormalizerTensors tuple that holds running mean, running variance, number of steps,
- and the update operation.
- """
- vec_obs_size = vector_obs.shape[1]
-
- steps = tf.get_variable(
- "normalization_steps",
- [],
- trainable=False,
- dtype=tf.int64,
- initializer=tf.zeros_initializer(),
- )
- running_mean = tf.get_variable(
- "running_mean",
- [vec_obs_size],
- trainable=False,
- dtype=tf.float32,
- initializer=tf.zeros_initializer(),
- )
- running_variance = tf.get_variable(
- "running_variance",
- [vec_obs_size],
- trainable=False,
- dtype=tf.float32,
- initializer=tf.ones_initializer(),
- )
- (
- initialize_normalization,
- update_normalization,
- ) = ModelUtils.create_normalizer_update(
- vector_obs, steps, running_mean, running_variance
- )
- return NormalizerTensors(
- initialize_normalization,
- update_normalization,
- steps,
- running_mean,
- running_variance,
- )
-
- @staticmethod
- def create_normalizer_update(
- vector_input: tf.Tensor,
- steps: tf.Tensor,
- running_mean: tf.Tensor,
- running_variance: tf.Tensor,
- ) -> Tuple[tf.Operation, tf.Operation]:
- """
- Creates the update operation for the normalizer.
- :param vector_input: Vector observation to use for updating the running mean and variance.
- :param running_mean: Tensorflow tensor representing the current running mean.
- :param running_variance: Tensorflow tensor representing the current running variance.
- :param steps: Tensorflow tensor representing the current number of steps that have been normalized.
- :return: A TF operation that updates the normalization based on vector_input.
- """
- # Based on Welford's algorithm for running mean and standard deviation, for batch updates. Discussion here:
- # https://stackoverflow.com/questions/56402955/whats-the-formula-for-welfords-algorithm-for-variance-std-with-batch-updates
- steps_increment = tf.shape(vector_input)[0]
- total_new_steps = tf.add(steps, tf.cast(steps_increment, dtype=tf.int64))
-
- # Compute the incremental update and divide by the number of new steps.
- input_to_old_mean = tf.subtract(vector_input, running_mean)
- new_mean = running_mean + tf.reduce_sum(
- input_to_old_mean / tf.cast(total_new_steps, dtype=tf.float32), axis=0
- )
- # Compute difference of input to the new mean for Welford update
- input_to_new_mean = tf.subtract(vector_input, new_mean)
- new_variance = running_variance + tf.reduce_sum(
- input_to_new_mean * input_to_old_mean, axis=0
- )
- update_mean = tf.assign(running_mean, new_mean)
- update_variance = tf.assign(running_variance, new_variance)
- update_norm_step = tf.assign(steps, total_new_steps)
- # First mean and variance calculated normally
- initial_mean, initial_variance = tf.nn.moments(vector_input, axes=[0])
- initialize_mean = tf.assign(running_mean, initial_mean)
- # Multiplied by total_new_step because it is divided by total_new_step in the normalization
- initialize_variance = tf.assign(
- running_variance,
- (initial_variance + EPSILON) * tf.cast(total_new_steps, dtype=tf.float32),
- )
- return (
- tf.group([initialize_mean, initialize_variance, update_norm_step]),
- tf.group([update_mean, update_variance, update_norm_step]),
- )
-
- @staticmethod
- def create_vector_observation_encoder(
- observation_input: tf.Tensor,
- h_size: int,
- activation: ActivationFunction,
- num_layers: int,
- scope: str,
- reuse: bool,
- ) -> tf.Tensor:
- """
- Builds a set of hidden state encoders.
- :param reuse: Whether to re-use the weights within the same scope.
- :param scope: Graph scope for the encoder ops.
- :param observation_input: Input vector.
- :param h_size: Hidden layer size.
- :param activation: What type of activation function to use for layers.
- :param num_layers: number of hidden layers to create.
- :return: List of hidden layer tensors.
- """
- with tf.variable_scope(scope):
- hidden = observation_input
- for i in range(num_layers):
- hidden = tf.layers.dense(
- hidden,
- h_size,
- activation=activation,
- reuse=reuse,
- name=f"hidden_{i}",
- kernel_initializer=tf.initializers.variance_scaling(1.0),
- )
- return hidden
-
- @staticmethod
- def create_visual_observation_encoder(
- image_input: tf.Tensor,
- h_size: int,
- activation: ActivationFunction,
- num_layers: int,
- scope: str,
- reuse: bool,
- ) -> tf.Tensor:
- """
- Builds a simple two-layer convolutional visual encoder.
- :param image_input: The placeholder for the image input to use.
- :param h_size: Hidden layer size.
- :param activation: What type of activation function to use for layers.
- :param num_layers: number of hidden layers to create.
- :param scope: The scope of the graph within which to create the ops.
- :param reuse: Whether to re-use the weights within the same scope.
- :return: List of hidden layer tensors.
- """
- with tf.variable_scope(scope):
- conv1 = tf.layers.conv2d(
- image_input,
- 16,
- kernel_size=[8, 8],
- strides=[4, 4],
- activation=tf.nn.elu,
- reuse=reuse,
- name="conv_1",
- )
- conv2 = tf.layers.conv2d(
- conv1,
- 32,
- kernel_size=[4, 4],
- strides=[2, 2],
- activation=tf.nn.elu,
- reuse=reuse,
- name="conv_2",
- )
- hidden = tf.layers.flatten(conv2)
-
- with tf.variable_scope(scope + "/" + "flat_encoding"):
- hidden_flat = ModelUtils.create_vector_observation_encoder(
- hidden, h_size, activation, num_layers, scope, reuse
- )
- return hidden_flat
-
- @staticmethod
- def create_match3_visual_observation_encoder(
- image_input: tf.Tensor,
- h_size: int,
- activation: ActivationFunction,
- num_layers: int,
- scope: str,
- reuse: bool,
- ) -> tf.Tensor:
- """
- Builds a CNN with the architecture used by King for Candy Crush. Optimized
- for grid-shaped boards, such as with Match-3 games.
- :param image_input: The placeholder for the image input to use.
- :param h_size: Hidden layer size.
- :param activation: What type of activation function to use for layers.
- :param num_layers: number of hidden layers to create.
- :param scope: The scope of the graph within which to create the ops.
- :param reuse: Whether to re-use the weights within the same scope.
- :return: List of hidden layer tensors.
- """
- with tf.variable_scope(scope):
- conv1 = tf.layers.conv2d(
- image_input,
- 35,
- kernel_size=[3, 3],
- strides=[1, 1],
- activation=tf.nn.elu,
- reuse=reuse,
- name="conv_1",
- )
- conv2 = tf.layers.conv2d(
- conv1,
- 144,
- kernel_size=[3, 3],
- strides=[1, 1],
- activation=tf.nn.elu,
- reuse=reuse,
- name="conv_2",
- )
- hidden = tf.layers.flatten(conv2)
-
- with tf.variable_scope(scope + "/" + "flat_encoding"):
- hidden_flat = ModelUtils.create_vector_observation_encoder(
- hidden, h_size, activation, num_layers, scope, reuse
- )
- return hidden_flat
-
- @staticmethod
- def create_nature_cnn_visual_observation_encoder(
- image_input: tf.Tensor,
- h_size: int,
- activation: ActivationFunction,
- num_layers: int,
- scope: str,
- reuse: bool,
- ) -> tf.Tensor:
- """
- Builds a visual encoder using the Nature CNN architecture (Mnih et al. 2015).
- :param image_input: The placeholder for the image input to use.
- :param h_size: Hidden layer size.
- :param activation: What type of activation function to use for layers.
- :param num_layers: number of hidden layers to create.
- :param scope: The scope of the graph within which to create the ops.
- :param reuse: Whether to re-use the weights within the same scope.
- :return: List of hidden layer tensors.
- """
- with tf.variable_scope(scope):
- conv1 = tf.layers.conv2d(
- image_input,
- 32,
- kernel_size=[8, 8],
- strides=[4, 4],
- activation=tf.nn.elu,
- reuse=reuse,
- name="conv_1",
- )
- conv2 = tf.layers.conv2d(
- conv1,
- 64,
- kernel_size=[4, 4],
- strides=[2, 2],
- activation=tf.nn.elu,
- reuse=reuse,
- name="conv_2",
- )
- conv3 = tf.layers.conv2d(
- conv2,
- 64,
- kernel_size=[3, 3],
- strides=[1, 1],
- activation=tf.nn.elu,
- reuse=reuse,
- name="conv_3",
- )
- hidden = tf.layers.flatten(conv3)
-
- with tf.variable_scope(scope + "/" + "flat_encoding"):
- hidden_flat = ModelUtils.create_vector_observation_encoder(
- hidden, h_size, activation, num_layers, scope, reuse
- )
- return hidden_flat
-
- @staticmethod
- def create_resnet_visual_observation_encoder(
- image_input: tf.Tensor,
- h_size: int,
- activation: ActivationFunction,
- num_layers: int,
- scope: str,
- reuse: bool,
- ) -> tf.Tensor:
- """
- Builds a set of resnet visual encoders.
- :param image_input: The placeholder for the image input to use.
- :param h_size: Hidden layer size.
- :param activation: What type of activation function to use for layers.
- :param num_layers: number of hidden layers to create.
- :param scope: The scope of the graph within which to create the ops.
- :param reuse: Whether to re-use the weights within the same scope.
- :return: List of hidden layer tensors.
- """
- n_channels = [16, 32, 32] # channel for each stack
- n_blocks = 2 # number of residual blocks
- with tf.variable_scope(scope):
- hidden = image_input
- for i, ch in enumerate(n_channels):
- hidden = tf.layers.conv2d(
- hidden,
- ch,
- kernel_size=[3, 3],
- strides=[1, 1],
- reuse=reuse,
- name="layer%dconv_1" % i,
- )
- hidden = tf.layers.max_pooling2d(
- hidden, pool_size=[3, 3], strides=[2, 2], padding="same"
- )
- # create residual blocks
- for j in range(n_blocks):
- block_input = hidden
- hidden = tf.nn.relu(hidden)
- hidden = tf.layers.conv2d(
- hidden,
- ch,
- kernel_size=[3, 3],
- strides=[1, 1],
- padding="same",
- reuse=reuse,
- name="layer%d_%d_conv1" % (i, j),
- )
- hidden = tf.nn.relu(hidden)
- hidden = tf.layers.conv2d(
- hidden,
- ch,
- kernel_size=[3, 3],
- strides=[1, 1],
- padding="same",
- reuse=reuse,
- name="layer%d_%d_conv2" % (i, j),
- )
- hidden = tf.add(block_input, hidden)
- hidden = tf.nn.relu(hidden)
- hidden = tf.layers.flatten(hidden)
-
- with tf.variable_scope(scope + "/" + "flat_encoding"):
- hidden_flat = ModelUtils.create_vector_observation_encoder(
- hidden, h_size, activation, num_layers, scope, reuse
- )
- return hidden_flat
-
- @staticmethod
- def get_encoder_for_type(encoder_type: EncoderType) -> EncoderFunction:
- ENCODER_FUNCTION_BY_TYPE = {
- EncoderType.SIMPLE: ModelUtils.create_visual_observation_encoder,
- EncoderType.NATURE_CNN: ModelUtils.create_nature_cnn_visual_observation_encoder,
- EncoderType.RESNET: ModelUtils.create_resnet_visual_observation_encoder,
- EncoderType.MATCH3: ModelUtils.create_match3_visual_observation_encoder,
- }
- return ENCODER_FUNCTION_BY_TYPE.get(
- encoder_type, ModelUtils.create_visual_observation_encoder
- )
-
- @staticmethod
- def break_into_branches(
- concatenated_logits: tf.Tensor, action_size: List[int]
- ) -> List[tf.Tensor]:
- """
- Takes a concatenated set of logits that represent multiple discrete action branches
- and breaks it up into one Tensor per branch.
- :param concatenated_logits: Tensor that represents the concatenated action branches
- :param action_size: List of ints containing the number of possible actions for each branch.
- :return: A List of Tensors containing one tensor per branch.
- """
- action_idx = [0] + list(np.cumsum(action_size))
- branched_logits = [
- concatenated_logits[:, action_idx[i] : action_idx[i + 1]]
- for i in range(len(action_size))
- ]
- return branched_logits
-
- @staticmethod
- def create_discrete_action_masking_layer(
- branches_logits: List[tf.Tensor],
- action_masks: tf.Tensor,
- action_size: List[int],
- ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
- """
- Creates a masking layer for the discrete actions
- :param branches_logits: A List of the unnormalized action probabilities for each branch
- :param action_masks: The mask for the logits. Must be of dimension [None x total_number_of_action]
- :param action_size: A list containing the number of possible actions for each branch
- :return: The sampled action output of dimension [batch_size, num_branches], the concatenated
- normalized probabilities (after softmax and masking),
- and the concatenated normalized log probabilities
- """
- branch_masks = ModelUtils.break_into_branches(action_masks, action_size)
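- # Zero out the softmax probabilities of masked actions, then renormalize each branch to sum to one.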
- raw_probs = [
- tf.multiply(tf.nn.softmax(branches_logits[k]) + EPSILON, branch_masks[k])
- for k in range(len(action_size))
- ]
- normalized_probs = [
- tf.divide(raw_probs[k], tf.reduce_sum(raw_probs[k], axis=1, keepdims=True))
- for k in range(len(action_size))
- ]
- output = tf.concat(
- [
- tf.multinomial(tf.log(normalized_probs[k] + EPSILON), 1)
- for k in range(len(action_size))
- ],
- axis=1,
- )
- return (
- output,
- tf.concat([normalized_probs[k] for k in range(len(action_size))], axis=1),
- tf.concat(
- [
- tf.log(normalized_probs[k] + EPSILON)
- for k in range(len(action_size))
- ],
- axis=1,
- ),
- )
-
- @staticmethod
- def _check_resolution_for_encoder(
- vis_in: tf.Tensor, vis_encoder_type: EncoderType
- ) -> None:
- min_res = ModelUtils.MIN_RESOLUTION_FOR_ENCODER[vis_encoder_type]
- height = vis_in.shape[1]
- width = vis_in.shape[2]
- if height < min_res or width < min_res:
- raise UnityTrainerException(
- f"Visual observation resolution ({width}x{height}) is too small for"
- f"the provided EncoderType ({vis_encoder_type.value}). The min dimension is {min_res}"
- )
-
- @staticmethod
- def create_observation_streams(
- visual_in: List[tf.Tensor],
- vector_in: tf.Tensor,
- num_streams: int,
- h_size: int,
- num_layers: int,
- vis_encode_type: EncoderType = EncoderType.SIMPLE,
- stream_scopes: List[str] = None,
- ) -> List[tf.Tensor]:
- """
- Creates encoding stream for observations.
- :param num_streams: Number of streams to create.
- :param h_size: Size of hidden linear layers in stream.
- :param num_layers: Number of hidden linear layers in stream.
- :param stream_scopes: List of strings (length == num_streams), which contains
- the scopes for each of the streams. None if all under the same TF scope.
- :return: List of encoded streams.
- """
- activation_fn = ModelUtils.swish
- vector_observation_input = vector_in
-
- final_hiddens = []
- for i in range(num_streams):
- # Pick the encoder function based on the EncoderType
- create_encoder_func = ModelUtils.get_encoder_for_type(vis_encode_type)
-
- visual_encoders = []
- hidden_state, hidden_visual = None, None
- _scope_add = stream_scopes[i] if stream_scopes else ""
- if len(visual_in) > 0:
- for j, vis_in in enumerate(visual_in):
- ModelUtils._check_resolution_for_encoder(vis_in, vis_encode_type)
- encoded_visual = create_encoder_func(
- vis_in,
- h_size,
- activation_fn,
- num_layers,
- f"{_scope_add}main_graph_{i}_encoder{j}", # scope
- False, # reuse
- )
- visual_encoders.append(encoded_visual)
- hidden_visual = tf.concat(visual_encoders, axis=1)
- if vector_in.get_shape()[-1] > 0:
- # Don't encode non-existent or 0-shape inputs
- hidden_state = ModelUtils.create_vector_observation_encoder(
- vector_observation_input,
- h_size,
- activation_fn,
- num_layers,
- scope=f"{_scope_add}main_graph_{i}",
- reuse=False,
- )
- if hidden_state is not None and hidden_visual is not None:
- final_hidden = tf.concat([hidden_visual, hidden_state], axis=1)
- elif hidden_state is None and hidden_visual is not None:
- final_hidden = hidden_visual
- elif hidden_state is not None and hidden_visual is None:
- final_hidden = hidden_state
- else:
- raise Exception(
- "No valid network configuration possible. "
- "There are no states or observations in this brain"
- )
- final_hiddens.append(final_hidden)
- return final_hiddens
-
- @staticmethod
- def create_recurrent_encoder(input_state, memory_in, sequence_length, name="lstm"):
- """
- Builds a recurrent encoder for either state or observations (LSTM).
- :param sequence_length: Length of sequence to unroll.
- :param input_state: The input tensor to the LSTM cell.
- :param memory_in: The input memory to the LSTM cell.
- :param name: The scope of the LSTM cell.
- """
- s_size = input_state.get_shape().as_list()[1]
- m_size = memory_in.get_shape().as_list()[1]
- lstm_input_state = tf.reshape(input_state, shape=[-1, sequence_length, s_size])
- memory_in = tf.reshape(memory_in[:, :], [-1, m_size])
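- # The memory vector stores the LSTM cell state (first half) and hidden state (second half) concatenated.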
- half_point = int(m_size / 2)
- with tf.variable_scope(name):
- rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(half_point)
- lstm_vector_in = tf.nn.rnn_cell.LSTMStateTuple(
- memory_in[:, :half_point], memory_in[:, half_point:]
- )
- recurrent_output, lstm_state_out = tf.nn.dynamic_rnn(
- rnn_cell, lstm_input_state, initial_state=lstm_vector_in
- )
-
- recurrent_output = tf.reshape(recurrent_output, shape=[-1, half_point])
- return recurrent_output, tf.concat([lstm_state_out.c, lstm_state_out.h], axis=1)
-
- @staticmethod
- def create_value_heads(
- stream_names: List[str], hidden_input: tf.Tensor
- ) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
- """
- Creates one value estimator head for each reward signal in stream_names.
- Also creates the node corresponding to the mean of all the value heads.
- Returns a dictionary mapping each stream name to the node containing the value estimator head for that signal, along with the mean value node.
- :param stream_names: The list of reward signal names
- :param hidden_input: The last layer of the Critic. The heads will consist of one dense hidden layer on top
- of the hidden input.
- """
- value_heads = {}
- for name in stream_names:
- value = tf.layers.dense(hidden_input, 1, name=f"{name}_value")
- value_heads[name] = value
- value = tf.reduce_mean(list(value_heads.values()), 0)
- return value_heads, value
diff --git a/ml-agents/mlagents/trainers/tf/tensorflow_to_barracuda.py b/ml-agents/mlagents/trainers/tf/tensorflow_to_barracuda.py
deleted file mode 100644
index 59838bf021..0000000000
--- a/ml-agents/mlagents/trainers/tf/tensorflow_to_barracuda.py
+++ /dev/null
@@ -1,1674 +0,0 @@
-# pylint: skip-file
-# flake8: noqa
-from __future__ import print_function
-import numpy as np
-import struct # convert from Python values and C structs
-from mlagents.tf_utils import tf
-import re
-
-# import barracuda
-# from barracuda import Struct
-from mlagents.trainers import barracuda
-from mlagents.trainers.barracuda import Struct
-from google.protobuf import descriptor
-from google.protobuf.json_format import MessageToJson
-
-
-if __name__ == "__main__":
- # Handle command line arguments
- args = barracuda.parse_args(
- description="Convert Tensorflow model to Barracuda binary",
- source_extension=".pb",
- help="input Tensorflow serialized .pb file",
- )
- # The following code can be used as an example of the API used from another module
- # convert() is the main entry point for converter
- import tf.tensorflow_to_barracuda as tf2bc
-
- tf2bc.convert(args.source_file, args.target_file, args.trim_unused_by_output, args)
-
-
-# TODO: support more than 1 LSTM layer per model - prepend scope to names and inputs
-# TODO: support different activation functions in LSTM
-# TODO: strip output Identity node, instead patch upstream layer names
-# TODO: use ScaleBias and Pow with alpha when input is constant Tensor
- # TODO: support all data format types (currently only NHWC)
-# TODO: support all data types (currently only FLOAT, INT32, BOOL)
-# TODO: implement FusedResizeAndPadConv2D
-
-# Important ProtoBuf definitions:
-# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/types.proto
-# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/tensor.proto
-# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/node_def.proto
-#
-# Node descriptions:
-# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/ops/nn_ops.cc
-# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/ops/math_ops.cc
-# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/ops/random_ops.cc
-#
-# Class doc:
-# https://www.tensorflow.org/api_docs/cc/
-#
-known_classes = {
- "Dense": Struct(
- id=1,
- rank=2,
- out_shapes=lambda shapes: [
- [shapes[0][0], 1, 1, shapes[0][1]]
- if len(shapes[0]) > 1
- else [1, 1, 1, 1], # W
- [1, 1, 1, shapes[-1][-1]], # B
- ],
- patch_data=lambda data: [data[0], data[1]],
- ),
- "MatMul": Struct(
- id=1,
- rank=2,
- out_shapes=lambda shapes: [
- [shapes[0][0], 1, 1, shapes[0][1]], # W
- [1, 1, 1, shapes[0][1]], # B
- ],
- patch_data=lambda data: [data[0], np.zeros(np.shape(data[1]))],
- ),
- "BiasAdd": Struct(
- id=51, # implemented as ScaleBias
- out_shapes=lambda shapes: [
- [1, 1, 1, shapes[0][0]], # ONE
- [1, 1, 1, shapes[0][0]], # B
- ],
- patch_data=lambda data: [np.ones(np.shape(data[0])), data[0]],
- ),
- # TODO: NCHW
- "Conv2D": Struct(
- id=20,
- rank=4,
- out_shapes=lambda shapes: [shapes[0], [1, 1, 1, shapes[-1][-1]]], # K # B
- patch_data=lambda data: [data[0], data[1]],
- ),
- "DepthwiseConv2dNative": Struct( # DepthwiseConv2D
- id=21,
- rank=4,
- out_shapes=lambda s: [
- [
- s[0][0],
- s[0][1],
- s[0][3],
- s[0][2],
- ], # K TF:[H, W, in_channels, channel_multiplier] => [H, W, 1, in_channels]
- [1, 1, 1, s[-1][-1]] if len(s) > 1 else [1, 1, 1, s[0][2]], # B
- ],
- patch_data=lambda data: [np.transpose(data[0], (0, 1, 3, 2)), data[1]],
- ),
- "Conv2DBackpropInput": Struct( # Conv2DTranspose
- id=22,
- rank=4,
- out_shapes=lambda s: [
- [
- s[0][0],
- s[0][1],
- s[0][3],
- s[0][2],
- ], # K TF:[H, W, in_channels, out_channels] => [H, W, out_channels, in_channels]
- [1, 1, 1, s[-1][-1]] if len(s) > 1 else [1, 1, 1, s[0][2]], # B
- ],
- patch_data=lambda data: [np.transpose(data[0], (0, 1, 3, 2)), data[1]],
- ),
- "Pad": 29,
- # TODO: 3D
- "ResizeNearestNeighbor": 23, # implemented as Upsample2D
- "ResizeBilinear": 23, # implemented as Upsample2D
- "ResizeBicubic": 23, # implemented as Upsample2D
- "MaxPool": 25,
- "AvgPool": 26,
- "GlobalAveragePool": 28,
- "GlobalAvgPool": 28,
- "Activation": 50,
- "BatchNormalization": Struct(
- id=51, # after fusion implemented as ScaleBias
- out_shapes=lambda shapes: [
- [1, 1, 1, shapes[0][0]], # S
- [1, 1, 1, shapes[0][0]], # B
- ],
- patch_data=lambda data:
- # fuse [gamma, beta, mean, var, epsilon] => [scale, bias]
- # TODO: double-check if epsilon is the last data argument and not the 1st?
- barracuda.fuse_batchnorm_weights(data[0], data[1], data[2], data[3], data[4])
- if len(data) == 5
- else
- # fuse [ONE, beta, mean, var, epsilon] => [scale, bias]
- # TODO: double-check if epsilon is the last data argument and not the 1st?
- barracuda.fuse_batchnorm_weights(
- np.ones(np.shape(data[0])), data[0], data[1], data[2], data[3]
- ),
- ),
- "FusedBatchNorm": Struct(
- id=51, # after fusion implemented as ScaleBias
- out_shapes=lambda shapes: [
- [1, 1, 1, shapes[0][0]], # S
- [1, 1, 1, shapes[0][0]], # B
- ],
- patch_data=lambda data, layer:
- # fuse [gamma, beta, mean, var, epsilon] => [scale, bias]
- barracuda.fuse_batchnorm_weights(
- data[0], data[1], data[2], data[3], get_epsilon(layer)
- ),
- ),
- "BatchNormalizationRuntime": Struct(
- id=52,
- out_shapes=lambda shapes: [
- [1, 1, 1, shapes[0][0]], # G
- [1, 1, 1, shapes[0][0]], # B
- ],
- patch_data=lambda data: [data[0], data[1]]
- if len(data) == 4
- else [np.ones(np.shape(data[0])), data[0]],
- ),
- "InstanceNormalization": Struct( # TODO: epsilon
- id=52,
- out_shapes=lambda shapes: [
- [1, 1, 1, shapes[0][0]], # G
- [1, 1, 1, shapes[0][0]], # B
- ],
- patch_data=lambda data: [data[0], data[1]]
- if len(data) == 2
- else [np.ones(np.shape(data[0])), data[0]],
- ),
- "LRN": 53,
- "RandomStandardNormal": 64,
- "RandomUniform": 65,
- "Multinomial": Struct(id=66, rank=2),
- "OneHot": Struct(id=67, rank=lambda inputs: inputs[0] + 1),
- # Broadcast ops
- "Add": Struct(id=100, rank=lambda inputs: np.max(inputs)),
- "AddV2": Struct(id=100, rank=lambda inputs: np.max(inputs)),
- "Sub": Struct(id=101, rank=lambda inputs: np.max(inputs)),
- "Mul": Struct(id=102, rank=lambda inputs: np.max(inputs)),
- "RealDiv": Struct(id=103, rank=lambda inputs: np.max(inputs)),
- "Pow": Struct(id=104, rank=lambda inputs: np.max(inputs)),
- "Minimum": Struct(id=110, rank=lambda inputs: np.max(inputs)),
- "Maximum": Struct(id=111, rank=lambda inputs: np.max(inputs)),
- # Reduce ops
- "Max": Struct(id=124, rank=lambda inputs: inputs[0] - 1),
- "Mean": Struct(id=125, rank=lambda inputs: inputs[0] - 1),
- "Min": Struct(id=126, rank=lambda inputs: inputs[0] - 1),
- "Prod": Struct(id=127, rank=lambda inputs: inputs[0] - 1),
- "Sum": Struct(id=128, rank=lambda inputs: inputs[0] - 1),
- "Flatten": Struct(id=200, rank=2),
- "Reshape": 201,
- "Concat": 210,
- "StridedSlice": 211,
- "Nop": 0,
-}
-
-requires_runtime_flag = {
- "Dropout": "DropoutRuntime",
- "BatchNormalization": "BatchNormalizationRuntime",
-}
-
-known_activations = {
- "Linear": 0,
- "Relu": 1,
- "Softmax": 2,
- "Tanh": 3,
- "Sigmoid": 4,
- "Elu": 5,
- "Relu6": 6,
- "LeakyRelu": 7,
- "Selu": 8,
- "Swish": 9,
- "LogSoftmax": 10,
- "Softplus": 11,
- "Softsign": 12,
- "Abs": 100,
- "Neg": 101,
- "Ceil": 102,
- "Floor": 104,
- "Sqrt": 111,
- "Exp": 113,
- "Log": 114,
- "Acos": 200,
- "Acosh": 201,
- "Asin": 202,
- "Asinh": 203,
- "Atan": 204,
- "Atanh": 205,
- "Cos": 206,
- "Cosh": 207,
- "Sin": 208,
- "Sinh": 209,
- "Tan": 210,
-}
-
-known_paddings = {"VALID": [0, 0, 0, 0], "SAME": [-1]} # SameUpper
-
-supported_data_formats = {"NHWC"}
-
-known_patterns = {
- # TODO: Flatten pattern using namespace regexp
- repr(["Shape", "StridedSlice", "Pack", "Reshape"]): "Flatten",
- repr(["Shape", "StridedSlice", "Prod", "Pack", "Reshape"]): "Flatten",
- repr(
- ["Shape", "Slice", "Slice", "Prod", "ExpandDims", "ConcatV2", "Reshape"]
- ): "Flatten",
- repr(["Add", "Rsqrt", "Mul", "Mul", "Sub", "Add"]): "BatchNormalization",
- repr(["Add", "Rsqrt", "Mul", "Mul", "Mul", "Sub", "Add"]): "BatchNormalization",
- repr(
- [
- "Mean",
- "StopGradient",
- "SquaredDifference",
- "Mean",
- "Sub",
- "Add",
- "Pow",
- "RealDiv",
- "Mul",
- "Add",
- ]
- ): "InstanceNormalization_ByTensorOrder",
- repr(
- [
- "Mean",
- "StopGradient",
- "SquaredDifference",
- "Mean",
- "Squeeze",
- "Squeeze",
- "Add",
- "Rsqrt",
- "Mul",
- "Mul",
- "Mul",
- "Sub",
- "Add",
- ]
- ): "InstanceNormalization_ByTensorName",
- repr(["MatMul", "BiasAdd"]): "Dense",
- repr(["Conv2D", "BiasAdd"]): "Conv2D",
- repr(["DepthwiseConv2dNative", "BiasAdd"]): "DepthwiseConv2dNative",
- repr(["Conv2DBackpropInput", "BiasAdd"]): "Conv2DBackpropInput",
- repr(["Conv2DBackpropInput"]): "Conv2DBackpropInput",
- repr(
- [
- "Shape",
- "StridedSlice",
- "StridedSlice",
- "StridedSlice",
- "Mul",
- "Mul",
- "Pack",
- "Conv2DBackpropInput",
- "BiasAdd",
- ]
- ): "Conv2DBackpropInput",
- repr(
- [
- "Shape",
- "StridedSlice",
- "StridedSlice",
- "StridedSlice",
- "Mul",
- "Mul",
- "Pack",
- "Conv2DBackpropInput",
- ]
- ): "Conv2DBackpropInput",
- repr(
- ["Shape", "StridedSlice", "Mul", "ResizeNearestNeighbor"]
- ): "ResizeNearestNeighbor",
- repr(
- ["Pack", "Reshape"]
- ): "Flatten$", # for now we assume that this combination is trivial Flatten
- # for example it is used in ML-agents LSTM nets with sequence_length==1
- repr(
- [
- "StridedSlice",
- "Reshape",
- re.compile("^lstm/"),
- "Reshape",
- "ConcatV2",
- "Identity",
- ]
- ): "BasicLSTMReshapeOut",
- repr(
- [re.compile("^lstm/"), "Reshape", "ConcatV2", "Identity"]
- ): "BasicLSTMReshapeOut",
- repr(
- ["Reshape", re.compile("^lstm_[a-z]*/"), "Reshape", "ConcatV2"]
- ): "BasicLSTMReshapeOut",
- repr(["Reshape", re.compile("^lstm_[a-z]*/"), "ConcatV2"]): "BasicLSTMConcatOut",
- repr(["Sigmoid", "Mul"]): "Swish",
- repr(["Mul", "Abs", "Mul", "Add"]): "LeakyRelu",
- repr(
- ["Shape", "Reshape"]
- ): "ReshapeLikeInput0", # shape comes from the 1st node as input[0]
- repr(["Reshape"]): "Reshape",
- repr(["ConcatV2"]): "ConcatV2",
- repr(["Mean"]): "Mean",
- repr(["Pad"]): "Pad",
- repr(["Multinomial"]): "Multinomial",
- repr(["OneHot"]): "OneHot",
- repr(["Square"]): "Square",
- repr(["SquaredDifference"]): "SquaredDifference",
- repr(["StridedSlice"]): "StridedSlice",
- repr(["Squeeze"]): "Squeeze",
- repr(["ExpandDims"]): "ExpandDims",
- # TODO: FusedResizeAndPadConv2D
-}
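# For illustration: with the patterns above the converter collapses a MatMul -> BiasAdd
# pair into a single Dense layer, Sigmoid -> Mul into Swish, Mul -> Abs -> Mul -> Add into
# LeakyRelu, and a Conv2D -> BiasAdd pair into one Conv2D layer.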
-
-
-def by_name(args, name):
- for a in args:
- if a.name.endswith(name):
- return a
-
-
-def by_op(args, op):
- for a in args:
- if a.op == op:
- return a
-
-
-def order_by(args, names):
- ordered = []
- arg_count = len(args)
- for name in names:
- ordered += [a for a in args if a.endswith(name)]
- args = [a for a in args if not a.endswith(name)]
- ordered += args # append what is left
- assert len(ordered) == arg_count
- return ordered
-
-
-transform_patterns = {
- "Flatten": lambda nodes, inputs, tensors, _: Struct(op="Flatten", input=inputs),
- "Flatten$": lambda nodes, inputs, tensors, _: Struct(
- op="Flatten",
- input=[
- inputs[-1]
- ], # take only the last input, assume all other arguments are trivial (like sequence_length==1
- # always in ML-agents LSTM nets)
- ),
- "Reshape": lambda nodes, inputs, tensors, context: Struct(
- op="Reshape",
- rank=len(tensors[0].data)
- if len(tensors)
- > 0 # tensor data is treated as reshape coefficient, if not empty
- else context.layer_ranks[inputs[1]]
- if len(inputs) == 2 # otherwise shape of the 2nd input tensor is used
- else -1,
- input=inputs,
- shape=[
- tensors[0].data[0],
- tensors[0].data[1],
- tensors[0].data[2],
- tensors[0].data[3],
- ]
- if len(tensors) > 0 and len(tensors[0].data) == 4
- else [tensors[0].data[0], 1, tensors[0].data[1], tensors[0].data[2]]
- if len(tensors) > 0 and len(tensors[0].data) == 3
- else [tensors[0].data[0], 1, 1, tensors[0].data[1]]
- if len(tensors) > 0 and len(tensors[0].data) == 2
- else [1, 1, 1, tensors[0].data[0]]
- if len(tensors) > 0 and len(tensors[0].data) == 1
- else [],
- ),
- "ReshapeLikeInput0": lambda nodes, inputs, tensors, context: Struct(
- op="Reshape",
- rank=context.layer_ranks[inputs[0]]
- if len(inputs)
- == 2 # unlike standard 'Reshape' input[0] is used as shape & input[1] as data
- else -1,
- input=[inputs[1], inputs[0]]
- if len(inputs)
- == 2 # unlike standard 'Reshape' input[0] is used as shape & input[1] as data
- else inputs,
- ),
- "Pad": lambda nodes, inputs, tensors, _: Struct(
- op="Pad"
- if (
- len(tensors) > 0
- and np.shape(tensors[0]) == [4, 2]
- and get_attr(nodes[-1], "mode", default="constant").lower() == "constant"
- )
- else "BarracudaUnsupportedPad",
- input=inputs,
- pads=[
- tensors[0].data[1, 0],
- tensors[0].data[1, 1],
- tensors[0].data[2, 0],
- tensors[0].data[2, 1],
- ]
- if len(tensors) > 0 and np.shape(tensors[0]) == [4, 2]
- else [0, 0, 0, 0],
- beta=get_attr(nodes[-1], "constant_values") or 0,
- ),
- "Squeeze": lambda nodes, inputs, tensors, context: Struct(
- op="Nop", # Squeeze is no-operation in Barracuda
- input=inputs,
- rank=context.layer_ranks[inputs[0]] - len(get_attr(nodes[-1], "squeeze_dims"))
- if len(get_attr(nodes[-1], "squeeze_dims")) > 0
- else -1, # if list of squeeze axis is not specified, it is unknown what would be the rank of result
- ),
- "ExpandDims": lambda nodes, inputs, tensors, context: Struct(
- op="Nop", # ExpandDims is no-operation in Barracuda
- input=[inputs[0]],
- rank=context.layer_ranks[inputs[0]] + 1,
- ),
- "Multinomial": lambda nodes, inputs, tensors, _: Struct(
- op="Multinomial",
- input=inputs,
- shape=[int(by_name(tensors, "/num_samples").data[0])],
- # seed = get_attr(nodes[0], 'seed'),
- ),
- "OneHot": lambda nodes, inputs, tensors, _: Struct(
- op="OneHot",
- input=inputs,
- shape=[int(by_name(tensors, "/depth").data[0])],
- alpha=by_name(tensors, "/on_value").data[0],
- beta=by_name(tensors, "/off_value").data[0],
- ),
- "Square": lambda nodes, inputs, tensors, _: Struct(
- op="Mul", input=[inputs[0], inputs[0]] # input * input
- ),
- "ConcatV2": lambda nodes, inputs, tensors, context: Struct(
- op="Concat",
- input=inputs,
- axis=axis_to_barracuda(
- int(by_name(tensors, "/axis").data[0]), context.layer_ranks[inputs[0]]
- ),
- ),
- "StridedSlice": lambda nodes, inputs, tensors, context: strided_slice(
- nodes[-1].name,
- inputs[0],
- context.layer_ranks[inputs[0]],
- begin=tensors[0].data,
- end=tensors[1].data,
- strides=tensors[2].data,
- begin_mask=get_attr(nodes[-1], "begin_mask"),
- end_mask=get_attr(nodes[-1], "end_mask"),
- ellipsis_mask=get_attr(nodes[-1], "ellipsis_mask"),
- new_axis_mask=get_attr(nodes[-1], "new_axis_mask"),
- shrink_axis_mask=get_attr(nodes[-1], "shrink_axis_mask"),
- ),
- "BatchNormalization": lambda nodes, inputs, tensors, _: Struct(
- op="BatchNormalization",
- input=[i for i in inputs]
- + order_by([t.name for t in tensors], ["gamma", "beta", "mean", "variance"]),
- ),
- "InstanceNormalization_ByTensorName": lambda nodes, inputs, tensors, _: Struct(
- op="InstanceNormalization",
- input=[i for i in inputs]
- + order_by([t.name for t in tensors], ["scale", "offset"]),
- ),
- "InstanceNormalization_ByTensorOrder": lambda nodes, inputs, tensors, _: Struct(
- op="InstanceNormalization",
- input=[i for i in inputs] + [t.name for t in tensors][-2:],
- ),
- "Dense": lambda nodes, inputs, tensors, _: Struct(
- op="Dense",
- input=[i for i in inputs] + [t.name for t in tensors],
- data_frmt=get_attr(
- by_op(nodes, "Dense") or by_op(nodes, "MatMul"), "data_format"
- ),
- ),
- "Conv2D": lambda nodes, inputs, tensors, _: Struct(
- op="Conv2D",
- input=[i for i in inputs] + [t.name for t in tensors],
- padding=get_attr(by_op(nodes, "Conv2D"), "padding"),
- strides=get_attr(by_op(nodes, "Conv2D"), "strides"),
- dilations=get_attr(by_op(nodes, "Conv2D"), "dilations"),
- data_frmt=get_attr(by_op(nodes, "Conv2D"), "data_format"),
- ),
- "DepthwiseConv2dNative": lambda nodes, inputs, tensors, _: Struct(
- op="DepthwiseConv2dNative",
- input=[i for i in inputs] + [t.name for t in tensors],
- padding=get_attr(by_op(nodes, "DepthwiseConv2dNative"), "padding"),
- strides=get_attr(by_op(nodes, "DepthwiseConv2dNative"), "strides"),
- dilations=get_attr(by_op(nodes, "DepthwiseConv2dNative"), "dilations"),
- data_frmt=get_attr(by_op(nodes, "DepthwiseConv2dNative"), "data_format"),
- ),
- "Conv2DBackpropInput": lambda nodes, inputs, tensors, _: Struct(
- op="Conv2DBackpropInput",
- input=[i for i in inputs]
- + [t.name for t in tensors][1:][
- -2:
- ], # [1:] - skips the 0th tensor, since Conv2DBackpropInput 0th tensor is 'input_sizes'
- # (which differs from other Conv layers)
- # [-2:] - take only last 2 tensors, this allows to process large patterns with the same code
- padding=get_attr(by_op(nodes, "Conv2DBackpropInput"), "padding"),
- strides=get_attr(by_op(nodes, "Conv2DBackpropInput"), "strides"),
- dilations=get_attr(by_op(nodes, "Conv2DBackpropInput"), "dilations"),
- data_frmt=get_attr(by_op(nodes, "Conv2DBackpropInput"), "data_format"),
- ),
- "ResizeNearestNeighbor": lambda nodes, inputs, tensors, _: Struct(
- op="ResizeNearestNeighbor",
- input=[i for i in inputs],
- ksize=[int(tensors[0].data[0]), int(tensors[0].data[1])]
- if len(tensors) == 1 and len(tensors[0].data) == 2
- else [int(tensors[-1].data[0]), int(tensors[-1].data[1])]
- if len(tensors) >= 4 and len(tensors[-1].data) == 2
- else [1, 1],
- ),
- "Mean": lambda nodes, inputs, tensors, _:
- # take only the last input
- barracuda.mean(nodes[-1].name, inputs[-1], axis=tensors[0].data),
- "SquaredDifference": lambda nodes, inputs, tensors, _: sqr_diff(
- nodes[-1].name, inputs[0], inputs[1]
- ),
- "BasicLSTMReshapeOut": lambda nodes, inputs, tensors, context: basic_lstm(
- nodes, inputs, tensors, context, find_type="Reshape"
- ),
- "BasicLSTMConcatOut": lambda nodes, inputs, tensors, context: basic_lstm(
- nodes, inputs, tensors, context, find_type="ConcatV2"
- ),
- "Swish": lambda nodes, inputs, tensors, _: Struct(op="Swish", input=inputs),
- "LeakyRelu": lambda nodes, inputs, tensors, _: Struct(op="LeakyRelu", input=inputs),
- # TODO:'Round'
- # TODO:'Rsqrt'
-}
-
-
-# Debug
-def debug(s):
- print(s)
- return s
-
-
-# Helper
-def embody(v, default=0):
- return default if v is None else v
-
-
-# Parse
-def get_attr(node, attr_name, default=None):
- if type(node) == Struct:
- if hasattr(node, attr_name):
- return getattr(node, attr_name)
- else:
- return default
-
- # See: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/attr_value.proto
- val = node.attr[attr_name]
-
- if val.HasField("list"):
- # NOTE: can't find a way to identify the type of list BUT it is almost always list(int)
- # except list(float) in FractionalAvg/MaxPool
- if len(val.list.shape) > 0:
- return val.list.shape
- else:
- return val.list.i
- if val.HasField("b"):
- return val.b
- if val.HasField("i"):
- return val.i
- if val.HasField("f"):
- return val.f
- if val.HasField("s"):
- return val.s.decode("utf-8")
- if val.HasField("shape"):
- return val.shape
- if val.HasField("tensor"):
- return val.tensor
- return default
-
-
-def get_epsilon(layer):
- return get_attr(
- layer, "epsilon", default=0.001
- ) # default epsilon taken from tf.layers.batch_normalization
-
-
-def get_layer_rank(layer):
- shape = get_attr(layer, "shape")
- if not shape:
- outputShapes = get_attr(layer, "_output_shapes")
- if outputShapes:
- shape = outputShapes[0]
- if not shape:
- return None
- if isinstance(shape, list):
- return 1
- shape = [dim.size for dim in shape.dim]
- return len(shape)
-
-
-def get_layer_shape(layer):
- shape = get_attr(layer, "shape")
- if not shape:
- return [-1, -1, -1, -1]
- shape = [dim.size for dim in shape.dim]
- if len(shape) == 1:
- return [1, 1, 1, shape[0]]
- if len(shape) == 2:
- return [shape[0], 1, 1, shape[1]]
- if len(shape) == 3:
- return [shape[0], 1, shape[1], shape[2]]
- return shape
-
-
-def get_tensor_dims(tensor):
- if isinstance(tensor, np.ndarray):
- return np.shape(tensor)
-
- dims = []
- if tensor.tensor_shape:
- dims = [v.size for v in tensor.tensor_shape.dim]
- if tensor.float_val:
- dims = np.shape(tensor.float_val)
- if tensor.int_val:
- dims = np.shape(tensor.int_val)
- if tensor.bool_val:
- dims = np.shape(tensor.bool_val)
- return dims
-
-
-def get_tensor_dtype(tensor):
- if isinstance(tensor, np.ndarray):
- return tensor.dtype
-
- dataType = ""
- fields = tensor.ListFields()
-
- for field, value in fields:
- if (
- field.name == "dtype"
- and field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM
- ):
- dataType = field.enum_type.values_by_number.get(value, None).name
-
- return dataType
-
-
-def get_tensor_data(tensor):
- if isinstance(tensor, np.ndarray):
- return tensor.astype(float)
-
- dims = get_tensor_dims(tensor)
- elems = np.product(dims)
-
- if tensor.tensor_content:
- # TODO: support other types
- dataType = get_tensor_dtype(tensor)
- if dataType == "DT_FLOAT":
- data = struct.unpack("<" + str(elems) + "f", tensor.tensor_content)
- elif dataType == "DT_INT32":
- data = struct.unpack("<" + str(elems) + "i", tensor.tensor_content)
- elif dataType == "DT_BOOL":
- data = struct.unpack("<" + str(elems) + "?", tensor.tensor_content)
- else:
- print("UNSUPPORTED: data type", dataType)
- if tensor.float_val:
- data = tensor.float_val
- if tensor.int_val:
- data = np.array(tensor.int_val, dtype=float)
- if tensor.int64_val:
- data = np.array(tensor.int64_val, dtype=float)
- if tensor.bool_val:
- data = np.array(tensor.bool_val, dtype=float)
- return np.array(data).reshape(dims)
-
-
-def flatten(items, enter=lambda x: isinstance(x, list)):
- # http://stackoverflow.com/a/40857703
- # https://github.com/ctmakro/canton/blob/master/canton/misc.py
- """Yield items from any nested iterable; see REF."""
- for x in items:
- if enter(x):
- yield from flatten(x)
- else:
- yield x
-
-
-def replace_strings_in_list(array_of_strigs, replace_with_strings):
- "A value in replace_with_strings can be either single string or list of strings"
- potentially_nested_list = [
- replace_with_strings.get(s) or s for s in array_of_strigs
- ]
- return list(flatten(potentially_nested_list))
-
-
-def remove_duplicates_from_list(array):
- "Preserves the order of elements in the list"
- output = []
- unique = set()
- for a in array:
- if a not in unique:
- unique.add(a)
- output.append(a)
- return output
-
-
-#########################################################
-
-
-def pool_to_HW(shape, data_frmt):
- """ Convert from NHWC|NCHW => HW
- """
- if len(shape) != 4:
- return shape # Not NHWC|NCHW, return as is
- if data_frmt == "NCHW":
- return [shape[2], shape[3]]
- return [shape[1], shape[2]]
-
-
-def strides_to_HW(shape, format):
- return pool_to_HW(shape, format)
-
-
-def axis_to_barracuda(axis, input_rank):
- N = 0
- H = 1
- W = 2
- C = 3
- if axis < 0:
- axis = input_rank + axis
- assert axis >= 0
- assert axis < input_rank
- if input_rank == 4:
- # [NHWC]
- return [N, H, W, C][axis]
- if input_rank == 3:
- # [N_WC]
- return [N, W, C][axis]
- elif input_rank == 2:
- # [N__C]
- return [N, C][axis]
- elif input_rank == 1:
- # [___C]
- return [C][axis]
- return -1
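# Worked examples of the mapping above: axis_to_barracuda(-1, input_rank=2) == 3, i.e. the
# last axis of a rank-2 [N, C] tensor lands in the channel slot of Barracuda's fixed NHWC
# layout, while axis_to_barracuda(1, input_rank=4) == 1 (the H slot).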
-
-
-#########################################################
-
-
-def sqr_diff(name, a, b):
- nn = barracuda.Build(name)
- d = nn.sub(a, b)
- nn.mul(d, d, out=name)
- return nn.layers
-
-
-def strided_slice(
- name,
- input,
- input_rank,
- begin,
- end,
- strides,
- begin_mask,
- end_mask,
- ellipsis_mask,
- new_axis_mask,
- shrink_axis_mask,
-):
- assert input_rank != -1
- begin = begin.astype(np.int32).tolist()
- end = end.astype(np.int32).tolist()
- strides = strides.astype(np.int32).tolist()
-
- # StridedSlice range and mask descriptions:
- # https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/strided-slice
- # TODO: I don't think ellipsis and new-axis masks would work together well with the current implementation
-
- assert len(begin) == len(end)
- assert len(begin) == len(strides)
-
- # prepare begin, end, stride arrays
- output_rank = input_rank
- insert_pos = 0
- while ellipsis_mask:
- ellipsis_mask >>= 1
- insert_pos += 1
-
- # NOTE: begin=0, end=0, stride=1 <= full range from existing axis
- # begin=0, end=0, stride=0 <= new axis OR shrink axis to single 1st element
- # begin=N, end=N, stride=0 <= shrink axis to single Nth element
- while len(begin) < input_rank:
- if insert_pos:
- begin.insert(insert_pos, 0)
- end.insert(insert_pos, 0)
- strides.insert(insert_pos, 1)
- else:
- begin.append(0)
- end.append(0)
- strides.append(1)
- assert len(begin) <= input_rank
-
- descriptor_count = input_rank
- for i in range(len(begin)):
- if begin_mask & (1 << i):
- begin[i] = 0
- if end_mask & (1 << i):
- end[i] = 0
- if new_axis_mask & (1 << i):
- begin[i] = end[i] = strides[i] = 0
- output_rank += 1
- if shrink_axis_mask & (1 << i):
- end[i] = begin[i]
- strides[i] = 0
- output_rank -= 1
-
- # convert to Barracuda layout
- descriptor_count = len(begin)
- assert descriptor_count <= 4
- if descriptor_count == 3:
- begin = [begin[0], 0, begin[1], begin[2]]
- end = [end[0], 0, end[1], end[2]]
- strides = [strides[0], 1, strides[1], strides[2]]
- elif descriptor_count == 2:
- begin = [begin[0], 0, 0, begin[1]]
- end = [end[0], 0, 0, end[1]]
- strides = [strides[0], 1, 1, strides[1]]
- elif descriptor_count == 1:
- begin = [0, 0, 0, begin[0]]
- end = [0, 0, 0, end[0]]
- strides = [1, 1, 1, strides[0]]
-
- nn = barracuda.Build(name)
- nn.strided_slice(input, begin, end, strides, output_rank, out=name)
- return nn.layers
-
-
- # search backwards starting from index_of_actual_output_node for a node whose op matches find_type
-def locate_actual_output_node(
- nodes, index_of_actual_output_node=-1, find_type="Reshape"
-):
- while (-index_of_actual_output_node - 1) < len(nodes) and nodes[
- index_of_actual_output_node
- ].op != find_type:
- index_of_actual_output_node -= 1
- actual_output_node = nodes[index_of_actual_output_node]
- assert -index_of_actual_output_node < len(nodes)
- return actual_output_node
-
-
-def gru(
- nodes,
- inputs,
- tensors,
- context,
- index_of_actual_output_node,
- assert_output_node_op_type=None,
-):
- assert len(inputs) == 2
-
- def find_tensor_by_name(name, default=None):
- nonlocal tensors
- candidates = [t for t in tensors if t.name.endswith(name)]
- return candidates[0].data if candidates else default
-
- input = inputs[-1]
- state = inputs[0]
- gates_kernel = find_tensor_by_name("/gates/kernel")
- gates_bias = find_tensor_by_name(
- "/gates/bias", default=np.zeros(np.shape(gates_kernel)[-1])
- )
- candidate_kernel = find_tensor_by_name("/candidate/kernel")
- candidate_bias = find_tensor_by_name(
- "/candidate/bias", default=np.zeros(np.shape(candidate_kernel)[-1])
- )
- new_state = nodes[-1].name + "_h"
-
- assert np.shape(gates_kernel)[-1] == np.shape(gates_bias)[-1]
- assert np.shape(candidate_kernel)[-1] == np.shape(candidate_bias)[-1]
-
- num_gates = 2
- seq_length = 1
- hidden_size = np.shape(gates_kernel)[-1] // num_gates
-
- gate_kernels = np.split(gates_kernel, num_gates, axis=-1)
- gate_biases = np.split(gates_bias, num_gates, axis=-1)
-
- context.model_tensors["kernel_r"] = gate_kernels[0]
- context.model_tensors["kernel_u"] = gate_kernels[1]
- context.model_tensors["kernel_c"] = candidate_kernel
- context.model_tensors["bias_r"] = gate_biases[0]
- context.model_tensors["bias_u"] = gate_biases[1]
- context.model_tensors["bias_c"] = candidate_bias
-
- context.layer_ranks[state] = 2
-
- new_layers = barracuda.gru(
- "gru",
- input,
- state,
- "kernel_r",
- "kernel_u",
- "kernel_c",
- "bias_r",
- "bias_u",
- "bias_c",
- new_state,
- )
-
- state_shape = [1, 1, seq_length, hidden_size]
- context.model_memories += [state_shape, state, new_state]
-
- # map expected output of the replaced pattern to output from our GRU cell
- actual_output_node = locate_actual_output_node(
- nodes, index_of_actual_output_node, assert_output_node_op_type
- )
- context.map_ignored_layer_to_its_input[actual_output_node.name] = new_state
-
- return new_layers
-
-
-def basic_lstm(nodes, inputs, tensors, context, find_type="Reshape"):
- assert len(inputs) == 2
-
- def find_tensor_by_name(name, default=None):
- nonlocal tensors
- candidates = [t for t in tensors if t.name.endswith(name)]
- return candidates[0].data if candidates else default
-
- def find_forget_bias():
- nonlocal nodes
- nonlocal tensors
- # TODO: make it more fault-tolerant
- # search for scalar float constant that is input to Add node
- # and hope it is not a constant for some complex activation function
- for t in tensors:
- if np.prod(t.shape) == 1 and get_tensor_dtype(t.obj) == "DT_FLOAT":
- for n in nodes:
- if n.op == "Add" and t.name in n.input:
- return t.data
- return np.zeros(1)
-
- input = inputs[-1]
- state_c = inputs[0] + "_c"
- state_h = inputs[0] + "_h"
- kernel = find_tensor_by_name("/kernel")
- bias = find_tensor_by_name("/bias", default=np.zeros(np.shape(kernel)[-1]))
- forget_bias = find_forget_bias()
- new_state_c = nodes[-1].name + "_c"
- new_state_h = nodes[-1].name + "_h"
-
- assert np.shape(kernel)[-1] == np.shape(bias)[-1]
-
- num_gates = 4
- seq_length = 1
- hidden_size = np.shape(kernel)[-1] // num_gates
-
- kernels = np.split(kernel, num_gates, axis=-1)
- biases = np.split(bias, num_gates, axis=-1)
-
- context.model_tensors["kernel_i"] = kernels[0]
- context.model_tensors["kernel_j"] = kernels[1]
- context.model_tensors["kernel_f"] = kernels[2]
- context.model_tensors["kernel_o"] = kernels[3]
- context.model_tensors["bias_i"] = biases[0]
- context.model_tensors["bias_j"] = biases[1]
- context.model_tensors["bias_f"] = biases[2] + forget_bias
- context.model_tensors["bias_o"] = biases[3]
-
- context.layer_ranks[state_c] = 2
- context.layer_ranks[state_h] = 2
-
- # lstm_value/strided_slice/stack => lstm_value
- lstm_name = next(i.name for i in nodes if i.name.startswith("lstm")).split("/")[0]
-
- new_layers = barracuda.lstm(
- lstm_name,
- input,
- state_c,
- state_h,
- "kernel_i",
- "kernel_j",
- "kernel_f",
- "kernel_o",
- "bias_i",
- "bias_j",
- "bias_f",
- "bias_o",
- new_state_c,
- new_state_h,
- )
-
- state_shape = [1, 1, seq_length, hidden_size]
- context.model_memories += [state_shape, state_c, new_state_c]
- context.model_memories += [state_shape, state_h, new_state_h]
-
- # map expected output of the replaced pattern to output from our LSTM cell
- actual_output_node = locate_actual_output_node(nodes, -1, find_type)
- concat_out_node = locate_actual_output_node(nodes, -1, "ConcatV2")
- context.map_ignored_layer_to_its_input[actual_output_node.name] = new_state_h
- context.map_ignored_layer_to_its_input[concat_out_node.name] = new_state_c
-
- return new_layers
-
-
-#########################################################
-
-
-def process_layer(layer, context, args):
- model_tensors = context.model_tensors
- input_shapes = context.input_shapes
- layer_ranks = context.layer_ranks
- map_ignored_layer_to_its_input = context.map_ignored_layer_to_its_input
-
- name = layer.name
- class_name = layer.op
- inputs = (
- layer.input
- ) # Tensorflow inputs are always explicit, but in case of Keras we had 'inputs = layer.input or [prev_layer_name]'
- inputs = replace_strings_in_list(inputs, map_ignored_layer_to_its_input)
-
- if class_name == "Nop":
- assert len(inputs) <= 1
- map_ignored_layer_to_its_input[name] = inputs
- return
-
- if class_name == "Const":
- model_tensors[name] = layer.attr["value"].tensor
- layer_ranks[name] = (
- get_layer_rank(layer) or 1
- ) # we treat constants without shape as rank=1 (scalar converted to tensor)
- return
-
- if class_name == "Placeholder":
- assert inputs == []
- map_ignored_layer_to_its_input[name] = inputs
- input_shapes[name] = get_layer_shape(layer)
- layer_ranks[name] = get_layer_rank(layer)
- return
-
- if class_name == "Identity":
- connected_to_const = len(inputs) == 1 and inputs[0] in model_tensors
- if connected_to_const:
- map_ignored_layer_to_its_input[name] = inputs
- return
- else:
- # treat Identity layer that are connected to processing nodes
- # as output from the network
- class_name = "Linear"
-
- if args.print_layers or args.verbose:
- var_tensors = [i for i in inputs if i not in model_tensors]
- const_tensors = [i for i in inputs if i in model_tensors]
- print(
- "'%s' %s Vars:%s Const:%s" % (name, class_name, var_tensors, const_tensors)
- )
-
- if class_name in known_activations:
- activation = class_name
- class_name = "Activation"
- else:
- activation = "Linear"
-
- if class_name not in known_classes:
- if class_name in requires_runtime_flag:
- print("SKIP:", class_name, "layer is used only for training")
- else:
- print("IGNORED:", class_name, "unknown layer")
- map_ignored_layer_to_its_input[name] = inputs
- return
-
- klass = known_classes[class_name]
- if type(klass) == int:
- klass = Struct(id=klass)
-
- o_l = Struct()
- o_l.type = klass.id
- o_l.class_name = class_name
- o_l.name = name
-
- auto_pad = get_attr(layer, "padding") # layer.attr['padding'].s.decode("utf-8")
- pads = get_attr(layer, "pads")
- strides = get_attr(layer, "strides") # layer.attr['strides'].list.i
- pool_size = get_attr(layer, "ksize") # layer.attr['ksize'].list.i
- shape = get_attr(layer, "shape")
- starts = get_attr(layer, "starts")
- ends = get_attr(layer, "ends")
- slice_strides = get_attr(layer, "slice_strides")
- rank = get_attr(layer, "rank") or get_layer_rank(layer)
- data_frmt = get_attr(
- layer, "data_format"
- ) # layer.attr['data_format'].s.decode("utf-8")
- axis = get_attr(layer, "axis")
- alpha = get_attr(layer, "alpha", default=1)
- beta = get_attr(layer, "beta")
-
- if activation and activation not in known_activations:
- print("IGNORED: unknown activation", activation)
- if auto_pad and auto_pad not in known_paddings:
- print("IGNORED: unknown padding", auto_pad)
- if data_frmt and data_frmt not in supported_data_formats:
- print("UNSUPPORTED: data format", data_frmt)
-
- o_l.activation = known_activations.get(activation) or 0
- o_l.pads = (
- known_paddings.get(auto_pad) if auto_pad else pads or starts or [0, 0, 0, 0]
- )
- o_l.strides = strides_to_HW(strides, data_frmt) if strides else slice_strides or []
- o_l.pool_size = (
- pool_to_HW(pool_size, data_frmt) if pool_size else ends or shape or []
- )
- o_l.axis = embody(axis, default=-1)
- o_l.alpha = embody(alpha, default=1)
- o_l.beta = beta or 0
- o_l.rank = (
- -1
- ) # default initialization, actual value will be set later on in this function
-
- tensor_names = [i for i in inputs if i in model_tensors]
- o_l.tensors = [
- Struct(
- name=x,
- shape=get_tensor_dims(model_tensors[x]),
- data=get_tensor_data(model_tensors[x]),
- )
- for x in tensor_names
- ]
- # Patch shapes & data
- layer_has_model_tensors = len(o_l.tensors) > 0
- if hasattr(klass, "out_shapes") and layer_has_model_tensors:
- shapes = klass.out_shapes([x.shape for x in o_l.tensors])
-
- # if we have more shapes than actual tensors,
- # then create & fill missing tensors with zeros
- in_tensor_num = len(o_l.tensors)
- for index, new_shape in enumerate(shapes):
- if index >= in_tensor_num:
- new_tensor = Struct(
- name=("%s/patch:%i") % (name, index - in_tensor_num),
- shape=new_shape,
- data=np.zeros(new_shape),
- )
- o_l.tensors.append(new_tensor)
- assert len(shapes) <= len(o_l.tensors)
-
- if hasattr(klass, "patch_data"):
- data = [x.data for x in o_l.tensors]
-
- patch_data_fn = klass.patch_data
- patch_data_expected_arg_count = patch_data_fn.__code__.co_argcount
- patch_data_args = (
- (data, layer) if patch_data_expected_arg_count > 1 else (data,)
- )
- tensor_data = patch_data_fn(*patch_data_args)
- o_l.tensors = o_l.tensors[
- : len(tensor_data)
- ] # resize tensor array to match patched data - patching might reduce number of tensors
- for x, data in zip(o_l.tensors, tensor_data):
- x.data = data
-
- # after this point we should have equal amount of shapes and tensors
- assert len(o_l.tensors) == len(shapes)
-
- for x, shape in zip(o_l.tensors, shapes):
- assert x.data.size == np.prod(shape)
- x.shape = shape
-
- o_l.inputs = [i for i in inputs if i not in model_tensors]
-
- else:
- # no 'patch_data' lambda was specified, op does not require tensor args
- o_l.tensors = []
- o_l.inputs = inputs
-
- # Force all tensors to float32
- for x in o_l.tensors:
- x.data = x.data.astype(np.float32)
-
- input_ranks = [layer_ranks.get(i, -1) for i in o_l.inputs]
- for i in o_l.inputs:
- if i not in layer_ranks and "lstm" not in i:
- print("WARNING: rank unknown for tensor", i, "while processing node", name)
- if hasattr(klass, "rank"):
- rank = klass.rank
- if hasattr(rank, "__call__"):
- assert (
- -1 not in input_ranks
- ) # for rank() lambda all input ranks have to be known (not -1)
- rank = rank(input_ranks)
- if rank is None:
-
- def all_elements_equal(arr): # http://stackoverflow.com/q/3844948/
- return arr.count(arr[0]) == len(arr)
-
- assert len(input_ranks) > 0
- assert all_elements_equal(input_ranks)
- rank = input_ranks[0]
- layer_ranks[name] = rank
- o_l.rank = rank
-
- # Layer is ready
- context.layers.append(o_l)
-
-
-class ModelBuilderContext:
- def __init__(self):
- self.layers = []
- self.input_shapes = {}
- self.model_tensors = {}
- self.model_memories = []
- self.layer_ranks = {}
- self.map_ignored_layer_to_its_input = {}
-
-
-def process_model(model, args):
- o_context = ModelBuilderContext()
-
- # Find node patterns
- nodes_as_array = [node for node in model.node]
- nodes_as_array = slow_but_stable_topological_sort(nodes_as_array, verbose=True)
-
- node_index = 0
- while node_index < len(nodes_as_array):
- node = nodes_as_array[node_index]
- match = False
- for pattern_repr, pattern_name in known_patterns.items():
- pattern = eval(pattern_repr)
- if node_index + len(pattern) > len(nodes_as_array):
- continue # pattern too long, skip
-
- require_exact_match = pattern[0] == "Const" or pattern[0] == "Identity"
- pattern_end = node_index
-
- def match_node(node, pattern):
- return node.op == pattern or (
- hasattr(pattern, "match") and pattern.match(node.name)
- )
-
- for p in pattern:
- if not require_exact_match:
- while (
- pattern_end < len(nodes_as_array)
- and nodes_as_array[pattern_end].op != p
- and (
- nodes_as_array[pattern_end].op == "Const"
- or nodes_as_array[pattern_end].op == "Identity"
- )
- ):
- pattern_end += 1
- if pattern_end >= len(nodes_as_array):
- break
-
- match = False
- if hasattr(p, "match"): # regexp
- while pattern_end < len(nodes_as_array) and p.match(
- nodes_as_array[pattern_end].name
- ):
- match = True
- pattern_end += 1
- else: # exact string
- match = nodes_as_array[pattern_end].op == p
- pattern_end += 1
-
- if not match:
- break
-
- def get_tensors(pattern_nodes):
- nonlocal o_context
- map_ignored_layer_to_its_input = (
- o_context.map_ignored_layer_to_its_input
- )
- model_tensors = o_context.model_tensors
-
- # tensors <= all Const nodes within this pattern
- const_nodes = [n for n in pattern_nodes if n.op == "Const"]
-
- # TODO: unify / reuse code from process_layer
- identity_nodes = [n for n in pattern_nodes if n.op == "Identity"]
- for i in identity_nodes:
- inputs = replace_strings_in_list(
- i.input, map_ignored_layer_to_its_input
- )
- map_ignored_layer_to_its_input[i.name] = inputs
-
- # gather inputs from Op nodes (not Const, not Identity)
- op_nodes = [
- n
- for n in pattern_nodes
- if n not in const_nodes and n not in identity_nodes
- ]
- inputs_to_op_nodes = list(
- flatten([list(flatten(n.input)) for n in op_nodes])
- )
- inputs_to_op_nodes = replace_strings_in_list(
- inputs_to_op_nodes, map_ignored_layer_to_its_input
- )
- inputs_to_op_nodes = [i.split(":")[0] for i in inputs_to_op_nodes]
-
- const_nodes_by_name = {n.name: n for n in const_nodes}
- tensors = []
- for i in inputs_to_op_nodes:
- if i in model_tensors:
- src = model_tensors[i]
- tensors += [
- Struct(
- name=i,
- obj=src,
- shape=get_tensor_dims(src),
- data=get_tensor_data(src),
- )
- ]
- elif i in const_nodes_by_name:
- src = const_nodes_by_name[i].attr["value"].tensor
- tensors += [
- Struct(
- name=i,
- obj=src,
- shape=get_tensor_dims(src),
- data=get_tensor_data(src),
- )
- ]
- tensor_names = [n.name for n in tensors]
-
- # filter only inputs that are coming from nodes that are outside this pattern
- # preserve the order
- pattern_nodes = [n.name for n in pattern_nodes] + tensor_names
- # inputs_from_outside_pattern = remove_duplicates_from_list([i for i in inputs_to_op_nodes if
- # nodes_by_name[i] not in pattern_nodes])
- inputs_from_outside_pattern = remove_duplicates_from_list(
- [i for i in inputs_to_op_nodes if i not in pattern_nodes]
- )
-
- return inputs_from_outside_pattern, tensors
-
- if match:
- nodes = nodes_as_array[node_index:pattern_end]
- name = nodes[-1].name
- var_tensors, const_tensors = get_tensors(nodes)
- if args.print_patterns or args.verbose:
- print(
- "PATTERN:",
- name,
- "~~",
- pattern_name,
- "<-",
- var_tensors,
- "+",
- [t.name for t in const_tensors],
- )
- print(" ", pattern)
- for n in nodes:
- if n.op == "Const" or n.op == "Identity":
- process_layer(n, o_context, args)
-
- new_layers = transform_patterns[pattern_name](
- nodes, var_tensors, const_tensors, o_context
- )
- if not isinstance(new_layers, list):
- if not hasattr(new_layers, name):
- new_layers.name = name
- new_layers = [new_layers]
-
- for l in new_layers:
- # TODO: prefix new layer names with scope, patch inputs
- # l.name = name + '/' + l.name
- process_layer(l, o_context, args)
-
- node_index = pattern_end
- break # pattern found & processed
-
- if not match:
- # TODO: gather tensors in the same way as patterns do
- process_layer(node, o_context, args)
- node_index += 1
-
- def find_unconnected_const_nodes(nodes):
- nodes_with_consts = {node.name: node for node in nodes if node.op == "Const"}
- for node in nodes:
- for i in node.input:
- nodes_with_consts.pop(i, None)
- return list(nodes_with_consts.keys())
-
- return (
- o_context.layers,
- o_context.input_shapes,
- o_context.model_tensors,
- o_context.model_memories,
- find_unconnected_const_nodes(nodes_as_array),
- )
-
-
-# Sort nodes so that all input dependencies are satisfied beforehand
-# while preserving original order of the nodes in the model whenever possible.
- # NOTE: preservation of the original order is important for pattern matching
-def slow_but_stable_topological_sort(nodes, verbose):
-
- nodes_with_consts = [node for node in nodes if node.op == "Const"]
- nodes_for_sorting = [node for node in nodes if node.op != "Const"]
-
- # TODO: optimize for performance
- # based on http://blog.gapotchenko.com/stable-topological-sort
-
- def assign_ids(nodes):
- ids = []
- id_by_name = {}
- id = 0
- for node in nodes:
- id_by_name[node.name] = id
- ids.append(id)
- id += 1
-
- inputs_by_id = [None] * len(nodes)
- for node in nodes:
- id = id_by_name[node.name]
- inputs_by_id[id] = {id_by_name.get(i, -1) for i in node.input}
-
- return ids, inputs_by_id
-
- def sort(ids, inputs_by_id, verbose_lambda):
- sorted = False
- n = len(ids)
- while not sorted:
- sorted = True
- for i in range(n):
- for j in range(i):
- if ids[i] in inputs_by_id[ids[j]]:
- tmp = ids.pop(i)
- ids.insert(j, tmp)
- sorted = False
- verbose_lambda(sorted)
- return ids
-
- prefix_printed = False
-
- def print_status(sorted):
- nonlocal prefix_printed
- if not sorted:
- if not prefix_printed:
- print("Sorting model, may take a while...", end="", flush=True)
- prefix_printed = True
- else:
- print(".", end="", flush=True)
- else:
- if prefix_printed:
- print(" Done!")
-
- ids, inputs_by_id = assign_ids(nodes_for_sorting)
- ids = sort(
- ids, inputs_by_id, lambda sorted: print_status(sorted) if verbose else None
- )
-
- assert len(ids) == len(nodes_for_sorting)
- assert len(ids) + len(nodes_with_consts) == len(nodes)
- return nodes_with_consts + [nodes_for_sorting[id] for id in ids]
-
-
-def very_slow_but_stable_topological_sort(nodes, verbose):
- # TODO: optimize for performance
- # based on http://blog.gapotchenko.com/stable-topological-sort
- n = len(nodes)
- sorted = False
-
- while not sorted:
- sorted = True
- for i in range(n):
- for j in range(i):
- if nodes[i].name in nodes[j].input:
- tmp = nodes.pop(i)
- nodes.insert(j, tmp)
- sorted = False
- assert len(nodes) == n
- return nodes
-
-
-#########################################################
-
-
-def convert(
- source_file,
- target_file,
- trim_unused_by_output="",
- verbose=False,
- compress_f16=False,
-):
- """
- Converts a TensorFlow model into a Barracuda model.
- :param source_file: The TensorFlow Model
- :param target_file: The name of the file the converted model will be saved to
- :param trim_unused_by_output: The regexp to match output nodes to remain in the model.
- All other unconnected nodes will be removed.
- :param verbose: If True, will display debug messages
- :param compress_f16: If true, the float values will be converted to f16
- :return:
- """
- if type(verbose) == bool:
- args = Struct()
- args.verbose = verbose
- args.print_layers = verbose
- args.print_source_json = verbose
- args.print_barracuda_json = verbose
- args.print_layer_links = verbose
- args.print_patterns = verbose
- args.print_tensors = verbose
- args.print_supported_ops = verbose
- else:
- args = verbose
-
- if args.print_supported_ops:
- barracuda.print_known_operations(known_classes, known_activations)
-
- # Load Tensorflow model
- print("Converting %s to %s" % (source_file, target_file))
- f = open(source_file, "rb")
- i_model = tf.GraphDef()
- i_model.ParseFromString(f.read())
-
- if args.verbose:
- print("OP_TYPES:", {layer.op for layer in i_model.node})
-
- if args.print_source_json or args.verbose:
- for layer in i_model.node:
- if not layer.op == "Const":
- print("MODEL:", MessageToJson(layer) + ",")
-
- # Convert
- o_model = barracuda.Model()
- o_model.layers, o_input_shapes, o_model.tensors, o_model.memories, o_model.globals = process_model(
- i_model, args
- )
-
- # Cleanup unconnected Identities (they might linger after processing complex node patterns like LSTM)
- def cleanup_layers(layers):
- all_layers = {l.name for l in layers}
- all_inputs = {i for l in layers for i in l.inputs}
-
- def is_unconnected_identity(layer):
- if layer.class_name == "Activation" and layer.activation == 0: # Identity
- assert len(layer.inputs) == 1
- if layer.inputs[0] not in all_layers and layer.name not in all_inputs:
- return True
- return False
-
- return [l for l in layers if not is_unconnected_identity(l)]
-
- o_model.layers = cleanup_layers(o_model.layers)
-
- all_inputs = {i for l in o_model.layers for i in l.inputs}
-
- # Trim
- if trim_unused_by_output:
- o_model.layers = barracuda.trim(
- o_model.layers, trim_unused_by_output, args.verbose
- )
-
- # Create load layer for constants
- def dims_to_barracuda_shape(dims):
- shape = list(dims)
- while len(shape) < 4:
- shape = [1] + shape
- return shape
-
- const_tensors = [i for i in all_inputs if i in o_model.tensors]
- const_tensors += o_model.globals
- for x in const_tensors:
- shape = dims_to_barracuda_shape(get_tensor_dims(o_model.tensors[x]))
- o_l = Struct(
- type=255, # Load
- class_name="Const",
- name=x,
- pads=[0, 0, 0, 0],
- strides=[],
- pool_size=[],
- axis=-1,
- alpha=1,
- beta=0,
- activation=0,
- inputs=[],
- tensors=[
- Struct(
- name=x,
- shape=shape,
- data=np.reshape(get_tensor_data(o_model.tensors[x]), shape).astype(
- np.float32
- ),
- )
- ],
- )
- o_model.layers.insert(0, o_l)
-
- # Find model inputs & outputs
- all_layers = {l.name for l in o_model.layers}
- # global inputs => are inputs that are NOT connected to any layer in the network
- # global outputs => are outputs that are NOT feeding any layer in the network OR are coming from Identity layers
- o_model.inputs = {
- i: o_input_shapes[i]
- for l in o_model.layers
- for i in l.inputs
- if i not in all_layers and i not in o_model.memories
- }
-
- def is_output_layer(layer):
- if (
- layer.class_name == "Const"
- ): # Constants never count as global output even when unconnected
- return False
- if (
- layer.name not in all_inputs
- ): # this layer does not feed into any other layer
- return True
- if (
- layer.class_name == "Activation" and layer.activation == 0
- ): # Identity marks global output
- return True
- return False
-
- o_model.outputs = [l.name for l in o_model.layers if is_output_layer(l)]
-
- # Compress
- if compress_f16:
- o_model = barracuda.compress(o_model)
-
- # Sort model so that layer inputs are always ready upfront
- o_model.layers = barracuda.sort(
- o_model.layers, o_model.inputs, o_model.memories, args.verbose
- )
- o_model.layers = barracuda.fuse(o_model.layers, args.verbose)
-
- # Summary
- barracuda.summary(
- o_model,
- print_layer_links=args.print_layer_links or args.verbose,
- print_barracuda_json=args.print_barracuda_json or args.verbose,
- print_tensors=args.print_tensors or args.verbose,
- )
-
- # Write to file
- barracuda.write(o_model, target_file)
- print("DONE: wrote", target_file, "file.")
diff --git a/ml-agents/mlagents/trainers/trainer/rl_trainer.py b/ml-agents/mlagents/trainers/trainer/rl_trainer.py
index 6e9f2b0e61..93fa60d551 100644
--- a/ml-agents/mlagents/trainers/trainer/rl_trainer.py
+++ b/ml-agents/mlagents/trainers/trainer/rl_trainer.py
@@ -24,19 +24,10 @@
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.agent_processor import AgentManagerQueue
from mlagents.trainers.trajectory import Trajectory
-from mlagents.trainers.settings import TrainerSettings, FrameworkType
+from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.stats import StatsPropertyType
from mlagents.trainers.model_saver.model_saver import BaseModelSaver
-from mlagents.trainers.exception import UnityTrainerException
-from mlagents import tf_utils
-
-if tf_utils.is_available():
- from mlagents.trainers.policy.tf_policy import TFPolicy
- from mlagents.trainers.model_saver.tf_model_saver import TFModelSaver
-else:
- TFPolicy = None # type: ignore
- TFModelSaver = None # type: ignore
logger = get_logger(__name__)
@@ -59,18 +50,11 @@ def __init__(self, *args, **kwargs):
self._stats_reporter.add_property(
StatsPropertyType.HYPERPARAMETERS, self.trainer_settings.as_dict()
)
- self.framework = self.trainer_settings.framework
- if self.framework == FrameworkType.TENSORFLOW and not tf_utils.is_available():
- raise UnityTrainerException(
- "To use the TensorFlow backend, install the TensorFlow Python package first."
- )
-
- logger.debug(f"Using framework {self.framework.value}")
self._next_save_step = 0
self._next_summary_step = 0
self.model_saver = self.create_model_saver(
- self.framework, self.trainer_settings, self.artifact_path, self.load
+ self.trainer_settings, self.artifact_path, self.load
)
def end_episode(self) -> None:
@@ -126,12 +110,7 @@ def create_policy(
behavior_spec: BehaviorSpec,
create_graph: bool = False,
) -> Policy:
- if self.framework == FrameworkType.PYTORCH:
- return self.create_torch_policy(parsed_behavior_id, behavior_spec)
- else:
- return self.create_tf_policy(
- parsed_behavior_id, behavior_spec, create_graph=create_graph
- )
+ return self.create_torch_policy(parsed_behavior_id, behavior_spec)
@abc.abstractmethod
def create_torch_policy(
@@ -142,30 +121,13 @@ def create_torch_policy(
"""
pass
- @abc.abstractmethod
- def create_tf_policy(
- self,
- parsed_behavior_id: BehaviorIdentifiers,
- behavior_spec: BehaviorSpec,
- create_graph: bool = False,
- ) -> TFPolicy:
- """
- Create a Policy object that uses the TensorFlow backend.
- """
- pass
-
@staticmethod
def create_model_saver(
- framework: str, trainer_settings: TrainerSettings, model_path: str, load: bool
+ trainer_settings: TrainerSettings, model_path: str, load: bool
) -> BaseModelSaver:
- if framework == FrameworkType.PYTORCH:
- model_saver = TorchModelSaver( # type: ignore
- trainer_settings, model_path, load
- )
- else:
- model_saver = TFModelSaver( # type: ignore
- trainer_settings, model_path, load
- )
+ model_saver = TorchModelSaver( # type: ignore
+ trainer_settings, model_path, load
+ )
return model_saver
def _policy_mean_reward(self) -> Optional[float]:
@@ -187,7 +149,7 @@ def _checkpoint(self) -> ModelCheckpoint:
"Trainer has multiple policies, but default behavior only saves the first."
)
checkpoint_path = self.model_saver.save_checkpoint(self.brain_name, self.step)
- export_ext = "nn" if self.framework == FrameworkType.TENSORFLOW else "onnx"
+ export_ext = "onnx"
new_checkpoint = ModelCheckpoint(
int(self.step),
f"{checkpoint_path}.{export_ext}",
@@ -214,7 +176,7 @@ def save_model(self) -> None:
model_checkpoint = self._checkpoint()
self.model_saver.copy_final_model(model_checkpoint.file_path)
- export_ext = "nn" if self.framework == FrameworkType.TENSORFLOW else "onnx"
+ export_ext = "onnx"
final_checkpoint = attr.evolve(
model_checkpoint, file_path=f"{self.model_saver.model_path}.{export_ext}"
)
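# With the TensorFlow path gone, checkpoints are always exported with the .onnx extension
# by the Torch model saver. A generic sketch of exporting a torch module to ONNX --
# illustrative only (toy network, made-up tensor names), not the TorchModelSaver internals:
import torch
from torch import nn

net = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 2))
dummy_obs = torch.zeros(1, 8)
torch.onnx.export(
    net, dummy_obs, "model.onnx", input_names=["obs"], output_names=["action"]
)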
diff --git a/ml-agents/mlagents/trainers/trainer/trainer_factory.py b/ml-agents/mlagents/trainers/trainer/trainer_factory.py
index 78419b6062..0cb548baa6 100644
--- a/ml-agents/mlagents/trainers/trainer/trainer_factory.py
+++ b/ml-agents/mlagents/trainers/trainer/trainer_factory.py
@@ -9,7 +9,7 @@
from mlagents.trainers.sac.trainer import SACTrainer
from mlagents.trainers.ghost.trainer import GhostTrainer
from mlagents.trainers.ghost.controller import GhostController
-from mlagents.trainers.settings import TrainerSettings, TrainerType, FrameworkType
+from mlagents.trainers.settings import TrainerSettings, TrainerType
logger = get_logger(__name__)
@@ -26,8 +26,6 @@ def __init__(
param_manager: EnvironmentParameterManager,
init_path: str = None,
multi_gpu: bool = False,
- force_torch: bool = False,
- force_tensorflow: bool = False,
):
"""
The TrainerFactory generates the Trainers based on the configuration passed as
@@ -45,10 +43,6 @@ def __init__(
the EnvironmentParameters must change.
:param init_path: Path from which to load model.
:param multi_gpu: If True, multi-gpu will be used. (currently not available)
- :param force_torch: If True, the Trainers will all use the PyTorch framework
- instead of what is specified in the config YAML.
- :param force_tensorflow: If True, the Trainers will all use the TensorFlow
- framework.
"""
self.trainer_config = trainer_config
self.output_path = output_path
@@ -59,8 +53,6 @@ def __init__(
self.param_manager = param_manager
self.multi_gpu = multi_gpu
self.ghost_controller = GhostController()
- self._force_torch = force_torch
- self._force_tf = force_tensorflow
def generate(self, behavior_name: str) -> Trainer:
if behavior_name not in self.trainer_config.keys():
@@ -69,20 +61,6 @@ def generate(self, behavior_name: str) -> Trainer:
f"in the trainer configuration file: {sorted(self.trainer_config.keys())}"
)
trainer_settings = self.trainer_config[behavior_name]
- if self._force_torch:
- trainer_settings.framework = FrameworkType.PYTORCH
- logger.warning(
- "Note that specifying --torch is not required anymore as PyTorch is the default framework."
- )
- if self._force_tf:
- trainer_settings.framework = FrameworkType.TENSORFLOW
- logger.warning(
- "Setting the framework to TensorFlow. TensorFlow trainers will be deprecated in the future."
- )
- if self._force_torch:
- logger.warning(
- "Both --torch and --tensorflow CLI options were specified. Using TensorFlow."
- )
return TrainerFactory._initialize_trainer(
trainer_settings,
behavior_name,
diff --git a/ml-agents/mlagents/trainers/trainer_controller.py b/ml-agents/mlagents/trainers/trainer_controller.py
index f595b8b198..7f9808f5dd 100644
--- a/ml-agents/mlagents/trainers/trainer_controller.py
+++ b/ml-agents/mlagents/trainers/trainer_controller.py
@@ -8,8 +8,6 @@
from collections import defaultdict
import numpy as np
-from mlagents.tf_utils import tf
-from mlagents import tf_utils
from mlagents_envs.logging_util import get_logger
from mlagents.trainers.env_manager import EnvManager, EnvironmentStep
@@ -29,8 +27,8 @@
from mlagents.trainers.trainer import TrainerFactory
from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers
from mlagents.trainers.agent_processor import AgentManager
-from mlagents.tf_utils.globals import get_rank
from mlagents import torch_utils
+from mlagents.torch_utils.globals import get_rank
class TrainerController:
@@ -50,7 +48,7 @@ def __init__(
:param param_manager: EnvironmentParameterManager object which stores information about all
environment parameters.
:param train: Whether to train model, or only run inference.
- :param training_seed: Seed to use for Numpy and Tensorflow random number generation.
+ :param training_seed: Seed to use for Numpy and Torch random number generation.
:param threaded: Whether or not to run trainers in a separate thread. Disable for testing/debugging.
"""
self.trainers: Dict[str, Trainer] = {}
@@ -67,8 +65,6 @@ def __init__(
self.trainer_threads: List[threading.Thread] = []
self.kill_trainers = False
np.random.seed(training_seed)
- if tf_utils.is_available():
- tf.set_random_seed(training_seed)
torch_utils.torch.manual_seed(training_seed)
self.rank = get_rank()
@@ -136,7 +132,9 @@ def _create_trainer_and_manager(
self.trainer_threads.append(trainerthread)
policy = trainer.create_policy(
- parsed_behavior_id, env_manager.training_behaviors[name_behavior_id]
+ parsed_behavior_id,
+ env_manager.training_behaviors[name_behavior_id],
+ create_graph=True,
)
trainer.add_policy(parsed_behavior_id, policy)
@@ -167,8 +165,6 @@ def _create_trainers_and_managers(
@timed
def start_learning(self, env_manager: EnvManager) -> None:
self._create_output_path(self.output_path)
- if tf_utils.is_available():
- tf.reset_default_graph()
try:
# Initial reset
self._reset_env(env_manager)
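
With TensorFlow removed, the controller's seeding path in `trainer_controller.py` touches only two RNGs. A minimal, self-contained sketch of that behaviour (the helper name `seed_everything` is illustrative, not the project's API):

```python
# Minimal sketch: after this change only NumPy and PyTorch are seeded; the
# old tf.set_random_seed call has no replacement because the TF trainers are gone.
import numpy as np
import torch

def seed_everything(training_seed: int) -> None:
    np.random.seed(training_seed)     # seeds NumPy's global RNG
    torch.manual_seed(training_seed)  # seeds PyTorch's CPU and CUDA generators

seed_everything(1337)
```
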
diff --git a/ml-agents/mlagents/trainers/training_status.py b/ml-agents/mlagents/trainers/training_status.py
index 3eff30b84a..41ea9e907e 100644
--- a/ml-agents/mlagents/trainers/training_status.py
+++ b/ml-agents/mlagents/trainers/training_status.py
@@ -6,7 +6,6 @@
import cattr
from mlagents.torch_utils import torch
-from mlagents.tf_utils import tf, is_available as tf_is_available
from mlagents_envs.logging_util import get_logger
from mlagents.trainers import __version__
from mlagents.trainers.exception import TrainerError
@@ -28,7 +27,6 @@ class StatusMetaData:
stats_format_version: str = STATUS_FORMAT_VERSION
mlagents_version: str = __version__
torch_version: str = torch.__version__
- tensorflow_version: str = tf.__version__ if tf_is_available() else -1
def to_dict(self) -> Dict[str, str]:
return cattr.unstructure(self)
@@ -47,10 +45,6 @@ def check_compatibility(self, other: "StatusMetaData") -> None:
logger.warning(
"Checkpoint was loaded from a different version of ML-Agents. Some things may not resume properly."
)
- if self.tensorflow_version != other.tensorflow_version:
- logger.warning(
- "Tensorflow checkpoint was saved with a different version of Tensorflow. Model may not resume properly."
- )
if self.torch_version != other.torch_version:
logger.warning(
"PyTorch checkpoint was saved with a different version of PyTorch. Model may not resume properly."
diff --git a/ml-agents/setup.py b/ml-agents/setup.py
index e9338ca45e..55fcc94467 100644
--- a/ml-agents/setup.py
+++ b/ml-agents/setup.py
@@ -80,5 +80,4 @@ def run(self):
]
},
cmdclass={"verify": VerifyVersionCommand},
- extras_require={"tensorflow": ["tensorflow>=1.14,<3.0", "six>=1.12.0"]},
)
diff --git a/ml-agents/tests/yamato/yamato_utils.py b/ml-agents/tests/yamato/yamato_utils.py
index ad61223eb4..c2612054ab 100644
--- a/ml-agents/tests/yamato/yamato_utils.py
+++ b/ml-agents/tests/yamato/yamato_utils.py
@@ -112,18 +112,15 @@ def init_venv(
# Set up the venv and install mlagents
subprocess.check_call(f"python -m venv {venv_path}", shell=True)
- pip_commands = [
- "--upgrade pip",
- "--upgrade setuptools",
- # TODO build these and publish to internal pypi
- "~/tensorflow_pkg/tensorflow-2.0.0-cp37-cp37m-macosx_10_14_x86_64.whl",
- "tf2onnx==1.6.1",
- ]
+ pip_commands = ["--upgrade pip", "--upgrade setuptools"]
if mlagents_python_version:
# install from pypi
pip_commands += [
f"mlagents=={mlagents_python_version}",
f"gym-unity=={mlagents_python_version}",
+ # TODO build these and publish to internal pypi
+ "~/tensorflow_pkg/tensorflow-2.0.0-cp37-cp37m-macosx_10_14_x86_64.whl",
+ "tf2onnx==1.6.1",
]
else:
# Local install
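
For the yamato helper, the practical effect is that the TensorFlow wheel and the `tf2onnx` pin are now installed only on the "install from PyPI" path. A rough sketch of that branching, using an assumed `build_pip_commands` helper; the local-install branch contents are assumed for illustration, since the diff does not show them:

```python
# Hypothetical helper, not the project's API: returns the pip install targets
# the venv setup would run. Only the PyPI branch is shown in the diff; the
# editable-install entries in the else branch are assumed.
from typing import List, Optional

def build_pip_commands(mlagents_python_version: Optional[str]) -> List[str]:
    pip_commands = ["--upgrade pip", "--upgrade setuptools"]
    if mlagents_python_version:
        # Pinned release from PyPI, plus the locally built TF wheel and the
        # ONNX export tooling the yamato tests still rely on.
        pip_commands += [
            f"mlagents=={mlagents_python_version}",
            f"gym-unity=={mlagents_python_version}",
            "~/tensorflow_pkg/tensorflow-2.0.0-cp37-cp37m-macosx_10_14_x86_64.whl",
            "tf2onnx==1.6.1",
        ]
    else:
        # Local editable install of the checked-out packages (assumed).
        pip_commands += ["-e ./ml-agents-envs", "-e ./ml-agents", "-e ./gym-unity"]
    return pip_commands

print(build_pip_commands("0.22.0"))  # example version string, not from the diff
```
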
diff --git a/test_constraints_max_tf1_version.txt b/test_constraints_max_tf1_version.txt
deleted file mode 100644
index d14c5fe4cc..0000000000
--- a/test_constraints_max_tf1_version.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-# pip constraints to use the *highest* versions allowed in ml-agents/setup.py
-# with the exception of tensorflow, which is constrained to <2
-# For projects with upper bounds, we should periodically update this list to the latest release version
-grpcio>=1.23.0
-numpy>=1.17.2
-tensorflow>=1.15.2,<2.0.0
-h5py>=2.10.0
diff --git a/test_constraints_max_tf2_version.txt b/test_constraints_max_tf2_version.txt
deleted file mode 100644
index 74dca2a3c7..0000000000
--- a/test_constraints_max_tf2_version.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-# pip constraints to use the *highest* versions allowed in ml-agents/setup.py
-# For projects with upper bounds, we should periodically update this list to the latest release version
-grpcio>=1.23.0
-numpy>=1.17.2
-tensorflow==2.3.0
-h5py>=2.10.0
diff --git a/test_constraints_min_version.txt b/test_constraints_min_version.txt
deleted file mode 100644
index a6dac3fcf9..0000000000
--- a/test_constraints_min_version.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-# pip constraints to use the *lowest* versions allowed in ml-agents/setup.py
-grpcio==1.11.0
-numpy==1.14.1
-Pillow==4.2.1
-protobuf==3.6
-tensorflow==1.14.0
-h5py==2.9.0
-tensorboard==1.15.0
diff --git a/test_requirements.txt b/test_requirements.txt
index b08f286fa7..1a3d424ec2 100644
--- a/test_requirements.txt
+++ b/test_requirements.txt
@@ -3,7 +3,3 @@ pytest>4.0.0,<6.0.0
pytest-cov==2.6.1
pytest-xdist==1.34.0
-# Tensorflow tests are here for the time being, before they are used in the codebase.
-tensorflow>=1.14,<3.0
-
-tf2onnx>=1.5.5