Commit 7ac2bd0

Ervin T authored and Ruo-Ping Dong committed
[bug-fix] Don't load non-wrapped policy (#4593)
* Always initialize non-wrapped policy
* Load ghosted policy
* Update changelog
* Resume test
* Add test
* Add torch test and fix torch.
1 parent e67bc08 commit 7ac2bd0

3 files changed, +77 / -6 lines changed


com.unity.ml-agents/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
```diff
@@ -17,6 +17,9 @@ and this project adheres to
 #### com.unity.ml-agents (C#)
 - Fixed a bug with visual observations using .onnx model files and newer versions of Barracuda (1.1.0 or later). (#4533)
 
+#### ml-agents / ml-agents-envs / gym-unity (Python)
+- Fixed an issue where runs could not be resumed when using TensorFlow and Ghost Training. (#4593)
+
 ## [1.0.5] - 2020-09-23
 ### Minor Changes
 #### com.unity.ml-agents (C#)
```

ml-agents/mlagents/trainers/ghost/trainer.py

Lines changed: 10 additions & 6 deletions
```diff
@@ -147,11 +147,11 @@ def get_step(self) -> int:
     @property
     def reward_buffer(self) -> Deque[float]:
         """
-        Returns the reward buffer. The reward buffer contains the cumulative
-        rewards of the most recent episodes completed by agents using this
-        trainer.
-        :return: the reward buffer.
-        """
+        Returns the reward buffer. The reward buffer contains the cumulative
+        rewards of the most recent episodes completed by agents using this
+        trainer.
+        :return: the reward buffer.
+        """
         return self.trainer.reward_buffer
 
     @property
@@ -326,7 +326,6 @@ def create_policy(
         """
         policy = self.trainer.create_policy(parsed_behavior_id, brain_parameters)
         policy.create_tf_graph()
-        policy.initialize_or_load()
         policy.init_load_weights()
         team_id = parsed_behavior_id.team_id
         self.controller.subscribe_team_id(team_id, self)
@@ -346,6 +345,11 @@ def create_policy(
             self._save_snapshot()  # Need to save after trainer initializes policy
             self._learning_team = self.controller.get_learning_team
             self.wrapped_trainer_team = team_id
+        else:
+            # Load the weights of the ghost policy from the wrapped one
+            policy.load_weights(
+                self.trainer.get_policy(parsed_behavior_id).get_weights()
+            )
         return policy
 
     def add_policy(
```
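To make the new `else` branch concrete, here is a minimal, self-contained sketch of the weight-copy pattern it relies on. The `DummyPolicy` class and its numpy weights are hypothetical stand-ins, not ml-agents code; only the `get_weights`/`load_weights` round trip mirrors the diff above.

```python
import numpy as np


class DummyPolicy:
    """Hypothetical stand-in for a policy exposing get_weights/load_weights."""

    def __init__(self, n_params: int, seed: int):
        rng = np.random.default_rng(seed)
        self._weights = [rng.standard_normal(n_params)]

    def get_weights(self):
        # Hand out copies so callers cannot mutate our parameters in place.
        return [w.copy() for w in self._weights]

    def load_weights(self, values):
        # Overwrite our parameters with another policy's values.
        self._weights = [v.copy() for v in values]


# The wrapped trainer's policy is the one that gets initialized or restored
# from a checkpoint; the ghost policy starts from different random values.
wrapped_policy = DummyPolicy(n_params=4, seed=0)
ghost_policy = DummyPolicy(n_params=4, seed=1)

# Mirrors the new else branch: copy weights from the wrapped policy into the
# ghost policy rather than loading the ghost policy from disk.
ghost_policy.load_weights(wrapped_policy.get_weights())

for w, lw in zip(wrapped_policy.get_weights(), ghost_policy.get_weights()):
    np.testing.assert_array_equal(w, lw)
```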

ml-agents/mlagents/trainers/tests/test_ghost.py

Lines changed: 64 additions & 0 deletions
```diff
@@ -95,6 +95,70 @@ def test_load_and_set(dummy_config, use_discrete):
         np.testing.assert_array_equal(w, lw)
 
 
+def test_resume(dummy_config, tmp_path):
+    brain_params_team0 = BrainParameters(
+        brain_name="test_brain?team=0",
+        vector_observation_space_size=1,
+        camera_resolutions=[],
+        vector_action_space_size=[2],
+        vector_action_descriptions=[],
+        vector_action_space_type=0,
+    )
+
+    brain_name = BehaviorIdentifiers.from_name_behavior_id(
+        brain_params_team0.brain_name
+    ).brain_name
+
+    brain_params_team1 = BrainParameters(
+        brain_name="test_brain?team=1",
+        vector_observation_space_size=1,
+        camera_resolutions=[],
+        vector_action_space_size=[2],
+        vector_action_descriptions=[],
+        vector_action_space_type=0,
+    )
+
+    tmp_path = tmp_path.as_posix()
+    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, tmp_path)
+    controller = GhostController(100)
+    trainer = GhostTrainer(
+        ppo_trainer, brain_name, controller, 0, dummy_config, True, tmp_path
+    )
+
+    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
+        brain_params_team0.brain_name
+    )
+    policy = trainer.create_policy(parsed_behavior_id0, brain_params_team0)
+    trainer.add_policy(parsed_behavior_id0, policy)
+
+    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
+        brain_params_team1.brain_name
+    )
+    policy = trainer.create_policy(parsed_behavior_id1, brain_params_team1)
+    trainer.add_policy(parsed_behavior_id1, policy)
+
+    trainer.save_model(parsed_behavior_id0.behavior_id)
+
+    # Make a new trainer, check that the policies are the same
+    ppo_trainer2 = PPOTrainer(brain_name, 0, dummy_config, True, True, 0, tmp_path)
+    trainer2 = GhostTrainer(
+        ppo_trainer2, brain_name, controller, 0, dummy_config, True, tmp_path
+    )
+    policy = trainer2.create_policy(parsed_behavior_id0, brain_params_team0)
+    trainer2.add_policy(parsed_behavior_id0, policy)
+
+    policy = trainer2.create_policy(parsed_behavior_id1, brain_params_team1)
+    trainer2.add_policy(parsed_behavior_id1, policy)
+
+    trainer1_policy = trainer.get_policy(parsed_behavior_id1.behavior_id)
+    trainer2_policy = trainer2.get_policy(parsed_behavior_id1.behavior_id)
+    weights = trainer1_policy.get_weights()
+    weights2 = trainer2_policy.get_weights()
+
+    for w, lw in zip(weights, weights2):
+        np.testing.assert_array_equal(w, lw)
+
+
 def test_process_trajectory(dummy_config):
     brain_params_team0 = BrainParameters(
         brain_name="test_brain?team=0",
```
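A hedged usage sketch for exercising just the new test: assuming a clone of the ml-agents repository with its Python test dependencies installed, the node id below (the file path shown above plus `::test_resume`) can be passed to pytest's Python API.

```python
import pytest

# Run only the new ghost-trainer resume test; -q keeps the report terse.
pytest.main(["ml-agents/mlagents/trainers/tests/test_ghost.py::test_resume", "-q"])
```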
