
Commit 5591257

Cleanup docstring types (#169)
* Cleanup docstring types
* Update style
* Test with js hack
* Revert "Test with js hack"
  This reverts commit d091f43.
* Fix types
* Fix typo
* Update CONTRIBUTING example
1 parent 2c924f5 commit 5591257


43 files changed (+962, -950 lines)

CONTRIBUTING.md

Lines changed: 3 additions & 3 deletions
@@ -50,9 +50,9 @@ def my_function(arg1: type1, arg2: type2) -> returntype:
     """
     Short description of the function.

-    :param arg1: (type1) describe what is arg1
-    :param arg2: (type2) describe what is arg2
-    :return: (returntype) describe what is returned
+    :param arg1: describe what is arg1
+    :param arg2: describe what is arg2
+    :return: describe what is returned
     """
     ...
     return my_variable
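
For illustration, here is a minimal sketch of a function written in the updated convention; the function and its parameters are hypothetical, not taken from the repository. Types live only in the signature annotations, which ``sphinx_autodoc_typehints`` renders next to each ``:param:`` entry, so the docstring no longer repeats them.

from typing import Optional


def clip_value(value: float, limit: Optional[float] = None) -> float:
    """
    Clip a value to ``[-limit, limit]`` when a limit is given.

    :param value: the value to clip
    :param limit: absolute clipping bound (no clipping if None)
    :return: the possibly clipped value
    """
    # The parameter and return types are documented by the annotations above
    if limit is None:
        return value
    return max(-limit, min(limit, value))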

docs/_static/css/baselines_theme.css

Lines changed: 9 additions & 0 deletions
@@ -50,3 +50,12 @@ a.icon.icon-home {
 .codeblock,pre.literal-block,.rst-content .literal-block,.rst-content pre.literal-block,div[class^='highlight'] {
   background: #f8f8f8;;
 }
+
+/* Change style of types in the docstrings .rst-content .field-list */
+.field-list .xref.py.docutils, .field-list code.docutils, .field-list .docutils.literal.notranslate
+{
+  border: None;
+  padding-left: 0;
+  padding-right: 0;
+  color: #404040;
+}

docs/misc/changelog.rst

Lines changed: 1 addition & 0 deletions
@@ -56,6 +56,7 @@ Documentation:
 - Added ``StopTrainingOnMaxEpisodes`` details and example (@xicocaio)
 - Updated custom policy section (added custom feature extractor example)
 - Re-enable ``sphinx_autodoc_typehints``
+- Updated doc style for type hints and remove duplicated type hints
stable_baselines3/a2c/a2c.py

Lines changed: 21 additions & 21 deletions
@@ -21,34 +21,34 @@ class A2C(OnPolicyAlgorithm):

     Introduction to A2C: https://hackernoon.com/intuitive-rl-intro-to-advantage-actor-critic-a2c-4ff545978752

-    :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, ...)
-    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
-    :param learning_rate: (float or callable) The learning rate, it can be a function
-    :param n_steps: (int) The number of steps to run for each environment per update
+    :param policy: The policy model to use (MlpPolicy, CnnPolicy, ...)
+    :param env: The environment to learn from (if registered in Gym, can be str)
+    :param learning_rate: The learning rate, it can be a function
+    :param n_steps: The number of steps to run for each environment per update
         (i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel)
-    :param gamma: (float) Discount factor
-    :param gae_lambda: (float) Factor for trade-off of bias vs variance for Generalized Advantage Estimator
+    :param gamma: Discount factor
+    :param gae_lambda: Factor for trade-off of bias vs variance for Generalized Advantage Estimator
         Equivalent to classic advantage when set to 1.
-    :param ent_coef: (float) Entropy coefficient for the loss calculation
-    :param vf_coef: (float) Value function coefficient for the loss calculation
-    :param max_grad_norm: (float) The maximum value for the gradient clipping
-    :param rms_prop_eps: (float) RMSProp epsilon. It stabilizes square root computation in denominator
+    :param ent_coef: Entropy coefficient for the loss calculation
+    :param vf_coef: Value function coefficient for the loss calculation
+    :param max_grad_norm: The maximum value for the gradient clipping
+    :param rms_prop_eps: RMSProp epsilon. It stabilizes square root computation in denominator
         of RMSProp update
-    :param use_rms_prop: (bool) Whether to use RMSprop (default) or Adam as optimizer
-    :param use_sde: (bool) Whether to use generalized State Dependent Exploration (gSDE)
+    :param use_rms_prop: Whether to use RMSprop (default) or Adam as optimizer
+    :param use_sde: Whether to use generalized State Dependent Exploration (gSDE)
         instead of action noise exploration (default: False)
-    :param sde_sample_freq: (int) Sample a new noise matrix every n steps when using gSDE
+    :param sde_sample_freq: Sample a new noise matrix every n steps when using gSDE
         Default: -1 (only sample at the beginning of the rollout)
-    :param normalize_advantage: (bool) Whether to normalize or not the advantage
-    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
-    :param create_eval_env: (bool) Whether to create a second environment that will be
+    :param normalize_advantage: Whether to normalize or not the advantage
+    :param tensorboard_log: the log location for tensorboard (if None, no logging)
+    :param create_eval_env: Whether to create a second environment that will be
         used for evaluating the agent periodically. (Only available when passing string for the environment)
-    :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
-    :param verbose: (int) the verbosity level: 0 no output, 1 info, 2 debug
-    :param seed: (int) Seed for the pseudo random generators
-    :param device: (str or th.device) Device (cpu, cuda, ...) on which the code should be run.
+    :param policy_kwargs: additional arguments to be passed to the policy on creation
+    :param verbose: the verbosity level: 0 no output, 1 info, 2 debug
+    :param seed: Seed for the pseudo random generators
+    :param device: Device (cpu, cuda, ...) on which the code should be run.
         Setting it to auto, the code will be run on the GPU if possible.
-    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
+    :param _init_setup_model: Whether or not to build the network at the creation of the instance
     """

     def __init__(
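
To show the pattern the change relies on, here is a hypothetical sketch (not the real A2C constructor, which is truncated above and has many more parameters): the annotated ``__init__`` signature carries the parameter types, so the class docstring only describes their meaning.

from typing import Callable, Optional, Union


class MyAlgo:
    """
    Hypothetical algorithm illustrating the cleaned-up docstring style.

    :param learning_rate: The learning rate, it can be a function
    :param n_steps: The number of steps to run for each environment per update
    :param tensorboard_log: the log location for tensorboard (if None, no logging)
    """

    def __init__(
        self,
        # Hypothetical subset of parameters; Sphinx reads the types from these annotations
        learning_rate: Union[float, Callable] = 7e-4,
        n_steps: int = 5,
        tensorboard_log: Optional[str] = None,
    ):
        self.learning_rate = learning_rate
        self.n_steps = n_steps
        self.tensorboard_log = tensorboard_log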

stable_baselines3/common/atari_wrappers.py

Lines changed: 23 additions & 23 deletions
@@ -18,8 +18,8 @@ def __init__(self, env: gym.Env, noop_max: int = 30):
         Sample initial states by taking random number of no-ops on reset.
         No-op is assumed to be action 0.

-        :param env: (gym.Env) the environment to wrap
-        :param noop_max: (int) the maximum value of no-ops to run
+        :param env: the environment to wrap
+        :param noop_max: the maximum value of no-ops to run
         """
         gym.Wrapper.__init__(self, env)
         self.noop_max = noop_max
@@ -47,7 +47,7 @@ def __init__(self, env: gym.Env):
         """
         Take action on reset for environments that are fixed until firing.

-        :param env: (gym.Env) the environment to wrap
+        :param env: the environment to wrap
         """
         gym.Wrapper.__init__(self, env)
         assert env.unwrapped.get_action_meanings()[1] == "FIRE"
@@ -70,7 +70,7 @@ def __init__(self, env: gym.Env):
         Make end-of-life == end-of-episode, but only reset on true game over.
         Done by DeepMind for the DQN and co. since it helps value estimation.

-        :param env: (gym.Env) the environment to wrap
+        :param env: the environment to wrap
         """
         gym.Wrapper.__init__(self, env)
         self.lives = 0
@@ -97,7 +97,7 @@ def reset(self, **kwargs) -> np.ndarray:
         and the learner need not know about any of this behind-the-scenes.

         :param kwargs: Extra keywords passed to env.reset() call
-        :return: (np.ndarray) the first observation of the environment
+        :return: the first observation of the environment
         """
         if self.was_real_done:
             obs = self.env.reset(**kwargs)
@@ -113,8 +113,8 @@ def __init__(self, env: gym.Env, skip: int = 4):
         """
         Return only every ``skip``-th frame (frameskipping)

-        :param env: (gym.Env) the environment
-        :param skip: (int) number of ``skip``-th frame
+        :param env: the environment
+        :param skip: number of ``skip``-th frame
         """
         gym.Wrapper.__init__(self, env)
         # most recent raw observations (for max pooling across time steps)
@@ -126,8 +126,8 @@ def step(self, action: int) -> GymStepReturn:
         Step the environment with the given action
         Repeat action, sum reward, and max over last observations.

-        :param action: ([int] or [float]) the action
-        :return: ([int] or [float], [float], [bool], dict) observation, reward, done, information
+        :param action: the action
+        :return: observation, reward, done, information
         """
         total_reward = 0.0
         done = None
@@ -155,16 +155,16 @@ def __init__(self, env: gym.Env):
         """
         Clips the reward to {+1, 0, -1} by its sign.

-        :param env: (gym.Env) the environment
+        :param env: the environment
         """
         gym.RewardWrapper.__init__(self, env)

     def reward(self, reward: float) -> float:
         """
         Bin reward to {+1, 0, -1} by its sign.

-        :param reward: (float)
-        :return: (float)
+        :param reward:
+        :return:
         """
         return np.sign(reward)

@@ -175,9 +175,9 @@ def __init__(self, env: gym.Env, width: int = 84, height: int = 84):
         Convert to grayscale and warp frames to 84x84 (default)
         as done in the Nature paper and later work.

-        :param env: (gym.Env) the environment
-        :param width: (int)
-        :param height: (int)
+        :param env: the environment
+        :param width:
+        :param height:
         """
         gym.ObservationWrapper.__init__(self, env)
         self.width = width
@@ -190,8 +190,8 @@ def observation(self, frame: np.ndarray) -> np.ndarray:
         """
         returns the current observation from a frame

-        :param frame: (np.ndarray) environment frame
-        :return: (np.ndarray) the observation
+        :param frame: environment frame
+        :return: the observation
         """
         frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
         frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
@@ -212,13 +212,13 @@ class AtariWrapper(gym.Wrapper):
     * Grayscale observation
     * Clip reward to {-1, 0, 1}

-    :param env: (gym.Env) gym environment
-    :param noop_max: (int): max number of no-ops
-    :param frame_skip: (int): the frequency at which the agent experiences the game.
-    :param screen_size: (int): resize Atari frame
-    :param terminal_on_life_loss: (bool): if True, then step() returns done=True whenever a
+    :param env: gym environment
+    :param noop_max:: max number of no-ops
+    :param frame_skip:: the frequency at which the agent experiences the game.
+    :param screen_size:: resize Atari frame
+    :param terminal_on_life_loss:: if True, then step() returns done=True whenever a
         life is lost.
-    :param clip_reward: (bool) If True (default), the reward is clip to {-1, 0, 1} depending on its sign.
+    :param clip_reward: If True (default), the reward is clip to {-1, 0, 1} depending on its sign.
     """

     def __init__(
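
The same idea applies to return types: in the hunks above, the tuple type in ``step()``'s ``:return:`` line is dropped because the ``GymStepReturn`` annotation already encodes it. A hypothetical wrapper in the cleaned-up style might look like this; the alias below is illustrative and stands in for the project's actual type alias.

from typing import Any, Dict, Tuple

import gym
import numpy as np

# Illustrative stand-in for the project's GymStepReturn type alias
GymStepReturn = Tuple[np.ndarray, float, bool, Dict[str, Any]]


class SignRewardWrapper(gym.Wrapper):
    """
    Clip the reward to {+1, 0, -1} by its sign (hypothetical example).

    :param env: the environment to wrap
    """

    def __init__(self, env: gym.Env):
        gym.Wrapper.__init__(self, env)

    def step(self, action: int) -> GymStepReturn:
        """
        :param action: the action
        :return: observation, reward, done, information
        """
        obs, reward, done, info = self.env.step(action)
        # The return type is documented by the annotation, not the docstring
        return obs, float(np.sign(reward)), done, info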
