huggingface
diff --git a/‎examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py‎
Lines changed: 1 addition & 1 deletion b/‎examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/transformers/generation/logits_process.py‎
Lines changed: 1 addition & 1 deletion b/‎src/transformers/generation/logits_process.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/transformers/modeling_outputs.py‎
Lines changed: 2 additions & 2 deletions b/‎src/transformers/modeling_outputs.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/transformers/modeling_utils.py‎
Lines changed: 1 addition & 1 deletion b/‎src/transformers/modeling_utils.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/transformers/models/align/configuration_align.py‎
Lines changed: 1 addition & 1 deletion b/‎src/transformers/models/align/configuration_align.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/transformers/models/blip/modeling_blip_text.py‎
Lines changed: 1 addition & 1 deletion b/‎src/transformers/models/blip/modeling_blip_text.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/transformers/models/blip/modeling_tf_blip_text.py‎
Lines changed: 1 addition & 1 deletion b/‎src/transformers/models/blip/modeling_tf_blip_text.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/transformers/models/blip_2/modeling_blip_2.py‎
Lines changed: 1 addition & 1 deletion b/‎src/transformers/models/blip_2/modeling_blip_2.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/transformers/models/bloom/modeling_bloom.py‎
Lines changed: 1 addition & 1 deletion b/‎src/transformers/models/bloom/modeling_bloom.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/transformers/models/data2vec/configuration_data2vec_audio.py‎
Lines changed: 3 additions & 0 deletions b/‎src/transformers/models/data2vec/configuration_data2vec_audio.py‎
Lines changed: 3 additions & 0 deletions
@@ -253,7 +253,7 @@ def forward(
 
         Returns:
             :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
-            loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
                 Classification (or regression if config.num_labels==1) loss.
             logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
                 Classification (or regression if config.num_labels==1) scores (before SoftMax).
 
@@ -678,7 +678,7 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
     generation. See [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904) for more information.
 
     Args:
-        prefix_allowed_tokens_fn: (`Callable[[int, torch.Tensor], List[int]]`):
+        prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`):
             This function constrains the beam search to allowed tokens only at each step. This function takes 2
             arguments `inputs_ids` and the batch ID `batch_id`. It has to return a list with the allowed tokens for the
             next generation step conditioned on the previously generated tokens `inputs_ids` and the batch ID
 
@@ -1522,7 +1522,7 @@ class Seq2SeqTSModelOutput(ModelOutput):
         scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
             Scaling values of each time series' context window which is used to give the model inputs of the same
             magnitude and then used to rescale back to the original magnitude.
-        static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
+        static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
             Static features of each time series in a batch which are copied to the covariates at inference time.
     """
 
@@ -1593,7 +1593,7 @@ class Seq2SeqTSPredictionOutput(ModelOutput):
         scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
             Scaling values of each time series' context window which is used to give the model inputs of the same
             magnitude and then used to rescale back to the original magnitude.
-        static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
+        static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
             Static features of each time series in a batch which are copied to the covariates at inference time.
     """
 
 
@@ -912,7 +912,7 @@ def get_head_mask(
                 The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard).
             num_hidden_layers (`int`):
                 The number of hidden layers in the model.
-            is_attention_chunked: (`bool`, *optional*, defaults to `False`):
+            is_attention_chunked (`bool`, *optional*, defaults to `False`):
                 Whether or not the attention scores are computed by chunks.
 
         Returns:
 
@@ -184,7 +184,7 @@ class AlignVisionConfig(PretrainedConfig):
             List of output channel sizes to be used in each block for convolutional layers.
         depthwise_padding (`List[int]`, *optional*, defaults to `[]`):
             List of block indices with square padding.
-        strides: (`List[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`):
+        strides (`List[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`):
             List of stride sizes to be used in each block for convolutional layers.
         num_block_repeats (`List[int]`, *optional*, defaults to `[1, 2, 2, 3, 3, 4, 1]`):
             List of the number of times each block is to be repeated.
 
@@ -613,7 +613,7 @@ def get_extended_attention_mask(
                 Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
             input_shape (`Tuple[int]`):
                 The shape of the input to the model.
-            device: (`torch.device`):
+            device (`torch.device`):
                 The device of the input to the model.
 
         Returns:
 
@@ -633,7 +633,7 @@ def get_extended_attention_mask(
                 Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
             input_shape (`Tuple[int]`):
                 The shape of the input to the model.
-            is_decoder: (`bool`):
+            is_decoder (`bool`):
                 Whether the model is used as a decoder.
 
         Returns:
 
@@ -1059,7 +1059,7 @@ def get_extended_attention_mask(
                 Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
             input_shape (`Tuple[int]`):
                 The shape of the input to the model.
-            device: (`torch.device`):
+            device (`torch.device`):
                 The device of the input to the model.
 
         Returns:
 
@@ -256,7 +256,7 @@ def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
         Merge heads together over the last dimension
 
         Args:
-            x: (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
+            x (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
 
         Returns:
             torch.tensor: [batch_size, seq_length, num_heads * head_dim]
 
@@ -62,6 +62,9 @@ class Data2VecAudioConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         final_dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for the final projection layer of [`Data2VecAudioForCTC`].
+        layerdrop (`float`, *optional*, defaults to 0.1):
+            The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more
+            details.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):