
Commit 7768afb

Update flash_attention_patch.py
To be compatible with a recent change in the Transformers library, where a new argument 'padding_mask' was added to the forward function of the attention layer (huggingface/transformers#25598).
1 parent 611a5a8 commit 7768afb

File tree

1 file changed: +1 −0 lines changed

applications/Colossal-LLaMA-2/colossal_llama2/utils/flash_attention_patch.py

Lines changed: 1 addition & 0 deletions
@@ -65,6 +65,7 @@ def attention_forward(
     past_key_value: Optional[Tuple[torch.Tensor]] = None,
     output_attentions: bool = False,
     use_cache: bool = False,
+    **kwargs
 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
     """
     Re-define LLaMA-2 `LlamaAttention` forward method using flash-attention.
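
For context, here is a minimal, self-contained sketch (hypothetical function names, not the actual Colossal-LLaMA-2 patch code) of why accepting **kwargs keeps a monkey-patched forward compatible once the surrounding library starts passing a new keyword such as padding_mask:

# Sketch only: illustrates the **kwargs compatibility trick, assuming a caller
# that forwards a new `padding_mask` keyword (as Transformers does after
# huggingface/transformers#25598).

def old_patched_forward(hidden_states, output_attentions=False, use_cache=False):
    # No **kwargs: an unexpected `padding_mask` keyword raises TypeError.
    return hidden_states

def new_patched_forward(hidden_states, output_attentions=False, use_cache=False, **kwargs):
    # **kwargs absorbs keywords added by newer library versions (e.g. `padding_mask`)
    # without otherwise changing the patched function's behaviour.
    return hidden_states

def decoder_layer_call(attn_forward):
    # Stands in for the decoder layer that now forwards `padding_mask` to attention.
    return attn_forward("hidden", padding_mask=None)

if __name__ == "__main__":
    try:
        decoder_layer_call(old_patched_forward)
    except TypeError as exc:
        print("old patch breaks:", exc)   # unexpected keyword argument 'padding_mask'
    print("new patch works:", decoder_layer_call(new_patched_forward))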
