
Commit 9e412b8

craymichael authored and facebook-github-bot committed
Support Cache Class for New Versions of Transformers Library
Summary: Fixes D62210529 (now reverted by D62262760).

The transformers library is now an optional dependency. Captum does not depend on it, but it contains some logic specific to `transformers` models; the library is only imported if the model's environment already provides it. This TARGETS configuration prevents transformers version conflicts, which e.g. caused T200877742.

Add support for the new transformers Cache objects. This may need changes in the future, as LLMs appear to handle caching differently: some manage the cache themselves, others do not, and some do not support Cache objects yet. Llama models expose a `_supports_cache_class` flag indicating whether the new Cache object is supported; if a model is not marked as supporting it, we assume it takes the legacy format (tuples of past key/value tensors). Multiple checks were added to ensure compatibility.

(minor) Also changed the defaults for LLM generation to dismiss warnings (does not change generation behavior).

Differential Revision: D62408520
1 parent d8ceaa8 commit 9e412b8
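A condensed sketch of the compatibility rule described above, illustrative rather than the actual patched code: `normalize_past_key_values`, `model`, and `past` are placeholders standing for any Hugging Face causal LM and a cache returned by a previous forward pass.

try:
    from transformers.cache_utils import Cache, DynamicCache
except ImportError:  # older transformers, or transformers not installed
    Cache = DynamicCache = None


def normalize_past_key_values(model, past):
    # Models that declare support for the new Cache class get legacy
    # (key, value) tuples wrapped in a DynamicCache; everything else is
    # assumed to expect the legacy tuple format unchanged.
    if (
        Cache is not None
        and getattr(model, "_supports_cache_class", False)
        and not isinstance(past, Cache)
    ):
        return DynamicCache.from_legacy_cache(past)
    return past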

File tree

2 files changed: 33 additions and 7 deletions

captum/_utils/transformers_typing.py (new file)

Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
+#!/usr/bin/env python3
+
+# pyre-strict
+
+try:
+    # pyre-ignore[21]: Could not find a module corresponding to import
+    # `transformers.cache_utils`
+    from transformers.cache_utils import Cache, DynamicCache
+except ImportError:
+    Cache = DynamicCache = None
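As a usage sketch (not part of the commit): because this typing shim falls back to None when transformers is missing, downstream code can feature-test the Cache API without importing transformers unconditionally. The printed messages below are illustrative.

from captum._utils.transformers_typing import Cache, DynamicCache

if Cache is None:
    # transformers (or its cache_utils module) is unavailable in this environment;
    # models are assumed to use the legacy tuple-of-(key, value) cache format.
    print("new Cache API not available")
else:
    cache = DynamicCache()            # empty cache, to be filled by model.forward(...)
    print(isinstance(cache, Cache))   # True: DynamicCache subclasses Cache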

captum/attr/_core/llm_attr.py

Lines changed: 23 additions & 7 deletions
@@ -7,6 +7,7 @@
 import numpy as np

 import torch
+from captum._utils.transformers_typing import Cache, DynamicCache
 from captum._utils.typing import TokenizerLike
 from captum.attr._core.feature_ablation import FeatureAblation
 from captum.attr._core.kernel_shap import KernelShap
@@ -27,8 +28,12 @@
 )
 from torch import nn, Tensor

-
-DEFAULT_GEN_ARGS = {"max_new_tokens": 25, "do_sample": False}
+DEFAULT_GEN_ARGS: Dict[str, Any] = {
+    "max_new_tokens": 25,
+    "do_sample": False,
+    "temperature": None,
+    "top_p": None,
+}


 class LLMAttributionResult:
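As an illustration of why the sampling parameters are set to None (a sketch under the assumption of a Hugging Face causal LM; the "gpt2" checkpoint is arbitrary and not part of the commit): many checkpoints ship a generation config with temperature/top_p preset, and recent transformers versions warn that those values are ignored when do_sample=False. Passing them explicitly as None dismisses the warning without changing the greedy-decoding output.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

DEFAULT_GEN_ARGS = {
    "max_new_tokens": 25,
    "do_sample": False,   # greedy decoding, as before
    "temperature": None,  # explicitly unset so transformers does not warn that
    "top_p": None,        # sampling parameters are ignored with do_sample=False
}

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("The inventor of the telephone is", return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(**inputs, **DEFAULT_GEN_ARGS)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))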
@@ -258,15 +263,24 @@ def _forward_func(
         init_model_inp = perturbed_input

         model_inp = init_model_inp
-        attention_mask = torch.tensor([[1] * model_inp.shape[1]])
-        attention_mask = attention_mask.to(model_inp.device)
+        attention_mask = torch.ones(
+            [1, model_inp.shape[1]], dtype=torch.long, device=model_inp.device
+        )
         model_kwargs = {"attention_mask": attention_mask}

         log_prob_list = []
         outputs = None
         for target_token in target_tokens:
             if use_cached_outputs:
                 if outputs is not None:
+                    if (
+                        Cache is not None
+                        and getattr(self.model, "_supports_cache_class", False)
+                        and not isinstance(outputs.past_key_values, Cache)
+                    ):
+                        outputs.past_key_values = DynamicCache.from_legacy_cache(
+                            outputs.past_key_values
+                        )
                     model_kwargs = self.model._update_model_kwargs_for_generation(
                         outputs, model_kwargs
                     )
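For context, a standalone sketch (not from the commit, and requiring a transformers version that ships transformers.cache_utils) of the conversion the check above performs: the legacy past_key_values format is a tuple with one (key, value) pair per layer, and DynamicCache.from_legacy_cache wraps it in the new Cache object that models with _supports_cache_class expect. Tensor shapes below are arbitrary.

import torch
from transformers.cache_utils import Cache, DynamicCache

# Legacy format: one (key, value) pair per layer, each tensor shaped
# (batch, num_heads, seq_len, head_dim).
legacy_past = tuple(
    (torch.zeros(1, 8, 5, 64), torch.zeros(1, 8, 5, 64)) for _ in range(2)
)

cache = DynamicCache.from_legacy_cache(legacy_past)
assert isinstance(cache, Cache)  # DynamicCache is a Cache subclass

# Models that do not support the Cache class still take the legacy tuples,
# which can be recovered from the Cache object.
legacy_again = cache.to_legacy_cache()
assert torch.equal(legacy_again[0][0], legacy_past[0][0])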
@@ -275,7 +289,7 @@ def _forward_func(
                 )
                 outputs = self.model.forward(**model_inputs)
             else:
-                outputs = self.model.forward(model_inp, attention_mask=attention_mask)
+                outputs = self.model.forward(model_inp, **model_kwargs)
             new_token_logits = outputs.logits[:, -1]
             log_probs = torch.nn.functional.log_softmax(new_token_logits, dim=1)

@@ -345,7 +359,8 @@ def attribute(
                     Defaults: 1.
             gen_args (dict, optional): arguments for generating the target. Only used if
                     target is not given. When None, the default arguments are used,
-                    {"max_length": 25, "do_sample": False}
+                    {"max_new_tokens": 25, "do_sample": False,
+                    "temperature": None, "top_p": None}
                     Defaults: None
             **kwargs (Any): any extra keyword arguments passed to the call of the
                     underlying attribute function of the given attribution instance
@@ -516,7 +531,8 @@ def attribute(
                     Default: None
             gen_args (dict, optional): arguments for generating the target. Only used if
                     target is not given. When None, the default arguments are used,
-                    {"max_length": 25, "do_sample": False}
+                    {"max_new_tokens": 25, "do_sample": False,
+                    "temperature": None, "top_p": None}
                     Defaults: None
             **kwargs (Any): any extra keyword arguments passed to the call of the
                     underlying attribute function of the given attribution instance
