
Commit 36a3549

csauper authored and facebook-github-bot committed

stop stripping first character from output string (#1351)

Summary: Pull Request resolved: #1351. Previously the first character was stripped as an SOS token, but that does not actually seem to be the case with current LLMs. Keep all tokens.

Differential Revision: D62775617

1 parent 70619a6 commit 36a3549
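
For orientation, a hedged usage sketch of the change from the caller's side: attribute no longer drops the first encoded token of the target implicitly, and callers that still want special tokens removed now pass skip_tokens explicitly. The model and tokenizer names below are illustrative assumptions, not part of this commit.

# Illustrative sketch only; the model/tokenizer names are assumptions,
# not part of this commit.
from transformers import AutoModelForCausalLM, AutoTokenizer

from captum.attr import FeatureAblation, LLMAttribution, TextTemplateInput

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

llm_attr = LLMAttribution(FeatureAblation(model), tokenizer)
inp = TextTemplateInput("{} lives in {}", ["Dave", "Palm Coast"])

# Before this change, the first encoded token of `target` was silently
# dropped as an assumed SOS token; now all tokens are kept unless they
# are listed in skip_tokens.
res = llm_attr.attribute(
    inp,
    target="FL, and he works as a lifeguard",
    skip_tokens=[tokenizer.bos_token_id],  # drop BOS explicitly, if desired
)
print(res.seq_attr.shape)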

3 files changed: 133 additions and 16 deletions

captum/attr/_core/llm_attr.py
Lines changed: 42 additions & 6 deletions

@@ -383,6 +383,7 @@ def attribute(
         self,
         inp: InterpretableInput,
         target: Union[str, torch.Tensor, None] = None,
+        skip_tokens: Union[List[int], List[str], None] = None,
         num_trials: int = 1,
         gen_args: Optional[Dict[str, Any]] = None,
         use_cached_outputs: bool = True,
@@ -397,6 +398,12 @@ def attribute(
                 which attributions are computed. If None, it uses the model
                 to generate the target based on the input and gen_args.
                 Default: None
+            skip_tokens (List[int] or List[str], optional): the tokens to skip in
+                the output's interpretable representation. Use this argument to define
+                uninteresting tokens, commonly special tokens, e.g., sos and unk.
+                It can be a list of strings of the tokens or a list of integers of the
+                token ids.
+                Default: None
             num_trials (int, optional): number of trials to run. Return is the average
                 attribibutions over all the trials.
                 Defaults: 1.
@@ -435,9 +442,20 @@ def attribute(
             assert gen_args is None, "gen_args must be None when target is given"

         if type(target) is str:
-            # exclude sos
-            target_tokens = self.tokenizer.encode(target)[1:]
-            target_tokens = torch.tensor(target_tokens)
+            encoded = self.tokenizer.encode(target)
+
+            if skip_tokens:
+                if isinstance(skip_tokens[0], str):
+                    skip_tokens = cast(List[str], skip_tokens)
+                    skip_tokens = self.tokenizer.convert_tokens_to_ids(skip_tokens)
+                assert isinstance(skip_tokens, list)
+
+                skip_token_set = set(skip_tokens)
+                encoded = [
+                    token for token in encoded if token not in skip_token_set
+                ]
+
+            target_tokens = torch.tensor(encoded)
         elif type(target) is torch.Tensor:
             target_tokens = target
         else:
@@ -562,6 +580,7 @@ def attribute(
         self,
         inp: InterpretableInput,
         target: Union[str, torch.Tensor, None] = None,
+        skip_tokens: Union[List[int], List[str], None] = None,
        gen_args: Optional[Dict[str, Any]] = None,
         **kwargs: Any,
     ) -> LLMAttributionResult:
@@ -572,6 +591,12 @@ def attribute(
                 which attributions are computed. If None, it uses the model
                 to generate the target based on the input and gen_args.
                 Default: None
+            skip_tokens (List[int] or List[str], optional): the tokens to skip in
+                the output's interpretable representation. Use this argument to define
+                uninteresting tokens, commonly special tokens, e.g., sos and unk.
+                It can be a list of strings of the tokens or a list of integers of the
+                token ids.
+                Default: None
             gen_args (dict, optional): arguments for generating the target. Only used if
                 target is not given. When None, the default arguments are used,
                 {"max_new_tokens": 25, "do_sample": False,
@@ -607,9 +632,20 @@ def attribute(
             assert gen_args is None, "gen_args must be None when target is given"

         if type(target) is str:
-            # exclude sos
-            target_tokens = self.tokenizer.encode(target)[1:]
-            target_tokens = torch.tensor(target_tokens)
+            encoded = self.tokenizer.encode(target)
+
+            if skip_tokens:
+                if isinstance(skip_tokens[0], str):
+                    skip_tokens = cast(List[str], skip_tokens)
+                    skip_tokens = self.tokenizer.convert_tokens_to_ids(skip_tokens)
+                assert isinstance(skip_tokens, list)
+
+                skip_token_set = set(skip_tokens)
+                encoded = [
+                    token for token in encoded if token not in skip_token_set
+                ]
+
+            target_tokens = torch.tensor(encoded)
         elif type(target) is torch.Tensor:
             target_tokens = target
         else:
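
The core of the change above is the replacement of the unconditional [1:] slice with an explicit, optional filter over the encoded target. A minimal self-contained sketch of that filtering step, assuming only a tokenizer that exposes encode and convert_tokens_to_ids (the helper name encode_target is ours, not Captum's):

from typing import List, Union

import torch


def encode_target(
    tokenizer,
    target: str,
    skip_tokens: Union[List[int], List[str], None] = None,
) -> torch.Tensor:
    # Keep every token of the target by default (no implicit SOS stripping).
    encoded = tokenizer.encode(target)

    if skip_tokens:
        # Token strings are mapped to ids before filtering.
        if isinstance(skip_tokens[0], str):
            skip_tokens = tokenizer.convert_tokens_to_ids(skip_tokens)
        skip_token_set = set(skip_tokens)
        encoded = [tok for tok in encoded if tok not in skip_token_set]

    return torch.tensor(encoded)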

tests/attr/test_llm_attr.py
Lines changed: 50 additions & 4 deletions

@@ -95,7 +95,10 @@ def convert_tokens_to_ids(
         raise NotImplementedError

     def decode(self, token_ids: Tensor) -> str:
-        return " ".join(self.convert_ids_to_tokens(token_ids.tolist()))
+        tokens = self.convert_ids_to_tokens(token_ids.tolist())
+        if isinstance(tokens, list):
+            tokens = " ".join(tokens)
+        return tokens


 class Result(NamedTuple):
@@ -271,6 +274,7 @@ def test_llm_attr(
         res = llm_attr.attribute(
             inp,
             "m n o p q",
+            skip_tokens=[0],
             use_cached_outputs=self.use_cached_outputs,
             # pyre-fixme[6]: In call `LLMAttribution.attribute`,
             #  for 4th positional argument, expected
@@ -330,7 +334,10 @@ def test_llm_attr_fa_log_prob(self) -> None:

         inp = TextTemplateInput("{} b {} {} e {}", ["a", "c", "d", "f"])
         res = llm_fa.attribute(
-            inp, "m n o p q", use_cached_outputs=self.use_cached_outputs
+            inp,
+            "m n o p q",
+            skip_tokens=[0],
+            use_cached_outputs=self.use_cached_outputs,
         )

         # With FeatureAblation, the seq attr in log_prob
@@ -385,6 +392,7 @@ def test_llm_attr_without_token(
         res = llm_fa.attribute(
             inp,
             "m n o p q",
+            skip_tokens=[0],
             use_cached_outputs=self.use_cached_outputs,
             # pyre-fixme[6]: In call `LLMAttribution.attribute`,
             #  for 4th positional argument, expected
@@ -448,7 +456,7 @@ def test_llm_attr(
         )

         inp = TextTokenInput("a b c", tokenizer)
-        res = llm_attr.attribute(inp, "m n o p q", **attr_kws)
+        res = llm_attr.attribute(inp, "m n o p q", skip_tokens=[0], **attr_kws)

         # 5 output tokens, 4 input tokens including sos
         self.assertEqual(res.seq_attr.shape, (4,))
@@ -523,7 +531,7 @@ def test_llm_attr_with_skip_tokens(
         )

         inp = TextTokenInput("a b c", tokenizer, skip_tokens=[0])
-        res = llm_attr.attribute(inp, "m n o p q", **attr_kws)
+        res = llm_attr.attribute(inp, "m n o p q", skip_tokens=[0], **attr_kws)

         # 5 output tokens, 4 input tokens including sos
         self.assertEqual(res.seq_attr.shape, (3,))
@@ -537,3 +545,41 @@ def test_llm_attr_with_skip_tokens(
         self.assertEqual(res.seq_attr.device.type, self.device)
         assert res.token_attr is not None  # make pyre/mypy happy
         self.assertEqual(token_attr.device.type, self.device)  # type: ignore
+
+    @parameterized.expand(
+        [
+            (LayerIntegratedGradients, None),
+            (LayerGradientXActivation, None),
+            (LayerGradientShap, (torch.tensor([[1, 0, 1, 0]]),)),
+        ]
+    )
+    def test_llm_attr_with_no_skip_tokens(
+        self, AttrClass: Type[GradientAttribution], baselines: Optional[Tuple[Tensor]]
+    ) -> None:
+        llm = DummyLLM()
+        llm.to(self.device)
+        tokenizer = DummyTokenizer()
+        attr = AttrClass(llm, llm.emb)  # type: ignore[call-arg]
+        llm_attr = LLMGradientAttribution(attr, tokenizer)
+
+        attr_kws: Dict[str, Any] = {}
+        if baselines is not None:
+            attr_kws["baselines"] = tuple(
+                baseline.to(self.device) for baseline in baselines
+            )
+
+        inp = TextTokenInput("a b c", tokenizer)
+        res = llm_attr.attribute(inp, "m n o p q", **attr_kws)
+
+        # 6 output tokens including sos, 4 input tokens including sos
+        self.assertEqual(res.seq_attr.shape, (4,))
+        assert res.token_attr is not None  # make pyre/mypy happy
+        self.assertIsNotNone(res.token_attr)
+        token_attr = res.token_attr
+        self.assertEqual(token_attr.shape, (6, 4))  # type: ignore
+        self.assertEqual(res.input_tokens, ["<sos>", "a", "b", "c"])
+        self.assertEqual(res.output_tokens, ["<sos>", "m", "n", "o", "p", "q"])
+
+        self.assertEqual(res.seq_attr.device.type, self.device)
+        assert res.token_attr is not None  # make pyre/mypy happy
+        self.assertEqual(token_attr.device.type, self.device)  # type: ignore
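
The DummyTokenizer.decode tweak in these tests handles the fact that convert_ids_to_tokens may return either a single token string (for a scalar id) or a list of strings, so joining is only valid in the list case. A standalone sketch of that pattern, with a hypothetical stand-in class (not the test's actual DummyTokenizer):

from typing import List, Union

import torch


class StubTokenizer:
    # Hypothetical stand-in; vocabulary chosen only for illustration.
    vocab = {0: "<sos>", 1: "a", 2: "b", 3: "c"}

    def convert_ids_to_tokens(self, ids: Union[int, List[int]]) -> Union[str, List[str]]:
        if isinstance(ids, int):
            return self.vocab[ids]  # scalar id -> single token string
        return [self.vocab[i] for i in ids]  # list of ids -> list of tokens

    def decode(self, token_ids: torch.Tensor) -> str:
        tokens = self.convert_ids_to_tokens(token_ids.tolist())
        if isinstance(tokens, list):  # join only when we actually got a list
            tokens = " ".join(tokens)
        return tokens


print(StubTokenizer().decode(torch.tensor([0, 1, 2])))  # "<sos> a b"
print(StubTokenizer().decode(torch.tensor(2)))  # "b"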

tests/attr/test_llm_attr_gpu.py
Lines changed: 41 additions & 6 deletions

@@ -84,7 +84,10 @@ def convert_tokens_to_ids(
         raise NotImplementedError

     def decode(self, token_ids: Tensor) -> str:
-        return " ".join(self.convert_ids_to_tokens(token_ids.tolist()))
+        tokens = self.convert_ids_to_tokens(token_ids.tolist())
+        if isinstance(tokens, list):
+            tokens = " ".join(tokens)
+        return tokens


 class Result(NamedTuple):
@@ -195,7 +198,10 @@ def test_llm_attr_gpu(self, AttrClass: Type[PerturbationAttribution]) -> None:

         inp = TextTemplateInput("{} b {} {} e {}", ["a", "c", "d", "f"])
         res = llm_attr.attribute(
-            inp, "m n o p q", use_cached_outputs=self.use_cached_outputs
+            inp,
+            "m n o p q",
+            skip_tokens=[0],
+            use_cached_outputs=self.use_cached_outputs,
         )
         self.assertEqual(res.seq_attr.shape, (4,))
         self.assertEqual(cast(Tensor, res.token_attr).shape, (5, 4))
@@ -234,7 +240,10 @@ def test_llm_attr_fa_log_prob_gpu(self) -> None:

         inp = TextTemplateInput("{} b {} {} e {}", ["a", "c", "d", "f"])
         res = llm_fa.attribute(
-            inp, "m n o p q", use_cached_outputs=self.use_cached_outputs
+            inp,
+            "m n o p q",
+            skip_tokens=[0],
+            use_cached_outputs=self.use_cached_outputs,
         )

         # With FeatureAblation, the seq attr in log_prob
@@ -253,7 +262,10 @@ def test_llm_attr_without_token_gpu(

         inp = TextTemplateInput("{} b {} {} e {}", ["a", "c", "d", "f"])
         res = llm_fa.attribute(
-            inp, "m n o p q", use_cached_outputs=self.use_cached_outputs
+            inp,
+            "m n o p q",
+            skip_tokens=[0],
+            use_cached_outputs=self.use_cached_outputs,
         )

         self.assertEqual(res.seq_attr.shape, (4,))
@@ -280,7 +292,7 @@ def test_llm_attr(self) -> None:
         llm_attr = LLMGradientAttribution(attr, tokenizer)

         inp = TextTokenInput("a b c", tokenizer)
-        res = llm_attr.attribute(inp, "m n o p q")
+        res = llm_attr.attribute(inp, "m n o p q", skip_tokens=[0])
         # 5 output tokens, 4 input tokens including sos
         self.assertEqual(res.seq_attr.shape, (4,))
         assert res.token_attr is not None  # make pyre/mypy happy
@@ -324,7 +336,7 @@ def test_llm_attr_with_skip_tokens(self) -> None:
         llm_attr = LLMGradientAttribution(attr, tokenizer)

         inp = TextTokenInput("a b c", tokenizer, skip_tokens=[0])
-        res = llm_attr.attribute(inp, "m n o p q")
+        res = llm_attr.attribute(inp, "m n o p q", skip_tokens=[0])

         # 5 output tokens, 4 input tokens including sos
         self.assertEqual(res.seq_attr.shape, (3,))
@@ -338,3 +350,26 @@ def test_llm_attr_with_skip_tokens(self) -> None:
         self.assertEqual(res.seq_attr.device.type, self.device)
         assert res.token_attr is not None  # make pyre/mypy happy
         self.assertEqual(token_attr.device.type, self.device)  # type: ignore
+
+    def test_llm_attr_with_no_skip_tokens(self) -> None:
+        llm = DummyLLM()
+        llm.to(self.device)
+        tokenizer = DummyTokenizer()
+        attr = LayerIntegratedGradients(llm, llm.emb)
+        llm_attr = LLMGradientAttribution(attr, tokenizer)
+
+        inp = TextTokenInput("a b c", tokenizer)
+        res = llm_attr.attribute(inp, "m n o p q")
+
+        # 6 output tokens including sos, 4 input tokens including sos
+        self.assertEqual(res.seq_attr.shape, (4,))
+        assert res.token_attr is not None  # make pyre/mypy happy
+        self.assertIsNotNone(res.token_attr)
+        token_attr = res.token_attr
+        self.assertEqual(token_attr.shape, (6, 4))  # type: ignore
+        self.assertEqual(res.input_tokens, ["<sos>", "a", "b", "c"])
+        self.assertEqual(res.output_tokens, ["<sos>", "m", "n", "o", "p", "q"])
+
+        self.assertEqual(res.seq_attr.device.type, self.device)
+        assert res.token_attr is not None  # make pyre/mypy happy
+        self.assertEqual(token_attr.device.type, self.device)  # type: ignore
