Commit bacac27
LLM tokenizer pretty decoding fix for emojis/unicode (#1360)
Summary:
Pull Request resolved: #1360
Emojis are not well handled by the current decoding logic (see the example in the test plan). What happens is that emojis/unicode characters are tokenized as two symbols: I believe one indicates extended unicode (or perhaps a class of unicode, e.g., emoji), and the second is the specific character (e.g., smiley face, omega, ...). This fix assumes that such tokens always come in pairs and that the tokenizer returns the replacement symbol "�" when a token cannot be decoded on its own (we verify that "�" was not the intended symbol by running it back through the tokenizer). The logic will break down if a symbol is split across three or more tokens.
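For illustration only (not code from this diff): the "�" behaviour comes from decoding only part of a multi-byte UTF-8 sequence, which a plain Python snippet can reproduce without any tokenizer. The split point below is illustrative; a real tokenizer may cut the bytes differently.

```python
# '😂' is four UTF-8 bytes; a token covering only some of them cannot be
# decoded into a complete character, so a lossy decode yields '�' (U+FFFD).
raw = "😂".encode("utf-8")                                # b'\xf0\x9f\x98\x82'
first_half = raw[:2].decode("utf-8", errors="replace")   # replacement char(s)
second_half = raw[2:].decode("utf-8", errors="replace")  # replacement char(s)
whole = raw.decode("utf-8")                               # '😂'
print(first_half, second_half, whole)
```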
Example:
Input String: 😂
Output Token IDs: list of length 2
Pretty Decoded Tokens: ['😂[1/2]', '😂[2/2]']
Note that we cannot simply output a single token here, since we will be providing attributions for each of the token IDs. For attribution, all such multi-token symbols should really be grouped together so that inputs are valid and the attributions make sense.
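A minimal sketch of the pairing heuristic described above, assuming a HuggingFace-style tokenizer with `decode`/`encode` methods; the function name `pretty_decode` and the grouping details are illustrative, not the code in this diff:

```python
from typing import List


def pretty_decode(tokenizer, token_ids: List[int]) -> List[str]:
    """Illustrative sketch (not the actual diff): label multi-token unicode
    symbols as 'X[i/n]', assuming a HuggingFace-style tokenizer."""
    pretty: List[str] = []
    i = 0
    while i < len(token_ids):
        piece = tokenizer.decode([token_ids[i]])
        # '�' may be a genuine "unknown" token; only treat it as a partial
        # character if re-encoding the piece does not reproduce this id.
        if "\ufffd" in piece and tokenizer.encode(
            piece, add_special_tokens=False
        ) != [token_ids[i]]:
            # Assume the symbol was split into exactly two tokens (as in the
            # example above): decode the pair together and label each position.
            pair = token_ids[i : i + 2]
            symbol = tokenizer.decode(pair)
            pretty.extend(
                f"{symbol}[{k + 1}/{len(pair)}]" for k in range(len(pair))
            )
            i += len(pair)
        else:
            pretty.append(piece)
            i += 1
    return pretty
```

With the example above, this would return `['😂[1/2]', '😂[2/2]']`; as noted, the heuristic breaks down if a symbol spans three or more tokens.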
Reviewed By: cyrjano
Differential Revision: D63435671
fbshipit-source-id: c029ab17b7c7e6ef1a3fff429da2ecb902d425951
parent 15738d0
1 file changed: +26 −1 lines changed
[Diff table not captured: the change removes original line 233 and adds new lines 233–258 (+26 −1); context lines 230–232 and 234–236 are unchanged.]