Skip to content

Commit e4dad4f

Browse files
authored
Remove-warns (#26483)
* fix stripping
* remove some warnings and update some warnings
* revert changes for other PR
1 parent 1b8decb commit e4dad4f

File tree

4 files changed

+9
-28
lines changed

4 files changed

+9
-28
lines changed

src/transformers/models/llama/tokenization_llama.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def __init__(
125125

126126
if legacy is None:
127127
logger.warning_once(
128-
f"You are using the default legacy behaviour of the {self.__class__}. If you see this, DO NOT PANIC! This is"
128+
f"You are using the default legacy behaviour of the {self.__class__}. This is"
129129
" expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
130130
" If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
131131
" means, and thouroughly read the reason why this was added as explained in"
@@ -138,7 +138,7 @@ def __init__(
138138
self.add_bos_token = add_bos_token
139139
self.add_eos_token = add_eos_token
140140
self.use_default_system_prompt = use_default_system_prompt
141-
self.sp_model = self.get_spm_processor()
141+
self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
142142

143143
super().__init__(
144144
bos_token=bos_token,
@@ -160,9 +160,9 @@ def unk_token_length(self):
160160
return len(self.sp_model.encode(str(self.unk_token)))
161161

162162
# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor
163-
def get_spm_processor(self):
163+
def get_spm_processor(self, from_slow=False):
164164
tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
165-
if self.legacy: # no dependency on protobuf
165+
if self.legacy or from_slow: # no dependency on protobuf
166166
tokenizer.Load(self.vocab_file)
167167
return tokenizer
168168

src/transformers/models/t5/tokenization_t5.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ def __init__(
186186

187187
if legacy is None:
188188
logger.warning_once(
189-
f"You are using the default legacy behaviour of the {self.__class__}. If you see this, DO NOT PANIC! This is"
189+
f"You are using the default legacy behaviour of the {self.__class__}. This is"
190190
" expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
191191
" If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
192192
" means, and thouroughly read the reason why this was added as explained in"
@@ -195,7 +195,7 @@ def __init__(
195195
legacy = True
196196

197197
self.legacy = legacy
198-
self.sp_model = self.get_spm_processor()
198+
self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
199199
self.vocab_file = vocab_file
200200
self._extra_ids = extra_ids
201201

@@ -210,9 +210,10 @@ def __init__(
210210
**kwargs,
211211
)
212212

213-
def get_spm_processor(self):
213+
# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor
214+
def get_spm_processor(self, from_slow=False):
214215
tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
215-
if self.legacy: # no dependency on protobuf
216+
if self.legacy or from_slow: # no dependency on protobuf
216217
tokenizer.Load(self.vocab_file)
217218
return tokenizer
218219

src/transformers/tokenization_utils.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -979,11 +979,6 @@ def _decode(
979979
) -> str:
980980
self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
981981

982-
if spaces_between_special_tokens:
983-
logger.warning_once(
984-
"spaces_between_special_tokens is deprecated and will be removed in transformers v5. It was adding spaces between `added_tokens`, not special tokens, "
985-
"and does not exist in our fast implementation. Future tokenizers will handle the decoding process on a per-model rule."
986-
)
987982
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
988983
legacy_added_tokens = set(self._added_tokens_encoder.keys()) - set(self.all_special_tokens) | {
989984
token for token in self.additional_special_tokens if self.convert_tokens_to_ids(token) >= self.vocab_size

src/transformers/tokenization_utils_base.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2204,11 +2204,6 @@ def _from_pretrained(
22042204
f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary."
22052205
)
22062206
else:
2207-
logger.warning_once(
2208-
"Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`, "
2209-
" it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again."
2210-
" You will see the new `added_tokens_decoder` attribute that will store the relevant information."
2211-
)
22122207
# begin legacy: read the added_tokens_file and update kwargs with special_tokens_map if modified
22132208
if special_tokens_map_file is not None:
22142209
with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
@@ -2277,16 +2272,6 @@ def _from_pretrained(
22772272
# uses the information stored in `added_tokens_decoder`. Checks after addition that we have the same ids
22782273
if init_kwargs.get("slow_to_fast", False):
22792274
tokenizer.add_tokens([token for _, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0])])
2280-
warnings = ""
2281-
for index, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0]):
2282-
if tokenizer.convert_tokens_to_ids(str(token)) != index:
2283-
warnings += f"\texpected id: {tokenizer.convert_tokens_to_ids(str(token))}, found: {index}, token: `{token}`,\n"
2284-
if len(warnings) > 1:
2285-
logger.warn(
2286-
f"You are converting a {slow_tokenizer.__class__.__name__} to a {cls.__name__}, but"
2287-
f" wrong indexes were founds when adding the `added_tokens` from the `slow` tokenizer to the `fast`. "
2288-
f" The following tokens had unexpected id :\n{warnings}. You should try using `from_slow`."
2289-
)
22902275
# finally we add all the special_tokens to make sure eveything is initialized
22912276
tokenizer.add_tokens(tokenizer.all_special_tokens_extended, special_tokens=True)
22922277

0 commit comments

Comments (0)