Skip to content

Commit e4dad4f

Browse files
authored
Remove-warns (#26483)
* fix stripping
* remove some warnings and update some warnings
* revert changes for other PR
1 parent 1b8decb commit e4dad4f

File tree

4 files changed

+9
-28
lines changed

4 files changed

+9
-28
lines changed

src/transformers/models/llama/tokenization_llama.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def __init__(
125125

126126
if legacy is None:
127127
logger.warning_once(
128-
f"You are using the default legacy behaviour of the {self.__class__}. If you see this, DO NOT PANIC! This is"
128+
f"You are using the default legacy behaviour of the {self.__class__}. This is"
129129
" expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
130130
" If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
131131
" means, and thouroughly read the reason why this was added as explained in"
@@ -138,7 +138,7 @@ def __init__(
138138
self.add_bos_token = add_bos_token
139139
self.add_eos_token = add_eos_token
140140
self.use_default_system_prompt = use_default_system_prompt
141-
self.sp_model = self.get_spm_processor()
141+
self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
142142

143143
super().__init__(
144144
bos_token=bos_token,
@@ -160,9 +160,9 @@ def unk_token_length(self):
160160
return len(self.sp_model.encode(str(self.unk_token)))
161161

162162
# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor
163-
def get_spm_processor(self):
163+
def get_spm_processor(self, from_slow=False):
164164
tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
165-
if self.legacy: # no dependency on protobuf
165+
if self.legacy or from_slow: # no dependency on protobuf
166166
tokenizer.Load(self.vocab_file)
167167
return tokenizer
168168

src/transformers/models/t5/tokenization_t5.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ def __init__(
186186

187187
if legacy is None:
188188
logger.warning_once(
189-
f"You are using the default legacy behaviour of the {self.__class__}. If you see this, DO NOT PANIC! This is"
189+
f"You are using the default legacy behaviour of the {self.__class__}. This is"
190190
" expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
191191
" If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
192192
" means, and thouroughly read the reason why this was added as explained in"
@@ -195,7 +195,7 @@ def __init__(
195195
legacy = True
196196

197197
self.legacy = legacy
198-
self.sp_model = self.get_spm_processor()
198+
self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
199199
self.vocab_file = vocab_file
200200
self._extra_ids = extra_ids
201201

@@ -210,9 +210,10 @@ def __init__(
210210
**kwargs,
211211
)
212212

213-
def get_spm_processor(self):
213+
# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor
214+
def get_spm_processor(self, from_slow=False):
214215
tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
215-
if self.legacy: # no dependency on protobuf
216+
if self.legacy or from_slow: # no dependency on protobuf
216217
tokenizer.Load(self.vocab_file)
217218
return tokenizer
218219

src/transformers/tokenization_utils.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -979,11 +979,6 @@ def _decode(
979979
) -> str:
980980
self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
981981

982-
if spaces_between_special_tokens:
983-
logger.warning_once(
984-
"spaces_between_special_tokens is deprecated and will be removed in transformers v5. It was adding spaces between `added_tokens`, not special tokens, "
985-
"and does not exist in our fast implementation. Future tokenizers will handle the decoding process on a per-model rule."
986-
)
987982
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
988983
legacy_added_tokens = set(self._added_tokens_encoder.keys()) - set(self.all_special_tokens) | {
989984
token for token in self.additional_special_tokens if self.convert_tokens_to_ids(token) >= self.vocab_size

src/transformers/tokenization_utils_base.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2204,11 +2204,6 @@ def _from_pretrained(
22042204
f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary."
22052205
)
22062206
else:
2207-
logger.warning_once(
2208-
"Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`, "
2209-
" it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again."
2210-
" You will see the new `added_tokens_decoder` attribute that will store the relevant information."
2211-
)
22122207
# begin legacy: read the added_tokens_file and update kwargs with special_tokens_map if modified
22132208
if special_tokens_map_file is not None:
22142209
with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
@@ -2277,16 +2272,6 @@ def _from_pretrained(
22772272
# uses the information stored in `added_tokens_decoder`. Checks after addition that we have the same ids
22782273
if init_kwargs.get("slow_to_fast", False):
22792274
tokenizer.add_tokens([token for _, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0])])
2280-
warnings = ""
2281-
for index, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0]):
2282-
if tokenizer.convert_tokens_to_ids(str(token)) != index:
2283-
warnings += f"\texpected id: {tokenizer.convert_tokens_to_ids(str(token))}, found: {index}, token: `{token}`,\n"
2284-
if len(warnings) > 1:
2285-
logger.warn(
2286-
f"You are converting a {slow_tokenizer.__class__.__name__} to a {cls.__name__}, but"
2287-
f" wrong indexes were founds when adding the `added_tokens` from the `slow` tokenizer to the `fast`. "
2288-
f" The following tokens had unexpected id :\n{warnings}. You should try using `from_slow`."
2289-
)
22902275
# finally we add all the special_tokens to make sure eveything is initialized
22912276
tokenizer.add_tokens(tokenizer.all_special_tokens_extended, special_tokens=True)
22922277

0 commit comments

Comments (0)