Skip to content

Commit 83fe012

Browse files
authored
small fix tokenizer regex patch (#42528)
* small fix
* update
* we prob still had 1 issue
* fix
* pop in case
1 parent dac2ad7 commit 83fe012

File tree

1 file changed

+9
-3
lines changed

1 file changed

+9
-3
lines changed

src/transformers/tokenization_utils_tokenizers.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@
4242
TextInput,
4343
TruncationStrategy,
4444
)
45-
from .utils import PaddingStrategy, add_end_docstrings, logging
45+
from .utils import PaddingStrategy, add_end_docstrings, is_offline_mode, logging
4646

4747

4848
logger = logging.get_logger(__name__)
@@ -219,6 +219,7 @@ def __init__(self, *args, **kwargs):
219219

220220
# Optionally patches mistral tokenizers with wrong regex
221221
if vocab_size > 100000 and getattr(self._tokenizer, "pre_tokenizer", None) is not None:
222+
kwargs.pop("tokenizer", None)
222223
self._tokenizer = self._patch_mistral_regex(
223224
self._tokenizer,
224225
self.init_kwargs.get("name_or_path", None),
@@ -1089,7 +1090,12 @@ def is_base_mistral(model_id: str) -> bool:
10891090
return True
10901091
return False
10911092

1092-
if pretrained_model_name_or_path is not None and (is_local or is_base_mistral(pretrained_model_name_or_path)):
1093+
if is_offline_mode():
1094+
is_local = True
1095+
1096+
if pretrained_model_name_or_path is not None and (
1097+
is_local or (not is_local and is_base_mistral(pretrained_model_name_or_path))
1098+
):
10931099
_config_file = cached_file(
10941100
pretrained_model_name_or_path,
10951101
"config.json",
@@ -1126,7 +1132,7 @@ def is_base_mistral(model_id: str) -> bool:
11261132
]
11271133
):
11281134
return tokenizer
1129-
elif transformers_version and version.parse(transformers_version) >= version.parse("5.0.0"):
1135+
elif transformers_version and version.parse(transformers_version) >= version.parse("4.57.3"):
11301136
return tokenizer
11311137

11321138
mistral_config_detected = True

0 commit comments

Comments (0)