Skip to content

Commit 4f64f7e

Browse files
committed
check tokenizer hash
1 parent ef69410 commit 4f64f7e

File tree

1 file changed

+7
-3
lines changed

1 file changed

+7
-3
lines changed

convert.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import enum
88
import faulthandler
99
import functools
10+
import hashlib
1011
import itertools
1112
import json
1213
import math
@@ -1630,6 +1631,11 @@ def write_llama3_tokenizer(tokenizer_path, input_tokenizer_path):
16301631
tokenizer.save_pretrained(tokenizer_path)
16311632
return tokenizer
16321633

1634+
def is_llama3_tokenizer(tokenizer_path) -> bool:
    """Return True if the file at *tokenizer_path* is the known Llama 3 tokenizer.

    Identification is done by comparing the SHA-256 digest of the file's raw
    bytes against the known hash of the official Llama 3 ``tokenizer.model``.

    A missing or unreadable file is treated as "not a Llama 3 tokenizer"
    rather than an error: callers probe arbitrary model directories (the
    previous implementation swallowed the failure with a bare ``except``),
    so a model without a ``tokenizer.model`` must simply yield False.
    """
    # SHA-256 of the official Llama 3 tokenizer.model file.
    llama3_tokenizer_model_hash: str = "82e9d31979e92ab929cd544440f129d9ecd797b69e327f80f17e1c50d5551b55"
    try:
        with open(tokenizer_path, "rb") as f:
            tokenizer_hash = hashlib.sha256(f.read()).hexdigest()
    except OSError:
        # File absent or unreadable -> not a Llama 3 model directory.
        return False
    return tokenizer_hash == llama3_tokenizer_model_hash
16331639

16341640
def main(args_in: list[str] | None = None) -> None:
16351641
output_choices = ["f32", "f16"]
@@ -1667,12 +1673,10 @@ def main(args_in: list[str] | None = None) -> None:
16671673
metadata = Metadata.load(args.metadata)
16681674

16691675
#TODO: add more bandaids for llama 3 detection
1670-
try:
1676+
if is_llama3_tokenizer(os.path.join(args.model, "tokenizer.model")):
16711677
global is_llama3_model
16721678
write_llama3_tokenizer(args.model, os.path.join(args.model, "tokenizer.model"))
16731679
is_llama3_model = True
1674-
except:
1675-
pass
16761680

16771681
if args.get_outfile:
16781682
model_plus = load_some_model(args.model)

0 commit comments

Comments
 (0)