@@ -21,57 +21,65 @@
 # TODO: automate the update of convert-hf-to-gguf.py
 #

+import logging
 import os
 import requests
 import sys
 import json

 from hashlib import sha256
 from enum import IntEnum, auto
+from transformers import AutoTokenizer
+
+logger = logging.getLogger("convert-hf-to-gguf-update")
+

 class TOKENIZER_TYPE(IntEnum):
     SPM = auto()
     BPE = auto()
     WPM = auto()

+
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
 chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

 if len(sys.argv) == 2:
     token = sys.argv[1]
 else:
-    print("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
+    logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
     sys.exit(1)

 # TODO: add models here, base models preferred
 models = [
-    {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
-    {"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
-    {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
-    {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
-    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
-    {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
-    {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
-    {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
-    {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
-    {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
-    {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
-]
+    {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
+    {"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
+    {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
+    {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
+    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
+    {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
+    {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
+    {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
+    {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+    {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
+]

 # make directory "models/tokenizers" if it doesn't exist
 if not os.path.exists("models/tokenizers"):
     os.makedirs("models/tokenizers")

+
 def download_file_with_auth(url, token, save_path):
     headers = {"Authorization": f"Bearer {token}"}
     response = requests.get(url, headers=headers)
     if response.status_code == 200:
         with open(save_path, 'wb') as f:
             f.write(response.content)
-            print(f"File {save_path} downloaded successfully")
+            logger.info(f"File {save_path} downloaded successfully")
     else:
-        print(f"Failed to download file. Status code: {response.status_code}")
+        logger.info(f"Failed to download file. Status code: {response.status_code}")
+

 # download the tokenizer models
 for model in models:
@@ -82,10 +90,10 @@ def download_file_with_auth(url, token, save_path):
     if not os.path.exists(f"models/tokenizers/{name}"):
         os.makedirs(f"models/tokenizers/{name}")
     else:
-        print(f"Directory models/tokenizers/{name} already exists - skipping")
+        logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
         continue

-    print(f"Downloading {name} to models/tokenizers/{name}")
+    logger.info(f"Downloading {name} to models/tokenizers/{name}")

     url = f"{repo}/raw/main/config.json"
     save_path = f"models/tokenizers/{name}/config.json"
@@ -116,76 +124,76 @@ def download_file_with_auth(url, token, save_path):
         continue

     # create the tokenizer
-    from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

     chktok = tokenizer.encode(chktxt)
     chkhsh = sha256(str(chktok).encode()).hexdigest()

-    print(f"model: {name}")
-    print(f"tokt: {tokt}")
-    print(f"repo: {model['repo']}")
-    print(f"chktok: {chktok}")
-    print(f"chkhsh: {chkhsh}")
+    logger.info(f"model: {name}")
+    logger.info(f"tokt: {tokt}")
+    logger.info(f"repo: {model['repo']}")
+    logger.info(f"chktok: {chktok}")
+    logger.info(f"chkhsh: {chkhsh}")

     # print the "pre_tokenizer" content from the tokenizer.json
     with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
         cfg = json.load(f)
         pre_tokenizer = cfg["pre_tokenizer"]
-        print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))

-    print(f"\n")
+    logger.info("")

     src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
     src_ifs += f"            # ref: {model['repo']}\n"
     src_ifs += f"            res = \"{name}\"\n"

-src_func = ""
-src_func += "    def get_vocab_base_pre(self, tokenizer) -> str:\n"
-src_func += "        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n"
-src_func += "        # is specific for the BPE pre-tokenizer used by the model\n"
-src_func += "        # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n"
-src_func += "        # use in llama.cpp to implement the same pre-tokenizer\n"
-src_func += "\n"
-src_func += f"        chktxt = {repr(chktxt)}\n"
-src_func += "\n"
-src_func += "        chktok = tokenizer.encode(chktxt)\n"
-src_func += "        chkhsh = sha256(str(chktok).encode()).hexdigest()\n"
-src_func += "\n"
-src_func += "        print(f\"chktok: {chktok}\")\n"
-src_func += "        print(f\"chkhsh: {chkhsh}\")\n"
-src_func += "\n"
-src_func += "        res = None\n"
-src_func += "\n"
-src_func += "        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n"
-src_func += "        # or pull the latest version of the model from Huggingface\n"
-src_func += "        # don't edit the hashes manually!\n"
-src_func += f"{src_ifs}\n"
-src_func += "        if res is None:\n"
-src_func += "            print(\"\\n\")\n"
-src_func += "            print(\"**************************************************************************************\")\n"
-src_func += "            print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
-src_func += "            print(\"** There are 2 possible reasons for this:\")\n"
-src_func += "            print(\"** - the model has not been added to convert-hf-to-gguf-update.py yet\")\n"
-src_func += "            print(\"** - the pre-tokenization config has changed upstream\")\n"
-src_func += "            print(\"** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n"
-src_func += "            print(\"** ref: https://github.com/ggerganov/llama.cpp/pull/6920\")\n"
-src_func += "            print(\"**\")\n"
-src_func += "            print(f\"** chkhsh: {chkhsh}\")\n"
-src_func += "            print(\"**************************************************************************************\")\n"
-src_func += "            print(\"\\n\")\n"
-src_func += "            raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n"
-src_func += "\n"
-src_func += "        print(f\"tokenizer.ggml.pre: {res}\")\n"
-src_func += "        print(f\"chkhsh: {chkhsh}\")\n"
-src_func += "\n"
-src_func += "        return res\n"
-
-print(src_func)
-
-print("\n")
-print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
-print("\n")
+src_func = f"""
+    def get_vocab_base_pre(self, tokenizer) -> str:
+        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
+        # is specific for the BPE pre-tokenizer used by the model
+        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
+        # use in llama.cpp to implement the same pre-tokenizer
+
+        chktxt = {repr(chktxt)}
+
+        chktok = tokenizer.encode(chktxt)
+        chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+        print(f"chktok: {{chktok}}")
+        print(f"chkhsh: {{chkhsh}}")
+
+        res = None
+
+        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
+        # or pull the latest version of the model from Huggingface
+        # don't edit the hashes manually!
+{src_ifs}
+        if res is None:
+            print("\\n")
+            print("**************************************************************************************")
+            print("** WARNING: The BPE pre-tokenizer was not recognized!")
+            print("** There are 2 possible reasons for this:")
+            print("** - the model has not been added to convert-hf-to-gguf-update.py yet")
+            print("** - the pre-tokenization config has changed upstream")
+            print("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
+            print("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
+            print("**")
+            print(f"** chkhsh: {{chkhsh}}")
+            print("**************************************************************************************")
+            print("\\n")
+            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
+
+        print(f"tokenizer.ggml.pre: {{repr(res)}}")
+        print(f"chkhsh: {{chkhsh}}")
+
+        return res
+"""
+
+print(src_func)  # noqa: NP100
+
+logger.info("\n")
+logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
+logger.info("\n")

 # generate tests for each tokenizer model

@@ -253,7 +261,6 @@ def download_file_with_auth(url, token, save_path):
     tokt = model["tokt"]

     # create the tokenizer
-    from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

     with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
@@ -268,15 +275,15 @@ def download_file_with_auth(url, token, save_path):
                 f.write(f" {r}")
             f.write("\n")

-    print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
+    logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")

 # generate commands for creating vocab files

-print("\nRun the following commands to generate the vocab files for testing:\n")
+logger.info("\nRun the following commands to generate the vocab files for testing:\n")

 for model in models:
     name = model["name"]

-    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
+    logger.info(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")

-print("\n")
+logger.info("\n")
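For reference, a minimal sketch (not part of the diff above) of how the pre-tokenizer fingerprint used by this script can be reproduced by hand for one downloaded tokenizer. The directory name "llama-bpe" and the shortened check string are placeholders for illustration; the full chktxt literal from the script must be used to obtain a hash comparable to the ones recorded in get_vocab_base_pre().

from hashlib import sha256

from transformers import AutoTokenizer

# placeholder: substitute the full chktxt string defined in the script above
chktxt = '\n \n\n 🚀 (normal) ...'

# assumes convert-hf-to-gguf-update.py has already downloaded this tokenizer
tokenizer = AutoTokenizer.from_pretrained("models/tokenizers/llama-bpe")

chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()

print(f"chkhsh: {chkhsh}")  # compare against the hashes emitted into get_vocab_base_pre()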