@@ -238,22 +238,58 @@ def load(model_plus: 'ModelPlus') -> 'Params':
         return params
 
 
-class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
-        self.vocabtype = vocabtype
-        if self.vocabtype == "bpe":
-          self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
-        else:
-          self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+class BpeVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
+        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
         added_tokens: Dict[str, int]
         if fname_added_tokens is not None:
-            added_tokens = json.load(open(fname_added_tokens))
+            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
             added_tokens = {}
-        if self.vocabtype == "bpe":
-          vocab_size: int = len(self.sentencepiece_tokenizer)
+        vocab_size: int = len(self.bpe_tokenizer)
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_list = [text for (text, idx) in items]
+        self.vocab_size_base: int = vocab_size
+        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens
+
+    def bpe_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        tokenizer = self.bpe_tokenizer
+        from transformers.models.gpt2 import tokenization_gpt2
+        byte_encoder = tokenization_gpt2.bytes_to_unicode()
+        byte_decoder = {v: k for k, v in byte_encoder.items()}
+        for i, item in enumerate(tokenizer):
+            text: bytes = item.encode("utf-8")
+            score: float = -i
+            yield text, score
+
+    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score
+
+    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        yield from self.bpe_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class SentencePieceVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
+        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        added_tokens: Dict[str, int]
+        if fname_added_tokens is not None:
+            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
-          vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+            added_tokens = {}
+        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
         actual_ids = sorted(added_tokens.values())
         if expected_ids != actual_ids:
@@ -267,32 +303,11 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vo
 
     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
-        if self.vocabtype == "bpe":
-          from transformers.models.gpt2 import tokenization_gpt2
-          byte_encoder = tokenization_gpt2.bytes_to_unicode()
-          byte_decoder = {v: k for k, v in byte_encoder.items()}
-          for i, item in enumerate(tokenizer):
-            text: bytes
-            text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
-            score: float = -i
+        for i in range(tokenizer.vocab_size()):
+            piece = tokenizer.id_to_piece(i)
+            text: bytes = piece.encode("utf-8")
+            score: float = tokenizer.get_score(i)
             yield text, score
-        else:
-          for i in range(tokenizer.vocab_size()):
-            text: bytes
-            if tokenizer.is_unknown(i):
-              text = " \u2047 ".encode("utf-8")
-            elif tokenizer.is_control(i):
-              text = b""
-            elif tokenizer.is_byte(i):
-              piece = tokenizer.id_to_piece(i)
-              if len(piece) != 6:
-                raise Exception(f"Invalid token: {piece}")
-              byte_value = int(piece[3:-1], 16)
-              text = struct.pack("B", byte_value)
-            else:
-              text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-            score: float = tokenizer.get_score(i)
-            yield text, score
 
     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
@@ -319,7 +334,7 @@ def __repr__(self) -> str:
         return f"<GGMLVocab with {self.vocab_size} tokens>"
 
 
-Vocab = Union[SentencePieceVocab, GGMLVocab]
+Vocab = Union[BpeVocab, SentencePieceVocab, GGMLVocab]
 
 
 def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
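For orientation, here is a minimal usage sketch of the two new vocab classes. It is not part of the patch: the module name `convert` and the path `models/my-model` are assumptions, and it presumes a HF-style vocab.json plus an optional added_tokens.json next to it.

# Usage sketch (assumptions: convert.py importable as `convert`,
# hypothetical model directory containing vocab.json / added_tokens.json).
from pathlib import Path

import convert

model_dir = Path("models/my-model")              # hypothetical location
added = model_dir / "added_tokens.json"
vocab = convert.BpeVocab(model_dir / "vocab.json",
                         added if added.exists() else None)

# BpeVocab and SentencePieceVocab expose the same (bytes, score) iteration
# interface, so downstream writers can consume either without branching.
n_tokens = sum(1 for _text, _score in vocab.all_tokens())
print(vocab, "->", n_tokens, "tokens total")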
@@ -1044,7 +1059,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
 def check_vocab_size(params: Params, vocab: Vocab) -> None:
     if params.n_vocab != vocab.vocab_size:
         # GGMLVocab comes from the same file as the model so shouldn't mismatch:
-        assert isinstance(vocab, SentencePieceVocab)
+        assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
         if params.n_vocab == vocab.vocab_size_base:
             print("Ignoring added_tokens.json since model matches vocab size without it.")
             vocab.added_tokens_list = []
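The updated assert accepts either vocab class. A slightly more compact equivalent, shown only as a sketch and not as the committed line, passes a tuple of types to isinstance:

# Equivalent to the committed check; isinstance accepts a tuple of types.
assert isinstance(vocab, (BpeVocab, SentencePieceVocab))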
@@ -1093,7 +1108,7 @@ def write_vocab(self, vocab: Vocab) -> None:
     @staticmethod
     def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
         of = OutputFile(fname_out)
-        params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0)
+        params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0, n_kv_head=None)
         of = OutputFile(fname_out)
         of.write_file_header(params, file_type=GGMLFileType.AllF32)
         of.write_vocab(vocab)
@@ -1228,7 +1243,7 @@ def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
     return {name: model[name] for name in TENSORS_LIST if name in model}
 
 
-def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
+def load_vocab(path: Path, vocabtype: Optional[str]) -> Union[BpeVocab, SentencePieceVocab]:
     print(f"vocabtype: {vocabtype}")
     # Be extra-friendly and accept either a file or a directory. Also, if it's
     # a directory, it might be the model directory, and tokenizer.model might
@@ -1250,8 +1265,12 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
             "if it's in another directory, pass the directory as --vocab-dir")
     added_tokens_path = path.parent / "added_tokens.json"
     print(f"Loading vocab file {path}")
-    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
-                              vocabtype)
+    if vocabtype == "bpe":
+        return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    elif vocabtype == "spm":
+        return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    else:
+        raise ValueError(f"Unsupported vocabulary type {vocabtype}")
 
 
 def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
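The new dispatch in load_vocab makes the vocabulary type explicit: "bpe" returns a BpeVocab, "spm" a SentencePieceVocab, and anything else raises ValueError. A sketch of calling it, again assuming the script is importable as `convert` and using a hypothetical model directory:

# Dispatch sketch (not part of the patch; paths and module name are assumptions).
from pathlib import Path

import convert

model_dir = Path("models/my-model")          # hypothetical path

vocab = convert.load_vocab(model_dir, "spm") # "bpe" would return a BpeVocab instead
print(vocab.vocab_size)

try:
    convert.load_vocab(model_dir, "gpt2")    # unrecognized type
except ValueError as err:
    print(err)                               # Unsupported vocabulary type gpt2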