@@ -779,20 +779,29 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
                     break
             yield result
 
-def check_vocab_size(params: Params, vocab: Vocab) -> None:
+def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
     if params.n_vocab != vocab.vocab_size:
         assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
         if params.n_vocab == vocab.vocab_size_base:
             print("Ignoring added_tokens.json since model matches vocab size without it.")
             vocab.added_tokens_list = []
             vocab.vocab_size = vocab.vocab_size_base
             return
+        if pad_vocab and params.n_vocab > vocab.vocab_size:
+            pad_count = params.n_vocab - vocab.vocab_size
+            print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
+            for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
+                vocab.added_tokens_list.append(f'<dummy{i:05}>')
+            vocab.vocab_size = params.n_vocab
+            return
         msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
         if vocab.fname_added_tokens is not None:
             msg += f" combined with {vocab.fname_added_tokens}"
         msg += f" has {vocab.vocab_size})."
         if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
             msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
+        if vocab.vocab_size < params.n_vocab:
+            msg += " Possibly try using the --padvocab option."
         raise Exception(msg)
 
 
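Note: the hunk above is the heart of the change. When pad_vocab is set and the
model expects more tokens than the tokenizer metadata supplies, the gap is
filled with numbered filler tokens instead of raising. A minimal, self-contained
sketch of that scheme (SimpleVocab and pad_vocab_to are illustrative stand-ins,
not names from this patch):

    # Stand-in for the Vocab classes in convert.py: just the two fields
    # that check_vocab_size() touches.
    from dataclasses import dataclass, field

    @dataclass
    class SimpleVocab:
        vocab_size: int
        added_tokens_list: list[str] = field(default_factory=list)

    def pad_vocab_to(vocab: SimpleVocab, n_vocab: int) -> None:
        # Append <dummy00001> .. <dummyNNNNN> until the sizes agree,
        # mirroring the loop added to check_vocab_size() above.
        for i in range(1, (n_vocab - vocab.vocab_size) + 1):
            vocab.added_tokens_list.append(f'<dummy{i:05}>')
        vocab.vocab_size = n_vocab

    vocab = SimpleVocab(vocab_size=31998)
    pad_vocab_to(vocab, 32000)
    print(vocab.added_tokens_list)  # ['<dummy00001>', '<dummy00002>']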
@@ -877,8 +886,12 @@ def close(self) -> None:
         self.gguf.close()
 
     @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_vocab_only(
+        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab: bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
@@ -905,8 +918,14 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
             return dt.quantize(arr)
 
     @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_all(
+        fname_out: Path, ftype: GGMLFileType, params: Params,
+        model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
+        concurrency: int = DEFAULT_CONCURRENCY,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab: bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
@@ -1126,6 +1145,7 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
     parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
     parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
+    parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
 
     args = parser.parse_args(args_in)
     if args.dump_single:
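Note: with --padvocab registered above, conversion opts in from the command
line. A hedged usage sketch via main()'s args_in parameter (the model path and
output name below are placeholders, not from this patch):

    # Hypothetical invocation; "models/llama-7b" and "llama-7b.gguf" are
    # made-up paths. Equivalent to running:
    #   python convert.py models/llama-7b --outfile llama-7b.gguf --padvocab
    main(["models/llama-7b", "--outfile", "llama-7b.gguf", "--padvocab"])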
@@ -1173,7 +1193,8 @@ def main(args_in: list[str] | None = None) -> None:
                                           load_merges=args.vocabtype == 'bpe',
                                           n_vocab=vocab.vocab_size)
         outfile = args.outfile
-        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
+                                    endianess=endianess, pad_vocab=args.padvocab)
         print(f"Wrote {outfile}")
         return
 
@@ -1196,7 +1217,8 @@ def main(args_in: list[str] | None = None) -> None:
     params.ftype = ftype
     print(f"Writing {outfile}, format {ftype}")
 
-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency=args.concurrency, endianess=endianess)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
+                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.padvocab)
     print(f"Wrote {outfile}")
 
 