@@ -2743,13 +2743,15 @@ def set_vocab(self):

             text = piece.encode("utf-8")
             score = 0.0
-            if len(piece) != 0 and token_id < 64789:
+            # Referencing the tokenizer Python implementation (https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
+            # a score is only valid if token_id is less than tokenizer.tokenizer.sp_model.vocab_size()
+            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
                 score = tokenizer.tokenizer.sp_model.get_score(token_id)

             if len(piece) == 0:
                 text = f"[PAD{token_id}]".encode("utf-8")

-            if token_id >= 64789:
+            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
                 toktype = SentencePieceTokenTypes.UNKNOWN
                 tokens.append(text)
                 scores.append(score)
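For context on the boundary this hunk introduces: the ChatGLM3 tokenizer wraps a SentencePiece model and appends its special tokens after the SentencePiece ids, so only ids below `sp_model.vocab_size()` (64789 for chatglm3-6b, the value previously hardcoded) carry a real piece and score. A minimal standalone sketch, assuming a locally downloaded ChatGLM3 `tokenizer.model` (the path is a placeholder):

```python
# Minimal sketch of the vocab_size() boundary used above; "tokenizer.model"
# is a placeholder path to a locally downloaded ChatGLM3 SentencePiece model.
from sentencepiece import SentencePieceProcessor

sp_model = SentencePieceProcessor()
sp_model.Load("tokenizer.model")

n_sp = sp_model.vocab_size()  # 64789 for chatglm3-6b, the old hardcoded constant
for token_id in (0, n_sp - 1, n_sp):
    if token_id < n_sp:
        # ids below vocab_size() map to real SentencePiece pieces with scores
        print(token_id, sp_model.id_to_piece(token_id), sp_model.get_score(token_id))
    else:
        # ids at or above vocab_size() are appended special tokens;
        # the converter marks these SentencePieceTokenTypes.UNKNOWN
        print(token_id, "appended special token (no sp_model entry)")
```

Using `vocab_size()` instead of the literal 64789 keeps the same logic working for any ChatGLM checkpoint whose SentencePiece vocabulary differs in size.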
@@ -2779,7 +2781,7 @@ def set_vocab(self):
         special_vocab.add_to_gguf(self.gguf_writer)

     def set_gguf_parameters(self):
-        self.gguf_writer.add_name("ChatGLM-6b-chat")
+        self.gguf_writer.add_name(self.dir_model.name)
         n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
         n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
         n_head_kv = self.hparams.get("multi_query_group_num", n_head)
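`self.dir_model.name` is the final component of the model directory path, so the written GGUF name now follows whichever checkpoint was converted instead of always reading "ChatGLM-6b-chat". A tiny illustration, assuming `dir_model` is the `pathlib.Path` the converter was pointed at (the directory path below is made up):

```python
from pathlib import Path

dir_model = Path("/models/chatglm3-6b")  # illustrative checkpoint directory
print(dir_model.name)  # "chatglm3-6b" -> becomes the GGUF model name
```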
@@ -2795,16 +2797,12 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_add_bos_token(False)

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if name.endswith(".rotary_pos_emb.inv_freq"):
-            return []
-
         del bid  # unused

-        name = re.sub(r'transformer\.', '', name)
-
-        if name == "word_embeddings.weight":
-            assert self.tensor_names is not None
+        if name.endswith(".rotary_pos_emb.inv_freq"):
+            return []

+        name = name.removeprefix("transformer.")
         return [(self.map_tensor_name(name), data_torch)]

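The rewritten `modify_tensors` also swaps `re.sub` for `str.removeprefix` (Python 3.9+): the regex deleted every `transformer.` occurrence anywhere in a tensor name, while `removeprefix` strips only a leading one. A standalone comparison (the second name is hypothetical, just to show where the two differ):

```python
import re

names = [
    "transformer.embedding.word_embeddings.weight",  # real prefix: both strip it
    "output_layer.transformer.weight",               # hypothetical inner match
]
for name in names:
    print(re.sub(r'transformer\.', '', name),   # old: removes every occurrence
          name.removeprefix("transformer."))    # new: leading prefix only
```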