Commit bccb68f

optimize convert-hf-to-gguf.py for chatglm model

Signed-off-by: XingXing Qiao <[email protected]>

1 parent 1db42c8

1 file changed (+8 −10)


convert-hf-to-gguf.py

Lines changed: 8 additions & 10 deletions
@@ -2743,13 +2743,15 @@ def set_vocab(self):
 
             text = piece.encode("utf-8")
             score = 0.0
-            if len(piece) != 0 and token_id < 64789:
+            # Referencing the tokenizer Python implementation (https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
+            # a token score is only valid if token_id is less than tokenizer.tokenizer.sp_model.vocab_size()
+            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
                 score = tokenizer.tokenizer.sp_model.get_score(token_id)
 
             if len(piece) == 0:
                 text = f"[PAD{token_id}]".encode("utf-8")
 
-            if token_id >= 64789:
+            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
                 toktype = SentencePieceTokenTypes.UNKNOWN
             tokens.append(text)
             scores.append(score)
@@ -2779,7 +2781,7 @@ def set_vocab(self):
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name("ChatGLM-6b-chat")
+        self.gguf_writer.add_name(self.dir_model.name)
         n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
         n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
         n_head_kv = self.hparams.get("multi_query_group_num", n_head)
@@ -2795,16 +2797,12 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_add_bos_token(False)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if name.endswith(".rotary_pos_emb.inv_freq"):
-            return []
-
         del bid  # unused
 
-        name = re.sub(r'transformer\.', '', name)
-
-        if name == "word_embeddings.weight":
-            assert self.tensor_names is not None
+        if name.endswith(".rotary_pos_emb.inv_freq"):
+            return []
 
+        name = name.removeprefix("transformer.")
         return [(self.map_tensor_name(name), data_torch)]
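
The first hunk replaces the hardcoded SentencePiece boundary 64789 with a runtime query. A minimal sketch of what that boundary means, assuming the THUDM/chatglm3-6b tokenizer layout referenced in the new comment (a SentencePiece model wrapped at tokenizer.tokenizer.sp_model, as used in the diff itself):

# Sketch only: inspect the boundary the converter now queries instead of
# hardcoding 64789. Assumes the ChatGLM3 tokenizer exposes its SentencePiece
# model at tokenizer.tokenizer.sp_model, per the referenced tokenization_chatglm.py.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)
sp_size = tokenizer.tokenizer.sp_model.vocab_size()

# Ids below sp_size belong to the SentencePiece model and carry real scores;
# ids at or above it are added special tokens, which the converter now marks
# as SentencePieceTokenTypes.UNKNOWN instead of relying on a magic number.
print(sp_size)
print(tokenizer.tokenizer.sp_model.get_score(0))

This makes the conversion robust to ChatGLM variants whose base vocabulary is not exactly 64789 tokens.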
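The last hunk also swaps re.sub(r'transformer\.', '', name) for str.removeprefix (Python 3.9+). The two are not equivalent: re.sub deletes every occurrence of "transformer." anywhere in the tensor name, while removeprefix strips it only when it is the leading prefix. A small illustration, using a hypothetical mid-name occurrence for contrast:

import re

# Typical ChatGLM tensor name: both approaches strip the leading prefix.
name = "transformer.encoder.layers.0.self_attention.query_key_value.weight"
assert re.sub(r'transformer\.', '', name) == name.removeprefix("transformer.")

# Hypothetical name with the substring in the middle: re.sub would still
# rewrite it, while removeprefix leaves it untouched.
odd = "model.transformer.weight"
print(re.sub(r'transformer\.', '', odd))  # -> model.weight
print(odd.removeprefix("transformer."))   # -> model.transformer.weight (unchanged)

removeprefix therefore states the intent (strip one leading namespace) more precisely, and avoids a regex pass per tensor.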