Skip to content

Commit be55134

Browse files
authored
convert : refactor vocab selection logic (#6355)
1 parent 66ba560 commit be55134

File tree

4 files changed

+204
-176
lines changed

4 files changed

+204
-176
lines changed

convert-hf-to-gguf.py

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
2424
import gguf
2525

26-
from convert import HfVocab
26+
from convert import LlamaHfVocab
2727

2828

2929
###### MODEL DEFINITIONS ######
@@ -230,7 +230,7 @@ def _get_part_names(self):
230230
def _set_vocab_gpt2(self):
231231
dir_model = self.dir_model
232232
hparams = self.hparams
233-
tokens: list[bytearray] = []
233+
tokens: list[str] = []
234234
toktypes: list[int] = []
235235

236236
from transformers import AutoTokenizer
@@ -243,8 +243,7 @@ def _set_vocab_gpt2(self):
243243

244244
for i in range(vocab_size):
245245
if i not in reverse_vocab:
246-
pad_token = f"[PAD{i}]".encode('utf-8')
247-
tokens.append(bytearray(pad_token))
246+
tokens.append(f"[PAD{i}]")
248247
toktypes.append(gguf.TokenType.USER_DEFINED)
249248
elif reverse_vocab[i] in added_vocab:
250249
tokens.append(reverse_vocab[i])
@@ -266,7 +265,7 @@ def _set_vocab_gpt2(self):
266265
def _set_vocab_qwen(self):
267266
dir_model = self.dir_model
268267
hparams = self.hparams
269-
tokens: list[bytearray] = []
268+
tokens: list[str] = []
270269
toktypes: list[int] = []
271270

272271
from transformers import AutoTokenizer
@@ -291,8 +290,7 @@ def _set_vocab_qwen(self):
291290

292291
for i in range(vocab_size):
293292
if i not in reverse_vocab:
294-
pad_token = f"[PAD{i}]".encode("utf-8")
295-
tokens.append(bytearray(pad_token))
293+
tokens.append(f"[PAD{i}]")
296294
toktypes.append(gguf.TokenType.USER_DEFINED)
297295
elif reverse_vocab[i] in added_vocab:
298296
tokens.append(reverse_vocab[i])
@@ -372,12 +370,8 @@ def _set_vocab_sentencepiece(self):
372370
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
373371
special_vocab.add_to_gguf(self.gguf_writer)
374372

375-
def _set_vocab_hf(self):
376-
path = self.dir_model
377-
added_tokens_path = self.dir_model
378-
vocab = HfVocab(
379-
path, added_tokens_path if added_tokens_path.exists() else None
380-
)
373+
def _set_vocab_llama_hf(self):
374+
vocab = LlamaHfVocab(self.dir_model)
381375
tokens = []
382376
scores = []
383377
toktypes = []
@@ -1099,7 +1093,7 @@ def set_gguf_parameters(self):
10991093
self.gguf_writer.add_file_type(self.ftype)
11001094

11011095
def set_vocab(self):
1102-
self._set_vocab_hf()
1096+
self._set_vocab_llama_hf()
11031097

11041098
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
11051099
if n_kv_head is not None and n_head != n_kv_head:
@@ -1700,11 +1694,8 @@ def set_gguf_parameters(self):
17001694
self.gguf_writer.add_pooling_type(pooling_type)
17011695

17021696
def set_vocab(self):
1703-
path = self.dir_model
1704-
added_tokens_path = self.dir_model if self.dir_model.exists() else None
1705-
17061697
# use huggingface vocab to get all tokens
1707-
vocab = HfVocab(path, added_tokens_path)
1698+
vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
17081699
tokens, scores, toktypes = zip(*vocab.all_tokens())
17091700
assert len(tokens) == vocab.vocab_size
17101701
self.vocab_size = vocab.vocab_size

convert-persimmon-to-gguf.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -106,12 +106,12 @@ def main():
106106
tensor_map = gguf.get_tensor_name_map(arch, block_count)
107107
print(tensor_map)
108108
for name in tensors.keys():
109-
data = tensors[name]
109+
data_torch = tensors[name]
110110
if name.endswith(".self_attention.rotary_emb.inv_freq"):
111111
continue
112-
old_dtype = data.dtype
112+
old_dtype = data_torch.dtype
113113
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
114-
data = data.to(torch.float32).squeeze().numpy()
114+
data = data_torch.to(torch.float32).squeeze().numpy()
115115
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
116116
if new_name is None:
117117
print("Can not map tensor '" + name + "'")

0 commit comments

Comments (0)