Skip to content

Commit ef24df6

Browse files
committed
gguf : make token scores and types optional (ggml-org#3347)
(cherry picked from commit ecf90b1)
1 parent a69f9c2 commit ef24df6

File tree

3 files changed

+8
-22
lines changed

3 files changed

+8
-22
lines changed

convert-falcon-hf-to-gguf.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,6 @@ def parse_args() -> argparse.Namespace:
133133
print("gguf: get tokenizer metadata")
134134

135135
tokens: list[bytearray] = []
136-
scores: list[float] = []
137-
toktypes: list[int] = []
138136

139137
tokenizer_json_file = dir_model / 'tokenizer.json'
140138
if not tokenizer_json_file.is_file():
@@ -177,12 +175,8 @@ def parse_args() -> argparse.Namespace:
177175
text = bytearray(pad_token)
178176

179177
tokens.append(text)
180-
scores.append(0.0) # dymmy
181-
toktypes.append(gguf.TokenType.NORMAL) # dummy
182178

183179
gguf_writer.add_token_list(tokens)
184-
gguf_writer.add_token_scores(scores)
185-
gguf_writer.add_token_types(toktypes)
186180

187181
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
188182
special_vocab.add_to_gguf(gguf_writer)

convert-starcoder-hf-to-gguf.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,6 @@ def parse_args() -> argparse.Namespace:
117117
print("gguf: get tokenizer metadata")
118118

119119
tokens: list[bytearray] = []
120-
scores: list[float] = []
121-
toktypes: list[int] = []
122120

123121
tokenizer_json_file = dir_model / 'tokenizer.json'
124122
if not tokenizer_json_file.is_file():
@@ -161,12 +159,8 @@ def parse_args() -> argparse.Namespace:
161159
text = bytearray(pad_token)
162160

163161
tokens.append(text)
164-
scores.append(0.0) # dymmy
165-
toktypes.append(gguf.TokenType.NORMAL) # dummy
166162

167163
gguf_writer.add_token_list(tokens)
168-
gguf_writer.add_token_scores(scores)
169-
gguf_writer.add_token_types(toktypes)
170164

171165
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
172166
special_vocab.add_to_gguf(gguf_writer)

llama.cpp

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1801,20 +1801,18 @@ static void llm_load_vocab(
18011801
throw std::runtime_error("cannot find tokenizer vocab in model file\n");
18021802
}
18031803

1804+
const float * scores = nullptr;
18041805
const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
1805-
if (score_idx == -1) {
1806-
throw std::runtime_error("cannot find tokenizer scores in model file\n");
1806+
if (score_idx != -1) {
1807+
scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
18071808
}
18081809

1809-
const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
1810-
1810+
const int * toktypes = nullptr;
18111811
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
1812-
if (toktype_idx == -1) {
1813-
throw std::runtime_error("cannot find token type list in GGUF file\n");
1812+
if (toktype_idx != -1) {
1813+
toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
18141814
}
18151815

1816-
const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
1817-
18181816
// determine vocab type
18191817
{
18201818
std::string tokenizer_name;
@@ -1882,8 +1880,8 @@ static void llm_load_vocab(
18821880

18831881
auto & token_data = vocab.id_to_token[i];
18841882
token_data.text = std::move(word);
1885-
token_data.score = scores[i];
1886-
token_data.type = (llama_token_type) toktypes[i];
1883+
token_data.score = scores ? scores[i] : 0.0f;
1884+
token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
18871885
}
18881886

18891887
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'

0 commit comments

Comments (0)