@@ -327,6 +327,10 @@ enum llm_kv {
327
327
LLM_KV_TOKENIZER_ADD_PREFIX,
328
328
LLM_KV_TOKENIZER_HF_JSON,
329
329
LLM_KV_TOKENIZER_RWKV,
330
+ LLM_KV_TOKENIZER_PREFIX_ID,
331
+ LLM_KV_TOKENIZER_SUFFIX_ID,
332
+ LLM_KV_TOKENIZER_MIDDLE_ID,
333
+ LLM_KV_TOKENIZER_EOT_ID,
330
334
};
331
335
332
336
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -399,6 +403,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
399
403
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
400
404
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
401
405
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
406
+ { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
407
+ { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
408
+ { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
409
+ { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
402
410
};
403
411
404
412
struct LLM_KV {
@@ -2055,10 +2063,10 @@ struct llama_vocab {
2055
2063
int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
2056
2064
2057
2065
id linefeed_id = 13;
2058
- id special_prefix_id = 32007 ;
2059
- id special_middle_id = 32009 ;
2060
- id special_suffix_id = 32008 ;
2061
- id special_eot_id = 32010 ;
2066
+ id special_prefix_id = -1 ;
2067
+ id special_suffix_id = -1 ;
2068
+ id special_middle_id = -1 ;
2069
+ id special_eot_id = -1 ;
2062
2070
2063
2071
bool add_space_prefix = true;
2064
2072
@@ -4072,6 +4080,30 @@ static void llm_load_vocab(
4072
4080
vocab.special_cls_id = -1;
4073
4081
vocab.special_mask_id = -1;
4074
4082
4083
+ // For Fill-In-the-Middle (FIM)/infill models which were converted
4084
+ // prior to support of FIM special tokens in GGUF, the following
4085
+ // will allow those models to continue to work. The general names
4086
+ // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
4087
+ // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
4088
+ // new versions of these models have been published.
4089
+ std::string gen_name;
4090
+ ml.get_key(LLM_KV_GENERAL_NAME, gen_name);
4091
+ std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
4092
+ [](unsigned char c){ return std::tolower(c); });
4093
+ if (gen_name.find("code") != std::string::npos) {
4094
+ if (model.arch == LLM_ARCH_LLAMA) {
4095
+ vocab.special_prefix_id = 32007;
4096
+ vocab.special_suffix_id = 32008;
4097
+ vocab.special_middle_id = 32009;
4098
+ vocab.special_eot_id = 32010;
4099
+ } else if (model.arch == LLM_ARCH_GEMMA) {
4100
+ vocab.special_prefix_id = 67;
4101
+ vocab.special_suffix_id = 69;
4102
+ vocab.special_middle_id = 68;
4103
+ vocab.special_eot_id = 70;
4104
+ }
4105
+ }
4106
+
4075
4107
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
4076
4108
if (add_space_prefix_keyidx != -1) {
4077
4109
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
@@ -4185,13 +4217,17 @@ static void llm_load_vocab(
4185
4217
// special tokens
4186
4218
{
4187
4219
const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
4188
- { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
4189
- { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
4190
- { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
4191
- { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
4192
- { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
4193
- { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
4194
- { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
4220
+ { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
4221
+ { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
4222
+ { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
4223
+ { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
4224
+ { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
4225
+ { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
4226
+ { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
4227
+ { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
4228
+ { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
4229
+ { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
4230
+ { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
4195
4231
};
4196
4232
for (const auto & it : special_token_types) {
4197
4233
const std::string & key = kv(std::get<0>(it));
0 commit comments