Skip to content

Commit b7ede48

Browse files
committed
llama : fix pre-tokenizers
1 parent e59b546 commit b7ede48

File tree

3 files changed

+56
-42
lines changed

3 files changed

+56
-42
lines changed

convert-hf-to-gguf-update.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,8 @@ class TOKENIZER_TYPE(IntEnum):
5555

5656
# TODO: add models here, base models preferred
5757
models = [
58-
#{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
59-
#{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
58+
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
59+
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
6060
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
6161
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
6262
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
@@ -70,8 +70,9 @@ class TOKENIZER_TYPE(IntEnum):
7070
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
7171
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
7272
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
73-
{"name": "jina-embeddings-v2-base-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
74-
{"name": "jina-embeddings-v2-base-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
73+
{"name": "jina-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
74+
{"name": "jina-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
75+
{"name": "jina-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
7576
]
7677

7778
# make directory "models/tokenizers" if it doesn't exist

convert-hf-to-gguf.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -322,12 +322,17 @@ def get_vocab_base_pre(self, tokenizer) -> str:
322322
# ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
323323
res = "olmo"
324324
if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
325-
# ref: https://huggingface.co/databricks/dbrx-instruct
325+
# ref: https://huggingface.co/databricks/dbrx-base
326326
res = "dbrx"
327+
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
328+
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
329+
res = "jina-en"
327330
if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
328-
res = "default"
331+
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
332+
res = "jina-es"
329333
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
330-
res = "default"
334+
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
335+
res = "jina-de"
331336

332337
if res is None:
333338
logger.warning("\n")

llama.cpp

Lines changed: 43 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -229,40 +229,40 @@ enum llm_arch {
229229
};
230230

231231
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
232-
{ LLM_ARCH_LLAMA, "llama" },
233-
{ LLM_ARCH_FALCON, "falcon" },
234-
{ LLM_ARCH_GROK, "grok" },
235-
{ LLM_ARCH_GPT2, "gpt2" },
236-
{ LLM_ARCH_GPTJ, "gptj" },
237-
{ LLM_ARCH_GPTNEOX, "gptneox" },
238-
{ LLM_ARCH_MPT, "mpt" },
239-
{ LLM_ARCH_BAICHUAN, "baichuan" },
240-
{ LLM_ARCH_STARCODER, "starcoder" },
241-
{ LLM_ARCH_PERSIMMON, "persimmon" },
242-
{ LLM_ARCH_REFACT, "refact" },
243-
{ LLM_ARCH_BERT, "bert" },
244-
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
245-
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2"},
246-
{ LLM_ARCH_BLOOM, "bloom" },
247-
{ LLM_ARCH_STABLELM, "stablelm" },
248-
{ LLM_ARCH_QWEN, "qwen" },
249-
{ LLM_ARCH_QWEN2, "qwen2" },
250-
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
251-
{ LLM_ARCH_PHI2, "phi2" },
252-
{ LLM_ARCH_PHI3, "phi3" },
253-
{ LLM_ARCH_PLAMO, "plamo" },
254-
{ LLM_ARCH_CODESHELL, "codeshell" },
255-
{ LLM_ARCH_ORION, "orion" },
256-
{ LLM_ARCH_INTERNLM2, "internlm2" },
257-
{ LLM_ARCH_MINICPM, "minicpm" },
258-
{ LLM_ARCH_GEMMA, "gemma" },
259-
{ LLM_ARCH_STARCODER2, "starcoder2" },
260-
{ LLM_ARCH_MAMBA, "mamba" },
261-
{ LLM_ARCH_XVERSE, "xverse" },
262-
{ LLM_ARCH_COMMAND_R, "command-r" },
263-
{ LLM_ARCH_DBRX, "dbrx" },
264-
{ LLM_ARCH_OLMO, "olmo" },
265-
{ LLM_ARCH_UNKNOWN, "(unknown)" },
232+
{ LLM_ARCH_LLAMA, "llama" },
233+
{ LLM_ARCH_FALCON, "falcon" },
234+
{ LLM_ARCH_GROK, "grok" },
235+
{ LLM_ARCH_GPT2, "gpt2" },
236+
{ LLM_ARCH_GPTJ, "gptj" },
237+
{ LLM_ARCH_GPTNEOX, "gptneox" },
238+
{ LLM_ARCH_MPT, "mpt" },
239+
{ LLM_ARCH_BAICHUAN, "baichuan" },
240+
{ LLM_ARCH_STARCODER, "starcoder" },
241+
{ LLM_ARCH_PERSIMMON, "persimmon" },
242+
{ LLM_ARCH_REFACT, "refact" },
243+
{ LLM_ARCH_BERT, "bert" },
244+
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
245+
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
246+
{ LLM_ARCH_BLOOM, "bloom" },
247+
{ LLM_ARCH_STABLELM, "stablelm" },
248+
{ LLM_ARCH_QWEN, "qwen" },
249+
{ LLM_ARCH_QWEN2, "qwen2" },
250+
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
251+
{ LLM_ARCH_PHI2, "phi2" },
252+
{ LLM_ARCH_PHI3, "phi3" },
253+
{ LLM_ARCH_PLAMO, "plamo" },
254+
{ LLM_ARCH_CODESHELL, "codeshell" },
255+
{ LLM_ARCH_ORION, "orion" },
256+
{ LLM_ARCH_INTERNLM2, "internlm2" },
257+
{ LLM_ARCH_MINICPM, "minicpm" },
258+
{ LLM_ARCH_GEMMA, "gemma" },
259+
{ LLM_ARCH_STARCODER2, "starcoder2" },
260+
{ LLM_ARCH_MAMBA, "mamba" },
261+
{ LLM_ARCH_XVERSE, "xverse" },
262+
{ LLM_ARCH_COMMAND_R, "command-r" },
263+
{ LLM_ARCH_DBRX, "dbrx" },
264+
{ LLM_ARCH_OLMO, "olmo" },
265+
{ LLM_ARCH_UNKNOWN, "(unknown)" },
266266
};
267267

268268
enum llm_kv {
@@ -3800,6 +3800,12 @@ static void llm_load_hparams(
38003800

38013801
// get hparams kv
38023802
ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
3803+
3804+
// everything past this point is not vocab-related
3805+
if (hparams.vocab_only) {
3806+
return;
3807+
}
3808+
38033809
ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
38043810
ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
38053811
ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
@@ -4417,7 +4423,9 @@ static void llm_load_vocab(
44174423
tokenizer_pre == "starcoder") {
44184424
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
44194425
} else if (
4420-
tokenizer_pre == "gpt-2") {
4426+
tokenizer_pre == "gpt-2" ||
4427+
tokenizer_pre == "jina-es" ||
4428+
tokenizer_pre == "jina-de") {
44214429
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
44224430
} else if (
44234431
tokenizer_pre == "refact") {

0 commit comments

Comments (0)