Skip to content

Commit 9aa6724

Browse files
author
Joan Fontanals
authored
llama : rename jina tokenizers to v2 (#7249)
* refactor: rename jina tokenizers to v2
* refactor: keep refactoring non-breaking
1 parent b1f8af1 commit 9aa6724

File tree

3 files changed

+9
-7
lines changed

3 files changed

+9
-7
lines changed

convert-hf-to-gguf-update.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -74,9 +74,9 @@ class TOKENIZER_TYPE(IntEnum):
7474
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
7575
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
7676
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
77-
{"name": "jina-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
78-
{"name": "jina-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
79-
{"name": "jina-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
77+
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
78+
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
79+
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
8080
]
8181

8282
# make directory "models/tokenizers" if it doesn't exist

convert-hf-to-gguf.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -475,13 +475,13 @@ def get_vocab_base_pre(self, tokenizer) -> str:
475475
res = "dbrx"
476476
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
477477
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
478-
res = "jina-en"
478+
res = "jina-v2-en"
479479
if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
480480
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
481-
res = "jina-es"
481+
res = "jina-v2-es"
482482
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
483483
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
484-
res = "jina-de"
484+
res = "jina-v2-de"
485485

486486
if res is None:
487487
logger.warning("\n")

llama.cpp

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4424,7 +4424,9 @@ static void llm_load_vocab(
44244424
} else if (
44254425
tokenizer_pre == "gpt-2" ||
44264426
tokenizer_pre == "jina-es" ||
4427-
tokenizer_pre == "jina-de") {
4427+
tokenizer_pre == "jina-de" ||
4428+
tokenizer_pre == "jina-v2-es" ||
4429+
tokenizer_pre == "jina-v2-de") {
44284430
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
44294431
} else if (
44304432
tokenizer_pre == "refact") {

0 commit comments

Comments
 (0)