diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ec3b5697d8f6f..4c566af5fca13 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4064,8 +4064,15 @@ def _is_tokenizer_xlmroberta(self) -> bool: class XLMRobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any): + hparams = kwargs.pop("hparams", None) + if hparams is None: + hparams = ModelBase.load_hparams(dir_model) + + if hparams.get("lora_adaptations"): + self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3 + + super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs) self._xlmroberta_tokenizer_init() def set_vocab(self): @@ -4077,13 +4084,34 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.startswith("roberta."): name = name[8:] + # jina-embeddings-v3 + if ".parametrizations." in name: + name = name.replace(".parametrizations.", ".") + if name.endswith(".original"): + name = name[:-9] + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor if name == "embeddings.position_embeddings.weight": if self._position_offset is not None: data_torch = data_torch[self._position_offset:,:] + if name.endswith(".lora_A"): + # TODO: convert loras + return [] + + if name.endswith(".lora_B"): + # TODO: convert loras + return [] + return super().modify_tensors(data_torch, name, bid) + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # jina-embeddings-v3 + if rotary_emb_base := self.hparams.get("rotary_emb_base"): + self.gguf_writer.add_rope_freq_base(rotary_emb_base) + @ModelBase.register("GemmaForCausalLM") class GemmaModel(TextModel): diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 3ee2b2064e1b4..ae30129b313fd 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -292,6 +292,7 @@ class MODEL_ARCH(IntEnum): NOMIC_BERT = auto() NOMIC_BERT_MOE = auto() JINA_BERT_V2 = auto() + JINA_BERT_V3 = auto() BLOOM = auto() STABLELM = auto() QWEN = auto() @@ -572,6 +573,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.NOMIC_BERT: "nomic-bert", MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe", MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", + MODEL_ARCH.JINA_BERT_V3: "jina-bert-v3", MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.STABLELM: "stablelm", MODEL_ARCH.QWEN: "qwen", @@ -1095,6 +1097,18 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.LAYER_OUT_NORM, MODEL_TENSOR.CLS, ], + MODEL_ARCH.JINA_BERT_V3: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.TOKEN_TYPES, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_OUT_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.LAYER_OUT_NORM, + ], MODEL_ARCH.MPT: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index c0590e105c889..0ef81054b0373 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -21,6 +21,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" }, { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, + { LLM_ARCH_JINA_BERT_V3, "jina-bert-v3" }, { LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_STABLELM, "stablelm" }, { LLM_ARCH_QWEN, "qwen" }, @@ -516,6 +517,20 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_CLS, "cls" }, }, }, + { + LLM_ARCH_JINA_BERT_V3, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_TOKEN_TYPES, "token_types" }, + { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + }, + }, { LLM_ARCH_BLOOM, { diff --git a/src/llama-arch.h b/src/llama-arch.h index 930cb4eca33ab..1dcd4fa3510ba 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -25,6 +25,7 @@ enum llm_arch { LLM_ARCH_NOMIC_BERT, LLM_ARCH_NOMIC_BERT_MOE, LLM_ARCH_JINA_BERT_V2, + LLM_ARCH_JINA_BERT_V3, LLM_ARCH_BLOOM, LLM_ARCH_STABLELM, LLM_ARCH_QWEN, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 50264a69aac0e..9ac88c27c5f79 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -44,6 +44,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_410M: return "410M"; case LLM_TYPE_450M: return "450M"; case LLM_TYPE_475M: return "475M"; + case LLM_TYPE_558M: return "558M"; case LLM_TYPE_770M: return "770M"; case LLM_TYPE_780M: return "780M"; case LLM_TYPE_0_5B: return "0.5B"; @@ -717,6 +718,18 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_JINA_BERT_V3: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); + + switch (hparams.n_layer) { + case 24: + type = LLM_TYPE_558M; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT_MOE: { @@ -2115,6 +2128,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT_MOE: + case LLM_ARCH_JINA_BERT_V3: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED); @@ -2150,24 +2164,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0); if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) { - layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0); layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0); layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); } else { - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); - - if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) { - layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); - layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0); - layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); - } else { + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); + + if (arch == LLM_ARCH_NOMIC_BERT) { layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); } } @@ -5949,7 +5961,7 @@ struct llm_build_bert : public llm_graph_context { Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // RoPE - if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) { + if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) { Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, @@ -6009,7 +6021,7 @@ struct llm_build_bert : public llm_graph_context { 0.0f, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(cur, "ffn_moe_out", il); - } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) { + } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) { cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, @@ -13193,6 +13205,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, switch (arch) { case LLM_ARCH_BERT: case LLM_ARCH_JINA_BERT_V2: + case LLM_ARCH_JINA_BERT_V3: case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT_MOE: case LLM_ARCH_WAVTOKENIZER_DEC: @@ -13298,6 +13311,7 @@ llm_graph_result_ptr llama_model::build_graph( } break; case LLM_ARCH_BERT: case LLM_ARCH_JINA_BERT_V2: + case LLM_ARCH_JINA_BERT_V3: case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT_MOE: { @@ -13669,6 +13683,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_GROK: case LLM_ARCH_DBRX: case LLM_ARCH_BERT: + case LLM_ARCH_JINA_BERT_V3: case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT_MOE: case LLM_ARCH_STABLELM: diff --git a/src/llama-model.h b/src/llama-model.h index cbea2cb331b62..77e36f12f252b 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -37,6 +37,7 @@ enum llm_type { LLM_TYPE_410M, LLM_TYPE_450M, LLM_TYPE_475M, + LLM_TYPE_558M, LLM_TYPE_770M, LLM_TYPE_780M, LLM_TYPE_0_5B, diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index b51976699ca7b..00d26ff85a494 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2096,7 +2096,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { // set attributes by model/tokenizer/architecture name if (false || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"}) - || _contains_any(general_arch, {"nomic-bert-moe"}) + || _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"}) ) { _set_token_attr("", LLAMA_TOKEN_ATTR_LSTRIP, true); } else if (_contains_any(model_name, {"phi-3", "phi3"})) {