@@ -1720,6 +1720,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
     ggml_backend_buffer_type_t first_moved_to_buft = nullptr;

+    auto add_lora_tensors = [&](const std::string & lora_name, const std::string & tensor_name) -> void {
+        std::string base_name = tensor_name.substr(0, tensor_name.size() - 6);
+
+        ggml_tensor * lora_a = ml.get_tensor_meta((base_name + "<" + lora_name + ">lora_a").c_str());
+        ggml_tensor * lora_b = ml.get_tensor_meta((base_name + "<" + lora_name + ">lora_b").c_str());
+        loras[lora_name]->ab_map[tensor_name] = llama_adapter_lora_weight(lora_a, lora_b);
+
+        ml.n_created += 2;
+    };
+
     auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
         ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());

@@ -2246,6 +2256,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_NOMIC_BERT_MOE:
             case LLM_ARCH_JINA_BERT_V3:
                 {
+                    std::vector<std::string> lora_names;
+
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
                     type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);

@@ -2262,6 +2274,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
                     tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);

+                    if (arch == LLM_ARCH_JINA_BERT_V3) {
+                        float lora_alpha = 1.0f;
+                        std::vector<std::string> lora_prompt_prefixes;
+
+                        ml.get_key(LLM_KV_ADAPTER_LORA_ALPHA, lora_alpha, false);
+                        ml.get_arr(LLM_KV_ADAPTER_LORA_NAMES, lora_names, false);
+                        ml.get_arr(LLM_KV_ADAPTER_LORA_PROMPT_PREFIXES, lora_prompt_prefixes, false);
+                        GGML_ASSERT(lora_names.size() == lora_prompt_prefixes.size());
+
+                        for (size_t i = 0; i < lora_names.size(); ++i) {
+                            llama_adapter_lora * adapter = new llama_adapter_lora();
+                            std::string lora_name = lora_names[i];
+
+                            adapter->alpha = lora_alpha;
+                            adapter->prompt_prefix = lora_prompt_prefixes[i];
+                            loras[lora_name] = adapter;
+
+                            add_lora_tensors(lora_name, tok_embd->name);
+
+                            if (type_embd) {
+                                add_lora_tensors(lora_name, type_embd->name);
+                            }
+                        }
+                    }
+
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];

@@ -2300,6 +2337,17 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                             }
                         }

+                        if (arch == LLM_ARCH_JINA_BERT_V3) {
+                            GGML_ASSERT(layer.wqkv != nullptr);
+
+                            for (const auto & lora_name : lora_names) {
+                                add_lora_tensors(lora_name, layer.wqkv->name);
+                                add_lora_tensors(lora_name, layer.wo->name);
+                                add_lora_tensors(lora_name, layer.ffn_up->name);
+                                add_lora_tensors(lora_name, layer.ffn_down->name);
+                            }
+                        }
+
                         layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
                         layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
                     }
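For context on the naming used by the `add_lora_tensors` helper above: the lambda strips the trailing `weight` (6 characters) from the base tensor name, keeping the separating dot, and then appends `<lora_name>lora_a` / `<lora_name>lora_b` before looking the tensors up in the model loader. The sketch below reproduces that string composition in isolation; the tensor name `blk.0.attn_qkv.weight` and the adapter name `retrieval.query` are illustrative placeholders, not values taken from this patch.

```cpp
// Minimal standalone sketch of the name composition done by add_lora_tensors.
// Placeholder inputs only; the real code resolves these names via ml.get_tensor_meta().
#include <iostream>
#include <string>

static std::string lora_tensor_name(const std::string & tensor_name,
                                    const std::string & lora_name,
                                    const std::string & suffix) {
    // Drop the trailing "weight" (6 characters); the '.' separator is kept,
    // so "blk.0.attn_qkv.weight" becomes "blk.0.attn_qkv.".
    std::string base_name = tensor_name.substr(0, tensor_name.size() - 6);
    return base_name + "<" + lora_name + ">" + suffix;
}

int main() {
    // Prints:
    //   blk.0.attn_qkv.<retrieval.query>lora_a
    //   blk.0.attn_qkv.<retrieval.query>lora_b
    std::cout << lora_tensor_name("blk.0.attn_qkv.weight", "retrieval.query", "lora_a") << "\n";
    std::cout << lora_tensor_name("blk.0.attn_qkv.weight", "retrieval.query", "lora_b") << "\n";
    return 0;
}
```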