@@ -195,6 +195,7 @@ enum llm_arch {
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
+    LLM_ARCH_PHI2,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -212,6 +213,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_BLOOM,    "bloom"    },
     { LLM_ARCH_STABLELM, "stablelm" },
     { LLM_ARCH_QWEN,     "qwen"     },
+    { LLM_ARCH_PHI2,     "phi2"     },
 };
 
 enum llm_kv {
@@ -550,6 +552,19 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PHI2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,    "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+        },
+    },
 
     {
         LLM_ARCH_UNKNOWN,
@@ -1420,6 +1435,7 @@ struct llama_model {
     struct ggml_tensor * output_norm;
     struct ggml_tensor * output_norm_b;
     struct ggml_tensor * output;
+    struct ggml_tensor * output_b;
 
     std::vector<llama_layer> layers;
 
@@ -3625,7 +3641,77 @@ static void llm_load_tensors(
                         }
                     }
                 } break;
+            case LLM_ARCH_PHI2:
+                {
+                    // TODO: CPU-only for now
+
+                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+                    // output
+                    {
+                        ggml_backend_type backend_norm;
+                        ggml_backend_type backend_output;

+                        if (n_gpu_layers > int(n_layer)) {
+                            backend_norm   = llama_backend_offload;
+                            backend_output = llama_backend_offload_split;
+                        } else {
+                            backend_norm   = GGML_BACKEND_CPU;
+                            backend_output = GGML_BACKEND_CPU;
+                        }
+
+                        model.output_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
+                        model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd},          backend_norm);
+                        model.output        = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);
+                        model.output_b      = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab},         backend_output);
+
+                        if (backend_norm == GGML_BACKEND_GPU) {
+                            vram_weights += ggml_nbytes(model.output_norm);
+                            vram_weights += ggml_nbytes(model.output_norm_b);
+                        }
+                        if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                            vram_weights += ggml_nbytes(model.output);
+                            vram_weights += ggml_nbytes(model.output_b);
+                        }
+                    }
+
+                    const uint32_t n_ff = hparams.n_ff;
+
+                    const int i_gpu_start = n_layer - n_gpu_layers;
+
+                    model.layers.resize(n_layer);
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                        layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, backend);
+
+                        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                        layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa},         backend);
+
+                        layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+                        layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd},         backend);
+
+                        layer.ffn_down   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+                        layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd},       backend);
+
+                        layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                        layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff},         backend);
+
+                        if (backend == GGML_BACKEND_GPU) {
+                            vram_weights +=
+                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+                                ggml_nbytes(layer.wqkv)      + ggml_nbytes(layer.bqkv)        +
+                                ggml_nbytes(layer.wo)        + ggml_nbytes(layer.bo)          +
+                                ggml_nbytes(layer.ffn_up)    + ggml_nbytes(layer.ffn_up_b)    +
+                                ggml_nbytes(layer.ffn_down)  + ggml_nbytes(layer.ffn_down_b);
+                        }
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
     }
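A note on the fused QKV shapes above (not part of the patch): elsewhere in llama.cpp, n_embd_gqa works out to n_embd_head * n_head_kv, so the {n_embd, n_embd + 2*n_embd_gqa} wqkv weight stacks the query, key and value projections into a single matrix. Phi-2 uses plain multi-head attention (n_head_kv equal to n_head), so each token's fused output is simply 3*n_embd floats wide. Below is a minimal sketch of the element offsets at which build_phi2 later slices that output; the struct and helper names are illustrative, not from the patch:

// Sketch only: float offsets of the Q/K/V blocks inside one token's fused
// QKV output, mirroring the ggml_view_2d offsets used in build_phi2 below.
struct qkv_split {
    size_t q_off; // query block starts at 0
    size_t k_off; // key block starts after the n_embd query values
    size_t v_off; // value block starts after the query and key values
};

static qkv_split fused_qkv_split(size_t n_embd, size_t n_embd_gqa) {
    return { 0, n_embd, n_embd + n_embd_gqa };
}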
@@ -5417,6 +5503,101 @@ struct llm_build_context {

         ggml_build_forward_expand(gf, cur);

+        return gf;
+    }
+    struct ggml_cgraph * build_phi2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * attn_norm_output;
+        struct ggml_tensor * ffn_output;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_scale
+        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+        cb(KQ_scale, "KQ_scale", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        for (int il = 0; il < n_layer; ++il) {
+
+            attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(attn_norm_output, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                // RoPE
+                Qcur = ggml_rope(ctx0, Qcur, inp_pos, 32, 2, 0);
+                Kcur = ggml_rope(ctx0, Kcur, inp_pos, 32, 2, 0);
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            // FF
+            {
+                ffn_output = llm_build_ffn(ctx0, attn_norm_output,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
+                        NULL,                      NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(ffn_output, "ffn_out", il);
+            }
+
+            inpL = ggml_add(ctx0, cur, ggml_add_inplace(ctx0, ffn_output, inpL));
+            cb(inpL, "l_out", il);
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        cur = ggml_add(ctx0, cur, model.output_b);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
         return gf;
     }
 };
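Two details in build_phi2 are easy to miss. First, the FFN takes attn_norm_output rather than the attention result, and the layer output adds the attention output, the FFN output and the layer input together, which is phi-2's parallel attention/MLP block rather than the sequential residuals used by most of the other builders here. Second, the hard-coded ggml_rope arguments: the 32 appears to be phi-2's rotary dimension count (RoPE is applied to only part of each 80-wide head) and the 2 selects ggml's NeoX-style rotation mode; both readings are an interpretation, not something the patch states. The following is a rough, self-contained sketch of what partial NeoX-style rotary embedding over the first n_rot dimensions of one head does; it is illustrative math only, not the ggml implementation:

// Illustrative only: rotate the first n_rot dimensions of a single attention
// head by position-dependent angles; dimensions beyond n_rot pass through.
#include <cmath>
#include <vector>

static void rope_neox_partial(std::vector<float> & head, int pos, int n_rot, float freq_base = 10000.0f) {
    for (int i = 0; i < n_rot/2; ++i) {
        // frequency for this dimension pair (NeoX pairs dim i with dim i + n_rot/2)
        const float theta = float(pos) * std::pow(freq_base, -2.0f*float(i)/float(n_rot));
        const float c  = std::cos(theta);
        const float s  = std::sin(theta);
        const float x0 = head[i];
        const float x1 = head[i + n_rot/2];
        head[i]           = x0*c - x1*s;
        head[i + n_rot/2] = x0*s + x1*c;
    }
    // head[n_rot .. head.size()) is left untouched (partial rotary)
}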
@@ -5917,6 +6098,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_qwen();
             } break;
+        case LLM_ARCH_PHI2:
+            {
+                result = llm.build_phi2();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -6051,7 +6236,7 @@ static int llama_decode_internal(
     ggml_allocr_alloc_graph(lctx.alloc, gf);

     struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 3];

     GGML_ASSERT(strcmp(res->name, "result_output") == 0);
     GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
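Why the index moves from -2 to -3: the phi-2 graph built above ends with the final layer norm (tagged "result_norm"), then a ggml_mul_mat with model.output and a ggml_add of model.output_b (both tagged "result_output"), so the "result_norm" tensor is now the third node from the end rather than the second. Note that this lookup is shared by every architecture, so the fixed -3 relies on the other builders ending their graphs with the same three-node tail. A more defensive alternative would be to find the tensor by name; the sketch below is illustrative only and not what the patch does (it assumes strcmp from <cstring> is available, as in the surrounding code):

// Sketch (not in the patch): walk the graph backwards and return the first
// tensor tagged "result_norm", instead of relying on a fixed node index.
static struct ggml_tensor * find_result_norm(struct ggml_cgraph * gf) {
    for (int i = gf->n_nodes - 1; i >= 0; --i) {
        if (strcmp(gf->nodes[i]->name, "result_norm") == 0) {
            return gf->nodes[i];
        }
    }
    return NULL;
}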