@@ -194,6 +194,7 @@ enum llm_arch {
     LLM_ARCH_QWEN,
     LLM_ARCH_PHI2,
     LLM_ARCH_PLAMO,
+    LLM_ARCH_CODESHELL,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -213,6 +214,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN,            "qwen"      },
     { LLM_ARCH_PHI2,            "phi2"      },
     { LLM_ARCH_PLAMO,           "plamo"     },
+    { LLM_ARCH_CODESHELL,       "codeshell" },
 };
 
 enum llm_kv {
@@ -600,6 +602,26 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_CODESHELL,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
 
     {
         LLM_ARCH_UNKNOWN,
@@ -2877,6 +2899,14 @@ static void llm_load_hparams(
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
+        case LLM_ARCH_CODESHELL:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
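+                    // 42 layers corresponds to the published CodeShell-7B checkpoint
+                    // (assumption from the upstream model config, not from this diff)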
+                    case 42: model.type = e_model::MODEL_SMALL; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
 
         default: (void)0;
     }
@@ -3784,6 +3814,42 @@ static bool llm_load_tensors(
                         layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
                     }
                 } break;
+            case LLM_ARCH_CODESHELL:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
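+                    // note: a distinct output tensor is loaded here, i.e. the lm_head
+                    // is read separately rather than tied to the token embeddings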
+                    {
+                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
+                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias",   i), {n_embd});
+
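+                        // fused QKV projection: n_embd rows for Q plus 2*n_embd_gqa rows
+                        // for the grouped-query K and V heads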
+                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias",   i), {n_embd + 2*n_embd_gqa});
+
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias",   i), {n_embd});
+
+                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias",   i), {n_embd});
+
+                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd});
+
+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                        layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias",   i), {n_ff});
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -5965,6 +6031,117 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_codeshell() {
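+        // CodeShell graph: GPT-2-style blocks with a fused QKV projection,
+        // grouped-query attention and rotary position embeddings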
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
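+                // slice the fused QKV output by rows: [0, n_embd) is Q,
+                // [n_embd, n_embd + n_embd_gqa) is K, and the remainder is V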
+                struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(tmpq, "tmpq", il);
+                cb(tmpk, "tmpk", il);
+                cb(Vcur, "Vcur", il);
+
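+                // apply RoPE to Q and K (mode 2 here selects the NeoX-style rotation)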
+                struct ggml_tensor * Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
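+                // write this batch's K/V into the cache, then attend over the whole cache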
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            // add the input
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // FF
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
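+                // no gate tensors are passed, so this is a plain up -> GELU -> down MLP
+                // rather than a gated FFN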
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
+                        NULL,                      NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            inpL = ggml_add(ctx0, cur, ffn_inp);
+            cb(inpL, "l_out", il);
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph(
@@ -6159,6 +6336,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_gpt2();
             } break;
+        case LLM_ARCH_CODESHELL:
+            {
+                result = llm.build_codeshell();
+            } break;
         default:
            GGML_ASSERT(false);
    }