Support StableLM2 12B #6635
Changes from 17 commits
@@ -702,6 +702,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
             { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
         },
     },
     {
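These `blk.%d.*` entries are per-layer name templates: the loader substitutes the block index and appends a `weight`/`bias` suffix (via `tn(...)` in the loading code further down) to obtain the concrete GGUF tensor names. Below is a minimal sketch of that expansion using plain `snprintf`; the `tensor_name` helper is illustrative only, not llama.cpp's actual API.

```cpp
#include <cstdio>
#include <string>

// Illustrative helper (not llama.cpp's real API): expand a per-layer
// template such as "blk.%d.attn_q_norm" into a concrete GGUF tensor name.
static std::string tensor_name(const char * tmpl, int block, const char * suffix) {
    char buf[128];
    std::snprintf(buf, sizeof(buf), tmpl, block);   // e.g. "blk.0.attn_q_norm"
    return std::string(buf) + "." + suffix;         // e.g. "blk.0.attn_q_norm.weight"
}

int main() {
    // For StableLM 2 12B (40 blocks) the new entries yield names like
    // blk.0.attn_q_norm.weight ... blk.39.attn_k_norm.weight
    std::printf("%s\n", tensor_name("blk.%d.attn_q_norm", 0,  "weight").c_str());
    std::printf("%s\n", tensor_name("blk.%d.attn_k_norm", 39, "weight").c_str());
    return 0;
}
```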
@@ -1708,6 +1710,7 @@ enum e_model {
     MODEL_4B,
     MODEL_7B,
     MODEL_8B,
+    MODEL_12B,
     MODEL_13B,
     MODEL_14B,
     MODEL_15B,
@@ -3564,6 +3567,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_3B:  return "3B";
         case MODEL_7B:  return "7B";
         case MODEL_8B:  return "8B";
+        case MODEL_12B: return "12B";
         case MODEL_13B: return "13B";
         case MODEL_14B: return "14B";
         case MODEL_15B: return "15B";
@@ -3854,6 +3858,7 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
+                    case 40: model.type = e_model::MODEL_12B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
@@ -5048,8 +5053,13 @@ static bool llm_load_tensors(
                     layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
                     layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);

-                    layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
+                    // optional q and k layernorms, present in StableLM 2 12B
+                    layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head},    false);
+                    layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
+
+                    // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
+                    layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
+                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, false);

                     layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
                     layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
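The trailing `false` argument marks a tensor as optional, so the same StableLM loading path covers both checkpoints that ship q/k layernorms (the 12B) and those that do not; the graph builder later branches on whether the pointer is non-null. A rough sketch of that optional-tensor pattern, using assumed stand-in names (`Tensor`, `TensorMap`, `get_tensor` are illustrative, not the project's loader API):

```cpp
#include <map>
#include <stdexcept>
#include <string>

struct Tensor { /* data omitted */ };
using TensorMap = std::map<std::string, Tensor *>;

// Stand-in for a loader with a "required" flag:
// missing + required -> error, missing + optional -> nullptr.
static Tensor * get_tensor(const TensorMap & m, const std::string & name, bool required) {
    auto it = m.find(name);
    if (it != m.end()) return it->second;
    if (required) throw std::runtime_error("missing tensor: " + name);
    return nullptr;
}

int main() {
    TensorMap tensors;                 // pretend only some tensors exist in the file
    Tensor ffn_up;
    tensors["blk.0.ffn_up.weight"] = &ffn_up;

    Tensor * attn_q_norm = get_tensor(tensors, "blk.0.attn_q_norm.weight", /*required=*/false);
    if (attn_q_norm) { /* apply per-head layernorm to Q, as the graph code below does */ }
    return 0;
}
```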
@@ -8067,7 +8077,7 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

         for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;

             // norm
             cur = llm_build_norm(ctx0, inpL, hparams,
@@ -8076,6 +8086,8 @@ struct llm_build_context {
                     LLM_NORM, cb, il);
             cb(cur, "attn_norm", il);

+            struct ggml_tensor * inpSA = cur;
+
             // self-attention
             {
                 // compute Q and K and RoPE them
@@ -8100,15 +8112,36 @@ struct llm_build_context {
                     cb(Vcur, "Vcur", il);
                 }

+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                cb(Qcur, "Qcur", il);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                cb(Kcur, "Kcur", il);
+
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                            model.layers[il].attn_q_norm,
+                            NULL,
+                            LLM_NORM, cb, il);
+                    cb(Qcur, "Qcur", il);
+                }
+                if (model.layers[il].attn_k_norm) {
+                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                            model.layers[il].attn_k_norm,
+                            NULL,
+                            LLM_NORM, cb, il);
+                    cb(Kcur, "Kcur", il);
+                }
+
                 Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    ctx0, Qcur, inp_pos,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    ctx0, Kcur, inp_pos,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
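With the norm weights stored as `{n_embd_head_k, n_head}`, the `llm_build_norm` calls above normalize each head's channels of the reshaped `[n_embd_head, n_head, n_tokens]` Q/K tensors before RoPE is applied. A standalone sketch of that per-head LayerNorm in plain C++ (no ggml); the flat row-major layout and the `eps` value are assumptions for illustration:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Per-head LayerNorm: for each (token, head), normalize the head_dim channels
// and scale by that head's slice of the norm weight [n_head x head_dim].
static void per_head_layernorm(std::vector<float> & q,          // [n_tokens][n_head][head_dim], flattened
                               const std::vector<float> & w,    // [n_head][head_dim], flattened
                               int n_tokens, int n_head, int head_dim,
                               float eps = 1e-5f) {
    for (int t = 0; t < n_tokens; ++t) {
        for (int h = 0; h < n_head; ++h) {
            float * x = &q[(size_t)(t * n_head + h) * head_dim];
            float mean = 0.0f;
            for (int i = 0; i < head_dim; ++i) mean += x[i];
            mean /= head_dim;
            float var = 0.0f;
            for (int i = 0; i < head_dim; ++i) var += (x[i] - mean) * (x[i] - mean);
            var /= head_dim;
            const float inv = 1.0f / std::sqrt(var + eps);
            for (int i = 0; i < head_dim; ++i) {
                x[i] = (x[i] - mean) * inv * w[(size_t)h * head_dim + i];
            }
        }
    }
}

int main() {
    const int n_tokens = 2, n_head = 3, head_dim = 4;
    std::vector<float> q(n_tokens * n_head * head_dim, 1.0f);
    q[0] = 5.0f;                                   // make the first head non-trivial
    std::vector<float> w(n_head * head_dim, 1.0f); // unit norm weights
    per_head_layernorm(q, w, n_tokens, n_head, head_dim);
    std::printf("head 0 after norm: %f %f %f %f\n", q[0], q[1], q[2], q[3]);
    return 0;
}
```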
@@ -8123,30 +8156,44 @@ struct llm_build_context {
                 // skip computing output for unused tokens
                 struct ggml_tensor * inp_out_ids = build_inp_out_ids();
                 cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpL  = ggml_get_rows(ctx0,  inpL, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-            cb(ffn_inp, "ffn_inp", il);
+            struct ggml_tensor * attn_out = cur;
+
+            // only used for non-parallel residual
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, attn_out, inpL);
+            cb(ffn_inp, "ffn_inp", il);

             // feed-forward network
             {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm,
-                        model.layers[il].ffn_norm_b,
-                        LLM_NORM, cb, il);
-                cb(cur, "ffn_norm", il);
+                if (model.layers[il].ffn_norm) {
+                    cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                            model.layers[il].ffn_norm,
+                            model.layers[il].ffn_norm_b,
+                            LLM_NORM, cb, il);
+                    cb(cur, "ffn_norm", il);
+                } else {
+                    // parallel residual
+                    cur = inpSA;
+                }

                 cur = llm_build_ffn(ctx0, cur,
                         model.layers[il].ffn_up,   NULL,
                         model.layers[il].ffn_gate, NULL,
                         model.layers[il].ffn_down, NULL,
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
             }

-            cur = ggml_add(ctx0, cur, ffn_inp);
+            if (model.layers[il].ffn_norm) {
+                // non-parallel residual
+                cur = ggml_add(ctx0, cur, ffn_inp);
+            } else {
+                // add together residual + FFN + self-attention
+                cur = ggml_add(ctx0, cur, inpL);
+                cur = ggml_add(ctx0, cur, attn_out);
+            }

             cb(cur, "l_out", il);

             // input for next layer
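The branch on `ffn_norm` switches between the usual sequential residual and the parallel residual used by StableLM 2 12B: sequentially, `h = x + Attn(norm_1(x))` and `out = h + FFN(norm_2(h))`; in the parallel form, attention and FFN share one normalized input and `out = x + Attn(norm(x)) + FFN(norm(x))`. A small sketch contrasting the two wirings; the `block_*` functions and the stubbed lambdas are illustrative only:

```cpp
#include <cstdio>
#include <functional>
#include <vector>

using Vec = std::vector<float>;
using Fn  = std::function<Vec(const Vec &)>;

static Vec add(const Vec & a, const Vec & b) {
    Vec r(a.size());
    for (size_t i = 0; i < a.size(); ++i) r[i] = a[i] + b[i];
    return r;
}

// Sequential (non-parallel) residual: the FFN sees the attention residual stream.
static Vec block_sequential(const Vec & x, Fn norm1, Fn attn, Fn norm2, Fn ffn) {
    Vec h = add(x, attn(norm1(x)));      // h   = x + Attn(norm1(x))
    return add(h, ffn(norm2(h)));        // out = h + FFN(norm2(h))
}

// Parallel residual (StableLM 2 12B): one shared norm, attention and FFN
// both read the same normalized input and are summed with the layer input.
static Vec block_parallel(const Vec & x, Fn norm, Fn attn, Fn ffn) {
    Vec n = norm(x);
    return add(add(x, attn(n)), ffn(n)); // out = x + Attn(norm(x)) + FFN(norm(x))
}

int main() {
    Fn identity = [](const Vec & v) { return v; };
    Vec x = {1.0f, 2.0f};
    Vec seq = block_sequential(x, identity, identity, identity, identity); // {4, 8}
    Vec par = block_parallel(x, identity, identity, identity);             // {3, 6}
    std::printf("seq: %f %f  par: %f %f\n", seq[0], seq[1], par[0], par[1]);
    return 0;
}
```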