
Commit 0580f87

llama : add phixtral support (wip)
1 parent 15ebe59 commit 0580f87

File tree

4 files changed: +107 -11 lines changed

    convert-hf-to-gguf.py
    gguf-py/gguf/constants.py
    gguf-py/gguf/tensor_mapping.py
    llama.cpp

convert-hf-to-gguf.py

Lines changed: 11 additions & 2 deletions
@@ -1080,10 +1080,15 @@ class Phi2Model(Model):
     def set_gguf_parameters(self):
         block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"])
 
-        rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"])
         n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"])
         n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"])
 
+        if "partial_rotary_factor" in self.hparams:
+            rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"])
+            n_rot = int(rot_pct * n_embd) // n_head
+        else:
+            n_rot = get_key_opts(self.hparams, ["rotary_dim", "n_rot"])
+
         self.gguf_writer.add_name("Phi2")
         self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"]))
 
@@ -1093,10 +1098,14 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_head_count(n_head)
         self.gguf_writer.add_head_count_kv(n_head)
         self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"]))
-        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+        self.gguf_writer.add_rope_dimension_count(n_rot)
         self.gguf_writer.add_file_type(self.ftype)
         self.gguf_writer.add_add_bos_token(False)
 
+        # phixtral
+        self.gguf_writer.add_expert_count(self.hparams.get("num_local_experts", 0))
+        self.gguf_writer.add_expert_used_count(self.hparams.get("num_experts_per_tok", 0))
+
 
 class PlamoModel(Model):
     def set_vocab(self):
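
Note (not part of the diff): with illustrative Phi-2-style values of partial_rotary_factor = 0.4, hidden_size = 2560 and num_attention_heads = 32, the new first branch computes int(0.4 * 2560) // 32 = 1024 // 32 = 32 rope dimensions, the same value the removed expression produced; a phixtral-style config that omits partial_rotary_factor and instead ships rotary_dim (or n_rot) now takes the fallback branch and writes that value directly.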

gguf-py/gguf/constants.py

Lines changed: 3 additions & 0 deletions
@@ -393,9 +393,12 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.ATTN_K,
         MODEL_TENSOR.ATTN_V,
         MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE_INP,
         MODEL_TENSOR.FFN_NORM,
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
     ],
     # TODO
 }

gguf-py/gguf/tensor_mapping.py

Lines changed: 3 additions & 0 deletions
@@ -173,6 +173,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_GATE_INP: (
             "layers.{bid}.feed_forward.gate",             # mixtral
             "model.layers.{bid}.block_sparse_moe.gate",   # mixtral
+            "transformer.h.{bid}.moe.gate",               # phixtral
         ),
 
         # Feed-forward up
@@ -198,6 +199,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_UP_EXP: (
             "layers.{bid}.feed_forward.experts.{xid}.w3",            # mixtral
             "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3",  # mixtral
+            "transformer.h.{bid}.moe.mlp.{xid}.fc1",                 # phixtral
         ),
 
         # AWQ-activation gate
@@ -240,6 +242,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_DOWN_EXP: (
             "layers.{bid}.feed_forward.experts.{xid}.w2",            # mixtral
             "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2",  # mixtral
+            "transformer.h.{bid}.moe.mlp.{xid}.fc2",                 # phixtral
         ),
 
         MODEL_TENSOR.ATTN_Q_NORM: (
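
Taken together with the "blk.%d.ffn_down.%d" / "blk.%d.ffn_up.%d" names added to LLM_TENSOR_NAMES in llama.cpp below, these mappings should route a phixtral checkpoint tensor such as transformer.h.0.moe.mlp.1.fc1 to blk.0.ffn_up.1 in the converted GGUF, and transformer.h.0.moe.gate to blk.0.ffn_gate_inp (the {bid}/{xid} placeholders are the block and expert indices).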

llama.cpp

Lines changed: 90 additions & 9 deletions
@@ -578,8 +578,11 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
             { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
             { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
             { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN_EXP,    "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
         },
     },
     {
@@ -1425,16 +1428,20 @@ struct llama_layer {
     struct ggml_tensor * ffn_down; // w2
     struct ggml_tensor * ffn_up;   // w3
 
+    // ff bias
+    struct ggml_tensor * ffn_down_b; // b2
+    struct ggml_tensor * ffn_up_b;   // b3
+    struct ggml_tensor * ffn_act;
+
     // ff MoE
     struct ggml_tensor * ffn_gate_inp;
     struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
     struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
     struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
 
-    // ff bias
-    struct ggml_tensor * ffn_down_b; // b2
-    struct ggml_tensor * ffn_up_b;   // b3
-    struct ggml_tensor * ffn_act;
+    // ff MoE bias
+    struct ggml_tensor * ffn_down_b_exp[LLAMA_MAX_EXPERTS];
+    struct ggml_tensor * ffn_up_b_exp [LLAMA_MAX_EXPERTS];
 };
 
 struct llama_kv_cell {
@@ -3696,11 +3703,29 @@ static bool llm_load_tensors(
                         layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
                         layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
 
-                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
+                        layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
 
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
-                        layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff});
+                        if (layer.ffn_gate_inp == nullptr) {
+                            GGML_ASSERT(hparams.n_expert      == 0);
+                            GGML_ASSERT(hparams.n_expert_used == 0);
+
+                            layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                            layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
+
+                            layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                            layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff});
+                        } else {
+                            GGML_ASSERT(hparams.n_expert      > 0);
+                            GGML_ASSERT(hparams.n_expert_used > 0);
+
+                            for (uint32_t x = 0; x < hparams.n_expert; ++x) {
+                                layer.ffn_down_exp[x]   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {n_ff, n_embd});
+                                layer.ffn_down_b_exp[x] = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN_EXP, "bias", i, x),   {n_embd});
+
+                                layer.ffn_up_exp[x]   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
+                                layer.ffn_up_b_exp[x] = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP_EXP, "bias", i, x),   {n_ff});
+                            }
+                        }
                     }
                 } break;
             case LLM_ARCH_PLAMO:
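
Note: the trailing false argument passed to ml.create_tensor for ffn_gate_inp appears to mark the tensor as optional, so a plain Phi-2 checkpoint (no MoE gate) keeps the dense FFN path, while a phixtral checkpoint with n_expert > 0 loads the per-expert weights and the new per-expert biases instead.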
@@ -5704,14 +5729,70 @@ struct llm_build_context {
             }
 
             // FF
-            {
+            if (model.layers[il].ffn_gate_inp == nullptr) {
                 ffn_output = llm_build_ffn(ctx0, attn_norm_output,
                         model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
                         NULL,                      NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(ffn_output, "ffn_out", il);
+            } else {
+                // MoE branch
+                ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
+                cb(logits, "ffn_moe_logits", il);
+
+                ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
+                cb(probs, "ffn_moe_probs", il);
+
+                // select experts
+                ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
+                cb(selected_experts->src[0], "ffn_moe_argsort", il);
+
+                ggml_tensor * weights = ggml_get_rows(ctx0,
+                        ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
+                cb(weights, "ffn_moe_weights", il);
+
+                weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
+
+                ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
+                cb(weights_sum, "ffn_moe_weights_sum", il);
+
+                weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
+                cb(weights, "ffn_moe_weights_norm", il);
+
+                // compute expert outputs
+                ggml_tensor * moe_out = nullptr;
+
+                for (int i = 0; i < n_expert_used; ++i) {
+                    ggml_tensor * cur_expert;
+
+                    ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
+#pragma message "TODO: implement ggml_add_id"
+                    //cur_up = ggml_add_id(ctx0, cur_up, model.layers[il].ffn_up_exp_b, n_expert, selected_experts, i);
+                    cb(cur_up, "ffn_moe_up", il);
+
+                    cur_up = ggml_gelu(ctx0, cur_up);
+                    cb(cur_up, "ffn_moe_gelu", il);
+
+                    cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_up); // [n_tokens, n_embd]
+#pragma message "TODO: implement ggml_add_id"
+                    //cur_expert = ggml_add_id(ctx0, cur_expert, model.layers[il].ffn_down_exp_b, n_expert, selected_experts, i);
+                    cb(cur_expert, "ffn_moe_down", il);
+
+                    cur_expert = ggml_mul(ctx0, cur_expert,
+                            ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
+                    cb(cur_expert, "ffn_moe_weighted", il);
+
+                    if (i == 0) {
+                        moe_out = cur_expert;
+                    } else {
+                        moe_out = ggml_add(ctx0, moe_out, cur_expert);
+                        cb(moe_out, "ffn_moe_out", il);
+                    }
+                }
+
+                ffn_output = moe_out;
             }
 
             cur = ggml_add(ctx0, cur, ffn_output);
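
For reference, the following is a minimal standalone sketch, not code from this commit, of the per-token routing math the ggml graph above expresses: softmax over the gate logits, keep the top n_expert_used experts, renormalize their weights, and sum the weighted expert outputs. The function name moe_forward and the expert_ffn callback are illustrative stand-ins.

// Standalone illustration only -- not part of the commit.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

using Vec = std::vector<float>;

// expert_ffn is a hypothetical callback standing in for one expert's
// up-projection -> GELU -> down-projection (biases omitted, as in the WIP graph).
Vec moe_forward(const Vec & x,
                const Vec & gate_logits,   // one logit per expert
                int n_expert_used,
                const std::function<Vec(int, const Vec &)> & expert_ffn) {
    const int n_expert = (int) gate_logits.size();

    // softmax over the gate logits -> routing probabilities ("ffn_moe_probs")
    Vec probs(n_expert);
    const float max_logit = *std::max_element(gate_logits.begin(), gate_logits.end());
    float sum = 0.0f;
    for (int e = 0; e < n_expert; ++e) {
        probs[e] = std::exp(gate_logits[e] - max_logit);
        sum += probs[e];
    }
    for (float & p : probs) { p /= sum; }

    // pick the n_expert_used most probable experts (the ggml_top_k step)
    std::vector<int> order(n_expert);
    std::iota(order.begin(), order.end(), 0);
    std::partial_sort(order.begin(), order.begin() + n_expert_used, order.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; });

    // renormalize the selected weights so they sum to 1 ("ffn_moe_weights_norm")
    float sel_sum = 0.0f;
    for (int k = 0; k < n_expert_used; ++k) { sel_sum += probs[order[k]]; }

    // weighted sum of the selected experts' outputs ("ffn_moe_weighted" / "ffn_moe_out")
    Vec out(x.size(), 0.0f);
    for (int k = 0; k < n_expert_used; ++k) {
        const int e = order[k];
        const Vec y = expert_ffn(e, x);
        const float w = probs[e] / sel_sum;
        for (size_t i = 0; i < out.size(); ++i) {
            out[i] += w * y[i];
        }
    }
    return out;
}

As in the WIP graph, the per-expert fc1/fc2 biases are left out of the sketch; the commit keeps those additions commented out pending a ggml_add_id operator.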
