Skip to content

Commit 12cc80c

Browse files
committed
phi2 implementation
1 parent 6744dbe commit 12cc80c

File tree

4 files changed

+226
-1
lines changed

4 files changed

+226
-1
lines changed

convert-hf-to-gguf.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,8 @@ def from_model_architecture(model_architecture):
182182
return QwenModel
183183
if model_architecture == "MixtralForCausalLM":
184184
return MixtralModel
185+
if model_architecture == "PhiForCausalLM":
186+
return Phi2Model
185187
return Model
186188

187189
def _is_model_safetensors(self) -> bool:
@@ -221,6 +223,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH:
221223
return gguf.MODEL_ARCH.QWEN
222224
if arch == "MixtralForCausalLM":
223225
return gguf.MODEL_ARCH.LLAMA
226+
if arch == "PhiForCausalLM":
227+
return gguf.MODEL_ARCH.PHI2
224228

225229
raise NotImplementedError(f'Architecture "{arch}" not supported!')
226230

@@ -980,6 +984,21 @@ def write_tensors(self):
980984
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
981985
self.gguf_writer.add_tensor(new_name, data)
982986

987+
class Phi2Model(Model):
    """GGUF conversion support for Microsoft's Phi-2 (``PhiForCausalLM``) checkpoints."""

    def set_gguf_parameters(self):
        """Translate the HF Phi-2 hyperparameters into GGUF metadata keys.

        Note: the call order below is preserved deliberately — it determines
        the order in which KV pairs are emitted into the GGUF file.
        """
        hparams = self.hparams
        writer = self.gguf_writer

        writer.add_name("Phi2")
        writer.add_context_length(hparams["n_positions"])
        writer.add_embedding_length(hparams["n_embd"])
        # Phi-2's MLP hidden size is fixed at 4x the embedding width.
        writer.add_feed_forward_length(4 * hparams["n_embd"])
        writer.add_block_count(hparams["n_layer"])
        # Plain multi-head attention (no GQA): KV head count equals head count.
        writer.add_head_count(hparams["n_head"])
        writer.add_head_count_kv(hparams["n_head"])
        writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
        # Partial rotary embeddings: only `rotary_dim` dims per head are rotated.
        writer.add_rope_dimension_count(hparams["rotary_dim"])
        writer.add_file_type(self.ftype)
1001+
9831002
###### CONVERSION LOGIC ######
9841003

9851004

gguf-py/gguf/constants.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ class MODEL_ARCH(IntEnum):
9595
BLOOM = auto()
9696
STABLELM = auto()
9797
QWEN = auto()
98+
PHI2 = auto()
9899

99100

100101
class MODEL_TENSOR(IntEnum):
@@ -140,6 +141,7 @@ class MODEL_TENSOR(IntEnum):
140141
MODEL_ARCH.BLOOM: "bloom",
141142
MODEL_ARCH.STABLELM: "stablelm",
142143
MODEL_ARCH.QWEN: "qwen",
144+
MODEL_ARCH.PHI2: "phi2",
143145
}
144146

145147
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -350,6 +352,17 @@ class MODEL_TENSOR(IntEnum):
350352
MODEL_ARCH.GPT2: [
351353
# TODO
352354
],
355+
MODEL_ARCH.PHI2: [
356+
MODEL_TENSOR.TOKEN_EMBD,
357+
MODEL_TENSOR.OUTPUT_NORM,
358+
MODEL_TENSOR.OUTPUT,
359+
MODEL_TENSOR.ATTN_NORM,
360+
MODEL_TENSOR.ATTN_QKV,
361+
MODEL_TENSOR.ATTN_OUT,
362+
MODEL_TENSOR.FFN_NORM,
363+
MODEL_TENSOR.FFN_DOWN,
364+
MODEL_TENSOR.FFN_UP,
365+
]
353366
# TODO
354367
}
355368

gguf-py/gguf/tensor_mapping.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ class TensorNameMap:
1717
"tok_embeddings", # llama-pth
1818
"embeddings.word_embeddings", # bert
1919
"language_model.embedding.word_embeddings", # persimmon
20+
"transformer.embd.wte", # phi2
2021
),
2122

2223
# Token type embeddings
@@ -41,6 +42,7 @@ class TensorNameMap:
4142
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen
4243
"output", # llama-pth bloom
4344
"word_embeddings_for_head", # persimmon
45+
"lm_head.linear", # phi2
4446
),
4547

4648
# Output norm
@@ -53,6 +55,7 @@ class TensorNameMap:
5355
"transformer.norm_f", # mpt
5456
"ln_f", # refact bloom qwen
5557
"language_model.encoder.final_layernorm", # persimmon
58+
"lm_head.ln", # phi2
5659
),
5760

5861
# Rope frequencies
@@ -75,6 +78,7 @@ class TensorNameMap:
7578
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
7679
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
7780
"model.layers.{bid}.ln1", # yi
81+
"transformer.h.{bid}.ln", # phi2
7882
),
7983

8084
# Attention norm 2
@@ -90,6 +94,7 @@ class TensorNameMap:
9094
"transformer.h.{bid}.self_attention.query_key_value", # falcon
9195
"h.{bid}.self_attention.query_key_value", # bloom
9296
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
97+
"transformer.h.{bid}.mixer.Wqkv", # phi2
9398
),
9499

95100
# Attention query
@@ -128,6 +133,7 @@ class TensorNameMap:
128133
"encoder.layer.{bid}.attention.output.dense", # bert
129134
"transformer.h.{bid}.attn.out_proj", # gpt-j
130135
"language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
136+
"transformer.h.{bid}.mixer.out_proj", # phi2
131137
),
132138

133139
# Rotary embeddings
@@ -167,6 +173,7 @@ class TensorNameMap:
167173
"transformer.h.{bid}.mlp.fc_in", # gpt-j
168174
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
169175
"transformer.h.{bid}.mlp.w1", # qwen
176+
"transformer.h.{bid}.mlp.fc1", # phi2
170177
),
171178

172179
MODEL_TENSOR.FFN_UP_EXP: (
@@ -198,6 +205,7 @@ class TensorNameMap:
198205
"encoder.layer.{bid}.output.dense", # bert
199206
"transformer.h.{bid}.mlp.fc_out", # gpt-j
200207
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
208+
"transformer.h.{bid}.mlp.fc2", # phi2
201209
),
202210

203211
MODEL_TENSOR.FFN_DOWN_EXP: (

llama.cpp

Lines changed: 186 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ enum llm_arch {
195195
LLM_ARCH_BLOOM,
196196
LLM_ARCH_STABLELM,
197197
LLM_ARCH_QWEN,
198+
LLM_ARCH_PHI2,
198199
LLM_ARCH_UNKNOWN,
199200
};
200201

@@ -212,6 +213,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
212213
{ LLM_ARCH_BLOOM, "bloom" },
213214
{ LLM_ARCH_STABLELM, "stablelm" },
214215
{ LLM_ARCH_QWEN, "qwen" },
216+
{ LLM_ARCH_PHI2, "phi2" },
215217
};
216218

217219
enum llm_kv {
@@ -550,6 +552,19 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
550552
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
551553
},
552554
},
555+
{
556+
LLM_ARCH_PHI2,
557+
{
558+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
559+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
560+
{ LLM_TENSOR_OUTPUT, "output" },
561+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
562+
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
563+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
564+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
565+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
566+
},
567+
},
553568

554569
{
555570
LLM_ARCH_UNKNOWN,
@@ -1420,6 +1435,7 @@ struct llama_model {
14201435
struct ggml_tensor * output_norm;
14211436
struct ggml_tensor * output_norm_b;
14221437
struct ggml_tensor * output;
1438+
struct ggml_tensor * output_b;
14231439

14241440
std::vector<llama_layer> layers;
14251441

@@ -3625,7 +3641,77 @@ static void llm_load_tensors(
36253641
}
36263642
}
36273643
} break;
3644+
case LLM_ARCH_PHI2:
3645+
{
3646+
// TODO: CPU-only for now
3647+
3648+
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3649+
3650+
// output
3651+
{
3652+
ggml_backend_type backend_norm;
3653+
ggml_backend_type backend_output;
36283654

3655+
if (n_gpu_layers > int(n_layer)) {
3656+
backend_norm = llama_backend_offload;
3657+
backend_output = llama_backend_offload_split;
3658+
} else {
3659+
backend_norm = GGML_BACKEND_CPU;
3660+
backend_output = GGML_BACKEND_CPU;
3661+
}
3662+
3663+
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3664+
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3665+
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3666+
model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
3667+
3668+
if (backend_norm == GGML_BACKEND_GPU) {
3669+
vram_weights += ggml_nbytes(model.output_norm);
3670+
vram_weights += ggml_nbytes(model.output_norm_b);
3671+
}
3672+
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
3673+
vram_weights += ggml_nbytes(model.output);
3674+
vram_weights += ggml_nbytes(model.output_b);
3675+
}
3676+
}
3677+
3678+
const uint32_t n_ff = hparams.n_ff;
3679+
3680+
const int i_gpu_start = n_layer - n_gpu_layers;
3681+
3682+
model.layers.resize(n_layer);
3683+
3684+
for (uint32_t i = 0; i < n_layer; ++i) {
3685+
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3686+
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3687+
3688+
auto & layer = model.layers[i];
3689+
3690+
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3691+
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
3692+
3693+
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
3694+
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
3695+
3696+
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3697+
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
3698+
3699+
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
3700+
layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
3701+
3702+
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3703+
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
3704+
3705+
if (backend == GGML_BACKEND_GPU) {
3706+
vram_weights +=
3707+
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
3708+
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
3709+
ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
3710+
ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) +
3711+
ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b);
3712+
}
3713+
}
3714+
} break;
36293715
default:
36303716
throw std::runtime_error("unknown architecture");
36313717
}
@@ -5417,6 +5503,101 @@ struct llm_build_context {
54175503

54185504
ggml_build_forward_expand(gf, cur);
54195505

5506+
return gf;
5507+
}
5508+
struct ggml_cgraph * build_phi2() {
5509+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5510+
5511+
struct ggml_tensor * cur;
5512+
struct ggml_tensor * attn_norm_output;
5513+
struct ggml_tensor * ffn_output;
5514+
struct ggml_tensor * inpL;
5515+
5516+
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5517+
cb(inpL, "inp_embd", -1);
5518+
5519+
// inp_pos - contains the positions
5520+
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
5521+
cb(inp_pos, "inp_pos", -1);
5522+
5523+
// KQ_scale
5524+
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
5525+
cb(KQ_scale, "KQ_scale", -1);
5526+
5527+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
5528+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
5529+
cb(KQ_mask, "KQ_mask", -1);
5530+
5531+
for (int il = 0; il < n_layer; ++il) {
5532+
5533+
attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
5534+
model.layers[il].attn_norm,
5535+
model.layers[il].attn_norm_b,
5536+
LLM_NORM, cb, il);
5537+
cb(attn_norm_output, "attn_norm", il);
5538+
5539+
// self-attention
5540+
{
5541+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
5542+
cb(cur, "wqkv", il);
5543+
5544+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
5545+
cb(cur, "bqkv", il);
5546+
5547+
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5548+
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5549+
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5550+
5551+
cb(Qcur, "Qcur", il);
5552+
cb(Kcur, "Kcur", il);
5553+
cb(Vcur, "Vcur", il);
5554+
5555+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5556+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5557+
cb(Qcur, "Qcur", il);
5558+
cb(Kcur, "Kcur", il);
5559+
// RoPE
5560+
Qcur = ggml_rope(ctx0, Qcur, inp_pos, 32, 2, 0);
5561+
Kcur = ggml_rope(ctx0, Kcur, inp_pos, 32, 2, 0);
5562+
cb(Qcur, "Qcur", il);
5563+
cb(Kcur, "Kcur", il);
5564+
5565+
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
5566+
5567+
cur = llm_build_kqv(ctx0, hparams, kv_self,
5568+
model.layers[il].wo, model.layers[il].bo,
5569+
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
5570+
cb(cur, "kqv_out", il);
5571+
}
5572+
5573+
// FF
5574+
{
5575+
ffn_output = llm_build_ffn(ctx0, attn_norm_output,
5576+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
5577+
NULL, NULL,
5578+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
5579+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
5580+
cb(ffn_output, "ffn_out", il);
5581+
}
5582+
5583+
inpL = ggml_add(ctx0, cur, ggml_add_inplace(ctx0, ffn_output, inpL));
5584+
cb(inpL, "l_out", il);
5585+
}
5586+
5587+
cur = llm_build_norm(ctx0, inpL, hparams,
5588+
model.output_norm,
5589+
model.output_norm_b,
5590+
LLM_NORM, cb, -1);
5591+
cb(cur, "result_norm", -1);
5592+
5593+
cur = ggml_mul_mat(ctx0, model.output, cur);
5594+
cb(cur, "result_output", -1);
5595+
5596+
cur = ggml_add(ctx0, cur, model.output_b);
5597+
cb(cur, "result_output", -1);
5598+
5599+
ggml_build_forward_expand(gf, cur);
5600+
54205601
return gf;
54215602
}
54225603
};
@@ -5917,6 +6098,10 @@ static struct ggml_cgraph * llama_build_graph(
59176098
{
59186099
result = llm.build_qwen();
59196100
} break;
6101+
case LLM_ARCH_PHI2:
6102+
{
6103+
result = llm.build_phi2();
6104+
} break;
59206105
default:
59216106
GGML_ASSERT(false);
59226107
}
@@ -6051,7 +6236,7 @@ static int llama_decode_internal(
60516236
ggml_allocr_alloc_graph(lctx.alloc, gf);
60526237

60536238
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
6054-
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
6239+
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 3];
60556240

60566241
GGML_ASSERT(strcmp(res->name, "result_output") == 0);
60576242
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);

0 commit comments

Comments
 (0)