Commit 1fb9186

llama: define architecture for small granite models
It works only for the small models, 3B and 8B. There are enough differences from the base llama arch that it is worth defining a new architecture. To create the .gguf files, it is necessary to specify GraniteSmallForCausalLM in the architectures field of the HF model's config.json.

Signed-off-by: Giuseppe Scrivano <[email protected]>
1 parent d52b4d8 commit 1fb9186
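
As a practical note on the conversion prerequisite described in the commit message, here is a minimal sketch of pointing a Hugging Face checkout at the new class before running the converter. The directory name "granite-3b" is a placeholder, not part of this commit.

import json
from pathlib import Path

# Hypothetical model directory; the commit only defines the class name to use.
config_path = Path("granite-3b") / "config.json"
config = json.loads(config_path.read_text())
config["architectures"] = ["GraniteSmallForCausalLM"]  # name registered by this commit
config_path.write_text(json.dumps(config, indent=2))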

File tree

3 files changed: +98 -0 lines changed


convert-hf-to-gguf.py

Lines changed: 28 additions & 0 deletions
@@ -2465,6 +2465,34 @@ def set_vocab(self, *args, **kwargs):
         self.gguf_writer.add_add_bos_token(True)
         self.gguf_writer.add_add_eos_token(True)
 
+@Model.register("GraniteSmallForCausalLM")
+class GraniteModel(Model):
+    model_arch = gguf.MODEL_ARCH.GRANITE_SMALL
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def set_vocab(self):
+        tokens, toktypes, _ = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre("starcoder")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_name("GraniteSmall")
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        return [(self.map_tensor_name(name), data_torch)]
+
 
 ###### CONVERSION LOGIC ######

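For context on the @Model.register decorator used above: it associates the HF architecture string with the converter subclass, so the script can pick GraniteModel when config.json names GraniteSmallForCausalLM. Below is a minimal self-contained sketch of that pattern, an assumption about the mechanism rather than the actual convert-hf-to-gguf.py code.

# Sketch of a class-registration decorator in the style of @Model.register.
_model_classes: dict[str, type] = {}

def register(*names: str):
    def wrapper(cls: type) -> type:
        for name in names:
            _model_classes[name] = cls  # e.g. "GraniteSmallForCausalLM" -> GraniteModel
        return cls
    return wrapper

@register("GraniteSmallForCausalLM")
class GraniteModel:  # stand-in for the subclass in the diff above
    pass

# The converter can then resolve the right class from config.json's "architectures":
assert _model_classes["GraniteSmallForCausalLM"] is GraniteModel
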
gguf-py/gguf/constants.py

Lines changed: 26 additions & 0 deletions
@@ -139,6 +139,7 @@ class MODEL_ARCH(IntEnum):
     COMMAND_R = auto()
     DBRX = auto()
     OLMO = auto()
+    GRANITE_SMALL = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -218,6 +219,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.COMMAND_R: "command-r",
     MODEL_ARCH.DBRX: "dbrx",
     MODEL_ARCH.OLMO: "olmo",
+    MODEL_ARCH.GRANITE_SMALL: "granite-small",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -732,6 +734,26 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.GRANITE_SMALL: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
     # TODO
 }
 
@@ -765,6 +787,10 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
+    MODEL_ARCH.GRANITE_SMALL: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
 }
 
 #

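The MODEL_TENSOR list registered above for GRANITE_SMALL combines with the TENSOR_NAMES table to produce the serialized tensor names. Here is a simplified sketch of that lookup; keys are abbreviated to strings, while the real gguf-py table keys on MODEL_TENSOR enum members.

# Simplified view of gguf-py's name formatting: block-scoped tensors carry a
# {bid} placeholder, global tensors do not.
TENSOR_NAMES = {
    "TOKEN_EMBD": "token_embd",
    "ATTN_Q": "blk.{bid}.attn_q",
    "FFN_GATE": "blk.{bid}.ffn_gate",
}

def tensor_name(kind: str, bid: int | None = None) -> str:
    fmt = TENSOR_NAMES[kind]
    return fmt.format(bid=bid) if bid is not None else fmt

print(tensor_name("TOKEN_EMBD"))     # token_embd
print(tensor_name("ATTN_Q", bid=3))  # blk.3.attn_q
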
llama.cpp

Lines changed: 44 additions & 0 deletions
@@ -221,6 +221,7 @@ enum llm_arch {
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_DBRX,
     LLM_ARCH_OLMO,
+    LLM_ARCH_GRANITE_SMALL,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -257,6 +258,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_COMMAND_R,      "command-r"     },
     { LLM_ARCH_DBRX,           "dbrx"          },
     { LLM_ARCH_OLMO,           "olmo"          },
+    { LLM_ARCH_GRANITE_SMALL,  "granite-small" },
     { LLM_ARCH_UNKNOWN,        "(unknown)"     },
 };
 
@@ -1032,6 +1034,32 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GRANITE_SMALL,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,    "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,   "output_norm" },
+            { LLM_TENSOR_OUTPUT,        "output" },
+            { LLM_TENSOR_ROPE_FREQS,    "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP,  "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,      "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP,  "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP,  "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP,    "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,   "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -4344,6 +4372,16 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GRANITE_SMALL:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    case 36: model.type = e_model::MODEL_8B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -4453,6 +4491,9 @@ static void llm_load_vocab(
         } else {
             if (tokenizer_model == "gpt2") {
                 vocab.type = LLAMA_VOCAB_TYPE_BPE;
+                if (model.arch == LLM_ARCH_GRANITE_SMALL) {
+                    vocab.add_space_prefix = false;
+                }
             } else {
                 LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
                 LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
@@ -5023,6 +5064,7 @@ static bool llm_load_tensors(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE_SMALL:
            {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
 
@@ -10893,6 +10935,7 @@ static struct ggml_cgraph * llama_build_graph(
 
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE_SMALL:
             {
                 result = llm.build_llama();
             } break;
@@ -16038,6 +16081,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_GPTNEOX:
+        case LLM_ARCH_GRANITE_SMALL:
             return LLAMA_ROPE_TYPE_NEOX;
 
         // all model arches should be listed explicitly here

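One detail worth noting from the llm_load_hparams hunk above: model size is inferred purely from depth, matching the commit message's claim that only the small variants are covered. A one-line restatement of that switch, for reference only:

# Layer count observed in the checkpoint -> model type label (from the diff above).
GRANITE_SMALL_SIZES = {32: "3B", 36: "8B"}  # any other depth -> MODEL_UNKNOWN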