gguf : add rope_freq_base parameter for CodeLlama #2769


Merged · 1 commit · Aug 24, 2023
convert.py (25 additions, 18 deletions)

@@ -104,6 +104,8 @@ class Params:
     n_head_kv:  int
     f_norm_eps: float

+    f_rope_freq_base: Optional[float] = None
+
     ftype: Optional[GGMLFileType] = None

     # path to the directory containing the model files
@@ -194,15 +196,16 @@ def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
     def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
         config = json.load(open(config_path))

-        n_vocab    = config["vocab_size"] if "vocab_size" in config else -1
-        n_embd     = config["dim"]
-        n_layer    = config["n_layers"]
-        n_mult     = config["multiple_of"]
-        n_ctx      = 2048 if config["norm_eps"] == 1e-06 else 4096 # hack to determine LLaMA v1 vs v2
-        n_ff       = -1
-        n_head     = config["n_heads"]
-        n_head_kv  = config["n_kv_heads"] if "n_kv_heads" in config else n_head
-        f_norm_eps = config["norm_eps"]
+        n_vocab          = config["vocab_size"] if "vocab_size" in config else -1
+        n_embd           = config["dim"]
+        n_layer          = config["n_layers"]
+        n_mult           = config["multiple_of"]
+        n_ctx            = 2048 if config["norm_eps"] == 1e-06 else 4096 # hack to determine LLaMA v1 vs v2
+        n_ff             = -1
+        n_head           = config["n_heads"]
+        n_head_kv        = config["n_kv_heads"] if "n_kv_heads" in config else n_head
+        f_norm_eps       = config["norm_eps"]
+        f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None

         if n_vocab == -1:
             n_vocab = model["tok_embeddings.weight"].shape[0]
@@ -211,15 +214,16 @@ def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
             n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]

         return Params(
-            n_vocab    = n_vocab,
-            n_embd     = n_embd,
-            n_mult     = n_mult,
-            n_layer    = n_layer,
-            n_ctx      = n_ctx,
-            n_ff       = n_ff,
-            n_head     = n_head,
-            n_head_kv  = n_head_kv,
-            f_norm_eps = f_norm_eps,
+            n_vocab          = n_vocab,
+            n_embd           = n_embd,
+            n_mult           = n_mult,
+            n_layer          = n_layer,
+            n_ctx            = n_ctx,
+            n_ff             = n_ff,
+            n_head           = n_head,
+            n_head_kv        = n_head_kv,
+            f_norm_eps       = f_norm_eps,
+            f_rope_freq_base = f_rope_freq_base,
         )

     @staticmethod
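As context for this hunk, a minimal standalone sketch of the fallback it implements (the params.json values are illustrative; CodeLlama ships rope_theta = 1e6, while older LLaMA checkpoints omit the key entirely):

    import json

    # Illustrative CodeLlama-style params.json contents.
    config = json.loads(
        '{"dim": 8192, "n_heads": 64, "n_layers": 80,'
        ' "norm_eps": 1e-05, "rope_theta": 1000000.0}')

    # Same fallback as the diff: a missing "rope_theta" leaves the field None,
    # so the key is simply never written for older LLaMA models.
    f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
    print(f_rope_freq_base)  # 1000000.0 here; None for a v1/v2 params.json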
@@ -754,6 +758,9 @@ def add_meta_arch(self, params: Params) -> None:
         self.gguf.add_head_count_kv (params.n_head_kv)
         self.gguf.add_layer_norm_rms_eps (params.f_norm_eps)

+        if params.f_rope_freq_base:
+            self.gguf.add_rope_freq_base(params.f_rope_freq_base)
+
         if params.ftype:
             self.gguf.add_file_type(params.ftype)

gguf.py (5 additions, 1 deletion)

@@ -47,6 +47,7 @@

 # RoPE
 KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
+KEY_ROPE_FREQ_BASE       = "{arch}.rope.freq_base"
 KEY_ROPE_SCALE_LINEAR    = "{arch}.rope.scale_linear"

 # tokenization
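As a quick illustration (plain Python, no repo dependencies), the {arch} placeholder in these key templates expands per model architecture, so the key actually stored for LLaMA models is "llama.rope.freq_base":

    KEY_ROPE_FREQ_BASE = "{arch}.rope.freq_base"

    # The writer formats the template with the model architecture name.
    print(KEY_ROPE_FREQ_BASE.format(arch="llama"))  # -> llama.rope.freq_base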
@@ -663,7 +664,10 @@ def add_rope_dimension_count(self, count: int):
         self.add_uint32(
             KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)

-    def add_rope_scale_linear(self, value: float):
+    def add_rope_freq_base(self, value: float):
+        self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value)
+
+    def add_rope_scale_linear(self, value: float):
         self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)

     def add_tokenizer_model(self, model: str):
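A hedged usage sketch of the new method (assuming the GGUFWriter constructor takes an output path and an architecture string, the way convert.py instantiates it; the file name is hypothetical):

    import gguf

    writer = gguf.GGUFWriter("codellama-7b.gguf", "llama")
    writer.add_rope_freq_base(1000000.0)  # written as float32 under "llama.rope.freq_base"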
llama.cpp (18 additions, 2 deletions)

@@ -195,6 +195,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,

     LLM_KV_ROPE_DIMENSION_COUNT,
+    LLM_KV_ROPE_FREQ_BASE,
     LLM_KV_ROPE_SCALE_LINEAR,

     LLM_KV_TOKENIZER_MODEL,
@@ -238,6 +239,7 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },

     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+    { LLM_KV_ROPE_FREQ_BASE,       "%s.rope.freq_base"       },
     { LLM_KV_ROPE_SCALE_LINEAR,    "%s.rope.scale_linear"    },

     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
@@ -1561,12 +1563,26 @@ static void llm_load_hparams(
     hparams.n_head_kv = hparams.n_head;
     GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));

-    // TODO: manually setting rope scale should override this
+    // TODO: manually setting rope freq base and scale should override this
+    // FIXME: partial fix when the param specified is not the default value, but
+    // will not work for overriding the model value to the params default
[Author comment on lines +1566 to +1568]
Also has this change to allow overriding the model value of freq_base and freq_scale, but it is not perfect.


+    llama_context_params defaults = llama_context_default_params();
+
+    // rope_freq_base
+    {
+        float ropebase = 10000.0f;
+        GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
+        if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
+            rope_freq_base = ropebase;
+        }
+    }
+
     // rope_freq_scale (inverse of the kv) is optional
     {
         float ropescale = 1.0f;
         GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-        if (ropescale != 1.0f) {
+        if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
             rope_freq_scale = 1.0f/ropescale;
         }
     }
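To make the intended precedence concrete, a hedged Python restatement of the rule (not the actual implementation; the names are invented for illustration). It also exhibits the FIXME above: once the model stores a non-default value, the user cannot override it back to the default.

    from typing import Optional

    DEFAULT_ROPE_FREQ_BASE = 10000.0  # stands in for llama_context_default_params()

    def resolve_rope_freq_base(user_value: float, gguf_value: Optional[float]) -> float:
        # The model's stored value applies only if it is non-default AND the
        # user left the context parameter untouched; an explicit user setting
        # always wins.
        if (gguf_value is not None
                and gguf_value != DEFAULT_ROPE_FREQ_BASE
                and user_value == DEFAULT_ROPE_FREQ_BASE):
            return gguf_value
        return user_value

    assert resolve_rope_freq_base(10000.0, 1000000.0) == 1000000.0  # GGUF value applied
    assert resolve_rope_freq_base(26000.0, 1000000.0) == 26000.0    # user override kept
    assert resolve_rope_freq_base(10000.0, None) == 10000.0         # key absent: default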