@@ -328,6 +328,7 @@ enum llm_kv {
     LLM_KV_SSM_CONV_KERNEL,
     LLM_KV_SSM_STATE_SIZE,
     LLM_KV_SSM_TIME_STEP_RANK,
+    LLM_KV_SSM_DT_B_C_RMS,

     LLM_KV_TOKENIZER_MODEL,
     LLM_KV_TOKENIZER_PRE,
@@ -426,6 +427,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_INNER_SIZE,     "%s.ssm.inner_size" },
     { LLM_KV_SSM_STATE_SIZE,     "%s.ssm.state_size" },
     { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
+    { LLM_KV_SSM_DT_B_C_RMS,     "%s.ssm.dt_b_c_rms" },

     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_PRE,   "tokenizer.ggml.pre" },
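Each entry in LLM_KV_NAMES maps an llm_kv enum value to a GGUF metadata key template whose "%s" is replaced by the architecture name, so the new flag is stored under a key such as mamba.ssm.dt_b_c_rms for Mamba-family models. The following is a minimal sketch of that expansion, using a hypothetical format_kv helper rather than llama.cpp's own key-formatting code:

// Minimal sketch (hypothetical helper, not llama.cpp's actual formatting code):
// expand a "%s.ssm.dt_b_c_rms" style template with the architecture name.
#include <cstdio>
#include <string>

static std::string format_kv(const char * tmpl, const char * arch) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), tmpl, arch);  // "%s" -> architecture name
    return buf;
}

int main() {
    // For a Mamba-family model the flag would be looked up as "mamba.ssm.dt_b_c_rms"
    std::printf("%s\n", format_kv("%s.ssm.dt_b_c_rms", "mamba").c_str());
    return 0;
}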
@@ -2237,6 +2239,7 @@ struct llama_hparams {
     uint32_t ssm_d_inner = 0;
     uint32_t ssm_d_state = 0;
     uint32_t ssm_dt_rank = 0;
+    bool ssm_dt_b_c_rms = false;

     float f_clamp_kqv      = 0.0f;
     float f_max_alibi_bias = 0.0f;
@@ -2286,6 +2289,7 @@ struct llama_hparams {
         if (this->ssm_d_inner != other.ssm_d_inner) return true;
         if (this->ssm_d_state != other.ssm_d_state) return true;
         if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
+        if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;

         if (this->dec_start_token_id != other.dec_start_token_id) return true;
@@ -5052,6 +5056,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
                 ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
                 ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);

                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
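The trailing false passed to ml.get_key marks the key as optional: GGUF files converted from plain Mamba checkpoints that lack ssm.dt_b_c_rms keep the default ssm_dt_b_c_rms = false from llama_hparams, while conversions that write the key (as FalconMamba models would) enable the extra normalization. Below is a small standalone sketch of that optional-key-with-default pattern, using an ordinary string map in place of the real GGUF metadata and loader:

// Standalone sketch of an optional boolean key with a default, assuming a plain
// string map stands in for GGUF metadata; llama_model_loader::get_key differs in detail.
#include <map>
#include <stdexcept>
#include <string>

static bool get_bool_key(const std::map<std::string, std::string> & kv,
                         const std::string & key, bool & out, bool required = true) {
    auto it = kv.find(key);
    if (it == kv.end()) {
        if (required) throw std::runtime_error("missing key: " + key);
        return false;              // key absent: leave `out` at its default
    }
    out = (it->second == "true");
    return true;
}

int main() {
    std::map<std::string, std::string> meta;  // no "mamba.ssm.dt_b_c_rms" entry
    bool dt_b_c_rms = false;                  // default, as in llama_hparams
    get_bool_key(meta, "mamba.ssm.dt_b_c_rms", dt_b_c_rms, /*required=*/false);
    // dt_b_c_rms stays false for a plain Mamba file and becomes true only
    // when the key is present and set, as a FalconMamba conversion would write it.
    return 0;
}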
@@ -5907,6 +5912,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: ssm_d_inner    = %u\n", __func__, hparams.ssm_d_inner);
         LLAMA_LOG_INFO("%s: ssm_d_state    = %u\n", __func__, hparams.ssm_d_state);
         LLAMA_LOG_INFO("%s: ssm_dt_rank    = %u\n", __func__, hparams.ssm_dt_rank);
+        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
     }

     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
@@ -12165,6 +12171,10 @@ struct llm_build_context {
         GGML_ASSERT(2 * d_model == d_inner);
         const int64_t d_state = hparams.ssm_d_state;
         const int64_t dt_rank = hparams.ssm_dt_rank;
+        // Some Mamba variants (e.g. FalconMamba) apply RMS norm to the dt, B and C projections
+        const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
+        // Use the same RMS norm epsilon as the final layer norm
+        const float norm_rms_eps = hparams.f_norm_rms_eps;

         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -12245,6 +12255,13 @@ struct llm_build_context {
             struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*dt_rank);
             struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state));

+            // Some Mamba variants (e.g. FalconMamba) apply RMS norm to the dt, B and C projections
+            if (ssm_dt_b_c_rms) {
+                dt = ggml_rms_norm(ctx0, dt, norm_rms_eps);
+                B  = ggml_rms_norm(ctx0, B,  norm_rms_eps);
+                C  = ggml_rms_norm(ctx0, C,  norm_rms_eps);
+            }
+
             // {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens}
             dt = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_dt, dt);
             dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
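For reference, ggml_rms_norm normalizes along the first dimension by the root mean square and does not itself apply a learned scale, i.e. y_i = x_i / sqrt(mean(x^2) + eps); here it is applied to the dt, B and C views with the model's f_norm_rms_eps. A scalar sketch of that computation on a single row of floats, for illustration only:

// Scalar sketch of RMS normalization as applied to each dt/B/C row above:
// y_i = x_i / sqrt(mean(x_j^2) + eps), with no learned scale applied here.
#include <cmath>
#include <cstdio>
#include <vector>

static void rms_norm(std::vector<float> & x, float eps) {
    double sum_sq = 0.0;
    for (float v : x) sum_sq += (double) v * v;
    const float scale = 1.0f / std::sqrt((float) (sum_sq / x.size()) + eps);
    for (float & v : x) v *= scale;
}

int main() {
    std::vector<float> row = { 0.5f, -1.0f, 2.0f, 0.0f };
    rms_norm(row, 1e-5f);  // eps plays the role of hparams.f_norm_rms_eps above
    for (float v : row) std::printf("%f\n", v);
    return 0;
}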
@@ -16109,6 +16126,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
             default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
         }
+        if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
+            new_type = GGML_TYPE_F16;
+        }
         LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
         ++qs.n_fallback;
     }
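The added guard protects the fallback path itself: block-quantized formats such as Q8_0 store rows in fixed-size blocks, so a tensor whose first dimension is not a multiple of the block size cannot use them and is dropped to F16 instead, presumably so that the Mamba SSM weights un-skipped below can be quantized where possible and still load safely otherwise. A standalone sketch of the divisibility check, with an assumed block-size table standing in for ggml_blck_size:

// Standalone sketch of the block-size fallback; the block sizes below are assumed
// for illustration and stand in for ggml_blck_size() on real ggml types.
#include <cstdint>
#include <cstdio>

enum class qtype { q4_k, q8_0, f16 };

static int64_t blck_size(qtype t) {
    switch (t) {
        case qtype::q4_k: return 256;  // super-block quantization
        case qtype::q8_0: return 32;   // simple block quantization
        case qtype::f16:  return 1;    // no blocking
    }
    return 1;
}

static qtype pick_type(int64_t ne0, qtype wanted) {
    // A row can only be block-quantized if its width is a whole number of blocks;
    // otherwise fall back to F16, mirroring the check added above.
    return (ne0 % blck_size(wanted) == 0) ? wanted : qtype::f16;
}

int main() {
    std::printf("%d\n", (int) pick_type(4096, qtype::q8_0));  // multiple of 32 -> q8_0
    std::printf("%d\n", (int) pick_type(48,   qtype::q8_0));  // not a multiple -> f16
    return 0;
}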
@@ -16437,8 +16457,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // do not quantize Mamba's small yet 2D weights
         // NOTE: can't use LLM_TN here because the layer number is not known
         quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
-        quantize &= name.find("ssm_x.weight") == std::string::npos;
-        quantize &= name.find("ssm_dt.weight") == std::string::npos;

         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;