@@ -330,6 +330,7 @@ enum llm_kv {
     LLM_KV_SSM_CONV_KERNEL,
     LLM_KV_SSM_STATE_SIZE,
     LLM_KV_SSM_TIME_STEP_RANK,
+    LLM_KV_SSM_DT_B_C_RMS,
 
     LLM_KV_TOKENIZER_MODEL,
     LLM_KV_TOKENIZER_PRE,
@@ -428,6 +429,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_INNER_SIZE,     "%s.ssm.inner_size" },
     { LLM_KV_SSM_STATE_SIZE,     "%s.ssm.state_size" },
     { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
+    { LLM_KV_SSM_DT_B_C_RMS,     "%s.ssm.dt_b_c_rms" },
 
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_PRE,   "tokenizer.ggml.pre" },
@@ -2239,6 +2241,7 @@ struct llama_hparams {
     uint32_t ssm_d_inner = 0;
     uint32_t ssm_d_state = 0;
     uint32_t ssm_dt_rank = 0;
+    bool     ssm_dt_b_c_rms = false;
 
     float f_clamp_kqv = 0.0f;
     float f_max_alibi_bias = 0.0f;
@@ -2288,6 +2291,7 @@ struct llama_hparams {
         if (this->ssm_d_inner != other.ssm_d_inner) return true;
         if (this->ssm_d_state != other.ssm_d_state) return true;
         if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
+        if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
 
         if (this->dec_start_token_id != other.dec_start_token_id) return true;
 
@@ -5100,6 +5104,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
                 ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
                 ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
 
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
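Note on the hunk above: the trailing false argument to ml.get_key marks %s.ssm.dt_b_c_rms as an optional key, so GGUF files converted before this change simply leave hparams.ssm_dt_b_c_rms at its default of false. As a rough illustration only, the sketch below reads the same flag with the plain gguf C API; the helper name read_ssm_dt_b_c_rms and the assumption that the gguf_* declarations are reachable through ggml.h are mine, not part of the patch.

    #include <stdio.h>
    #include "ggml.h"   // gguf_* declarations (header layout may differ across ggml versions)

    // Sketch: read "<arch>.ssm.dt_b_c_rms" from an already-loaded GGUF context,
    // falling back to false when the key is absent -- mirroring get_key(..., false).
    static bool read_ssm_dt_b_c_rms(const struct gguf_context * ctx, const char * arch) {
        char key[128];
        snprintf(key, sizeof(key), "%s.ssm.dt_b_c_rms", arch);
        const int key_id = gguf_find_key(ctx, key);
        return key_id >= 0 ? gguf_get_val_bool(ctx, key_id) : false;
    }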
@@ -5962,6 +5967,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: ssm_d_inner    = %u\n", __func__, hparams.ssm_d_inner);
         LLAMA_LOG_INFO("%s: ssm_d_state    = %u\n", __func__, hparams.ssm_d_state);
         LLAMA_LOG_INFO("%s: ssm_dt_rank    = %u\n", __func__, hparams.ssm_dt_rank);
+        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
     }
 
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
@@ -12279,6 +12285,10 @@ struct llm_build_context {
         GGML_ASSERT(2 * d_model == d_inner);
         const int64_t d_state = hparams.ssm_d_state;
         const int64_t dt_rank = hparams.ssm_dt_rank;
+        // Some Mamba variants (e.g. FalconMamba) apply RMS norm on the B, C and Dt projections
+        const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
+        // Use the same RMS norm epsilon as the final layer norm
+        const float norm_rms_eps = hparams.f_norm_rms_eps;
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -12359,6 +12369,13 @@ struct llm_build_context {
             struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*dt_rank);
             struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state));
 
+            // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
+            if (ssm_dt_b_c_rms) {
+                dt = ggml_rms_norm(ctx0, dt, norm_rms_eps);
+                B  = ggml_rms_norm(ctx0, B,  norm_rms_eps);
+                C  = ggml_rms_norm(ctx0, C,  norm_rms_eps);
+            }
+
             // {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens}
             dt = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_dt, dt);
             dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b);
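Note on the hunk above: ggml_rms_norm normalizes each row along the first dimension, y = x / sqrt(mean(x^2) + eps), with no learned scale; here it is applied to the dt, B and C views before the dt projection, reusing hparams.f_norm_rms_eps as the epsilon. The standalone sketch below shows the per-row arithmetic for a contiguous float row; it is a simplification for illustration, not the ggml implementation.

    #include <math.h>

    // Sketch of the per-row computation behind the ggml_rms_norm calls above:
    // scale every element by 1/sqrt(mean of squares + eps); no learned weight.
    static void rms_norm_row(float * x, int n, float eps) {
        float sum_sq = 0.0f;
        for (int i = 0; i < n; i++) {
            sum_sq += x[i] * x[i];
        }
        const float scale = 1.0f / sqrtf(sum_sq / n + eps);
        for (int i = 0; i < n; i++) {
            x[i] *= scale;
        }
    }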
@@ -16480,6 +16497,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
             default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
         }
+        if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
+            new_type = GGML_TYPE_F16;
+        }
         LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
         ++qs.n_fallback;
     }
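Note on the hunk above: block-quantized ggml types store each row in fixed-size blocks (ggml_blck_size, e.g. 32 elements for Q8_0 and 256 for the k-quants), so a fallback type is only usable when the row length ne[0] is a multiple of its block size; otherwise the added check drops to F16, whose block size is 1. A minimal sketch of that rule, with a hypothetical helper name:

    #include "ggml.h"

    // Sketch: keep the wanted fallback type only if the row length divides evenly
    // into that type's quantization blocks; otherwise fall back to unquantized F16.
    static enum ggml_type pick_fallback_type(enum ggml_type wanted, int64_t ne0) {
        return (ne0 % ggml_blck_size(wanted) == 0) ? wanted : GGML_TYPE_F16;
    }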
@@ -16822,8 +16842,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // do not quantize Mamba's small yet 2D weights
         // NOTE: can't use LLM_TN here because the layer number is not known
         quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
-        quantize &= name.find("ssm_x.weight") == std::string::npos;
-        quantize &= name.find("ssm_dt.weight") == std::string::npos;
 
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;