
Commit aa8c01a

Revert moondream vision language model support
This broke the server's LLaVA support in a non-obvious way. See ggml-org/llama.cpp#6899 and ggml-org/llama.cpp#7060.
1 parent 8dae978 commit aa8c01a
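For context: as the diff below shows, the moondream patch loaded several vision-tower tensors as optional, each in its own try/catch, and recorded has_* capability flags on clip_ctx that the graph builder then branched on. This revert restores the earlier behaviour, where the class embedding, position embeddings, and pre-layernorm weights are mandatory, the patch bias and post-layernorm are not handled at all, and the graph is built unconditionally. The standalone sketch below (not llamafile code; the get_tensor stand-in and fake_tensor type are illustrative, only the tensor names are taken from the diff) contrasts the two loading styles.

// Minimal sketch of the two tensor-loading styles seen in this diff.
// Assumption: get_tensor() here is a stand-in that throws when a tensor is absent,
// mirroring how clip.cpp's loader signals a missing GGUF tensor.
#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

struct fake_tensor { int id; };   // placeholder for ggml_tensor *

// Stand-in for the loader's get_tensor(): throws if the named tensor is missing.
static fake_tensor get_tensor(const std::map<std::string, fake_tensor> & weights,
                              const std::string & name) {
    auto it = weights.find(name);
    if (it == weights.end()) {
        throw std::runtime_error("missing tensor: " + name);
    }
    return it->second;
}

int main() {
    // Pretend GGUF contents: a classic CLIP/LLaVA vision tower (no patch bias tensor).
    std::map<std::string, fake_tensor> weights = {
        {"v.class_embd", {1}}, {"v.patch_embd.weight", {2}}, {"v.position_embd.weight", {3}},
    };

    // Reverted (moondream) style: the tensor is optional; a flag records whether it exists,
    // and graph construction later branches on that flag.
    bool has_patch_bias = true;
    try {
        get_tensor(weights, "v.patch_embd.bias");
    } catch (const std::exception &) {
        has_patch_bias = false;
    }
    std::printf("optional style: has_patch_bias = %d\n", has_patch_bias);

    // Restored style: everything is loaded in one try block and treated as mandatory.
    try {
        get_tensor(weights, "v.patch_embd.weight");
        get_tensor(weights, "v.class_embd");
        get_tensor(weights, "v.position_embd.weight");
        std::printf("required style: all vision tensors present\n");
    } catch (const std::exception & e) {
        std::printf("required style: failed to load vision model tensors (%s)\n", e.what());
    }
    return 0;
}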

File tree: 1 file changed (+13, -61 lines)

llama.cpp/llava/clip.cpp

Lines changed: 13 additions & 61 deletions
@@ -98,7 +98,6 @@ static std::string format(const char * fmt, ...) {
 #define TN_POS_EMBD        "%s.position_embd.weight"
 #define TN_CLASS_EMBD      "v.class_embd"
 #define TN_PATCH_EMBD      "v.patch_embd.weight"
-#define TN_PATCH_BIAS      "v.patch_embd.bias"
 #define TN_ATTN_K          "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V          "%s.blk.%d.attn_v.%s"
@@ -420,7 +419,6 @@ struct clip_vision_model {
     // embeddings
     struct ggml_tensor * class_embedding;
     struct ggml_tensor * patch_embeddings;
-    struct ggml_tensor * patch_bias;
     struct ggml_tensor * position_embeddings;

     struct ggml_tensor * pre_ln_w;
@@ -497,11 +495,6 @@ struct clip_ctx {
     bool use_gelu = false;
     int32_t ftype = 1;

-    bool has_class_embedding = true;
-    bool has_pre_norm = true;
-    bool has_post_norm = false;
-    bool has_patch_bias = false;
-
     struct gguf_context * ctx_gguf;
     struct ggml_context * ctx_data;

@@ -527,7 +520,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const int patch_size           = hparams.patch_size;
     const int num_patches          = ((image_size / patch_size) * (image_size / patch_size));
     const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
-    const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    const int num_positions = num_patches + 1;
     const int hidden_size = hparams.hidden_size;
     const int n_head = hparams.n_head;
     const int d_head = hidden_size / n_head;
@@ -558,23 +551,17 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
     inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

-    if (ctx->has_patch_bias) {
-        // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
-        inp = ggml_add(ctx0, inp, model.patch_bias);
-    }
-
-    // concat class_embeddings and patch_embeddings
-    struct ggml_tensor * embeddings = inp;
-    if (ctx->has_class_embedding) {
-        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-        embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
-                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
-        embeddings = ggml_acc(ctx0, embeddings, inp,
-                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
-    }
+    // [jart] revert https://github.com/ggerganov/llama.cpp/pull/6899/files
+    // do not sync these lines until https://github.com/ggerganov/llama.cpp/issues/7060 is resolved
+    struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
     ggml_set_name(embeddings, "embeddings");
     ggml_set_input(embeddings);

+    embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+
+    embeddings = ggml_acc(ctx0, embeddings, inp,
+            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);

     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
     ggml_set_name(positions, "positions");
@@ -584,7 +571,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));

     // pre-layernorm
-    if (ctx->has_pre_norm) {
+    {
         embeddings = ggml_norm(ctx0, embeddings, eps);
         ggml_set_name(embeddings, "pre_ln");

@@ -672,14 +659,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             embeddings = cur;
         }

-    // post-layernorm
-    if (ctx->has_post_norm) {
-        embeddings = ggml_norm(ctx0, embeddings, eps);
-        ggml_set_name(embeddings, "post_ln");
-
-        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
-    }
-
     // llava projector
     {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@@ -1162,39 +1141,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

        }

-        try {
-            vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
-            new_clip->has_class_embedding = true;
-        } catch (const std::exception& e) {
-            new_clip->has_class_embedding = false;
-        }
-
-        try {
-            vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
-            vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
-            new_clip->has_pre_norm = true;
-        } catch (std::exception & e) {
-            new_clip->has_pre_norm = false;
-        }
-
-        try {
-            vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
-            vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
-            new_clip->has_post_norm = true;
-        } catch (std::exception & e) {
-            new_clip->has_post_norm = false;
-        }
-
-        try {
-            vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
-            new_clip->has_patch_bias = true;
-        } catch (std::exception & e) {
-            new_clip->has_patch_bias = false;
-        }
-
         try {
             vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+            vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
             vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
+            vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
+            vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
         } catch(const std::exception& e) {
             LOG_TEE("%s: failed to load vision model tensors\n", __func__);
         }
