Commit 9da243b

Revert "llava : add support for moondream vision language model (ggml-org#6899)"
This reverts commit 46e12c4.
1 parent bd1871f · commit 9da243b

2 files changed: +11 −61 lines

README.md (0 additions, 1 deletion)

@@ -140,7 +140,6 @@ Typically finetunes of the base models below are supported as well.
 - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
-- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 
 **HTTP server**

examples/llava/clip.cpp (11 additions, 60 deletions)

@@ -104,7 +104,6 @@ static std::string format(const char * fmt, ...) {
 #define TN_POS_EMBD "%s.position_embd.weight"
 #define TN_CLASS_EMBD "v.class_embd"
 #define TN_PATCH_EMBD "v.patch_embd.weight"
-#define TN_PATCH_BIAS "v.patch_embd.bias"
 #define TN_ATTN_K "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V "%s.blk.%d.attn_v.%s"
@@ -426,7 +425,6 @@ struct clip_vision_model {
     // embeddings
     struct ggml_tensor * class_embedding;
     struct ggml_tensor * patch_embeddings;
-    struct ggml_tensor * patch_bias;
     struct ggml_tensor * position_embeddings;
 
     struct ggml_tensor * pre_ln_w;
@@ -503,11 +501,6 @@ struct clip_ctx {
     bool use_gelu = false;
     int32_t ftype = 1;
 
-    bool has_class_embedding = true;
-    bool has_pre_norm = true;
-    bool has_post_norm = false;
-    bool has_patch_bias = false;
-
     struct gguf_context * ctx_gguf;
     struct ggml_context * ctx_data;
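
A note for the hunks below: the four defaults describe a stock CLIP ViT (class token, pre-layernorm, no post-layernorm, no patch-embedding bias), and the loader hunk at the end of this diff set them from whichever tensors the GGUF actually contained; deleting the flags hard-wires the stock behavior again.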

@@ -533,7 +526,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
     const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
-    const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    const int num_positions = num_patches + 1;
     const int hidden_size = hparams.hidden_size;
     const int n_head = hparams.n_head;
     const int d_head = hidden_size / n_head;
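
For scale, with the ViT-L/14 settings LLaVA models typically ship (image_size = 336, patch_size = 14, both read from hparams): num_patches = (336/14)² = 24² = 576, so num_positions = 577 once the class token is hard-wired back in. The reverted conditional existed presumably because moondream's SigLIP-style encoder has no class token.
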
@@ -564,23 +557,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
     inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
 
-    if (ctx->has_patch_bias) {
-        // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
-        inp = ggml_add(ctx0, inp, model.patch_bias);
-    }
-
     // concat class_embeddings and patch_embeddings
-    struct ggml_tensor * embeddings = inp;
-    if (ctx->has_class_embedding) {
-        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-        embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
-                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
-        embeddings = ggml_acc(ctx0, embeddings, inp,
-                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
-    }
+    struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
     ggml_set_name(embeddings, "embeddings");
     ggml_set_input(embeddings);
 
+    embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+
+    embeddings = ggml_acc(ctx0, embeddings, inp,
+            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
 
     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
     ggml_set_name(positions, "positions");
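
The two ggml_acc calls restored above are the whole "concat" trick: ggml_acc adds a source tensor into a strided view of the destination starting at a byte offset, so offset 0 drops the class embedding into row 0 and an offset of one row (model.class_embedding->nb[1]) drops the patch rows in after it. A minimal standalone sketch of the same pattern, assuming a ggml checkout contemporary with this commit; sizes and fill values are made up for illustration:

// sketch.cpp: the ggml_acc "prepend a row" pattern from clip.cpp
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    const int hidden_size = 4;
    const int num_patches = 3;

    // one "class" row, num_patches "patch" rows, and a destination with room for both
    struct ggml_tensor * cls = ggml_set_f32(ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_size, 1), 1.0f);
    struct ggml_tensor * inp = ggml_set_f32(ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_size, num_patches), 2.0f);
    struct ggml_tensor * dst = ggml_set_f32(ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_size, num_patches + 1), 0.0f);

    // ggml_acc adds the source into a view of dst; the last argument is a byte
    // offset, so 0 targets row 0 and cls->nb[1] (one row of bytes) targets
    // rows 1..num_patches, exactly as in the clip.cpp concat
    struct ggml_tensor * out = ggml_acc(ctx, dst, cls, dst->nb[1], dst->nb[2], dst->nb[3], 0);
    out = ggml_acc(ctx, out, inp, out->nb[1], out->nb[2], out->nb[3], cls->nb[1]);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

    ggml_free(ctx);
    return 0;
}
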
@@ -590,7 +576,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
 
     // pre-layernorm
-    if (ctx->has_pre_norm) {
+    {
         embeddings = ggml_norm(ctx0, embeddings, eps);
         ggml_set_name(embeddings, "pre_ln");

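As context for this hunk: ggml_norm normalizes each hidden-size row to zero mean and unit variance (with the model's eps), and the lines just past the shown context multiply by pre_ln_w and add pre_ln_b, so the block that now always runs is the usual affine layer norm:

    pre_ln(x) = (x - mean(x)) / sqrt(var(x) + eps) * pre_ln_w + pre_ln_b

The revert only removes the option to skip it, not the computation itself.
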
@@ -678,14 +664,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = cur;
     }
 
-    // post-layernorm
-    if (ctx->has_post_norm) {
-        embeddings = ggml_norm(ctx0, embeddings, eps);
-        ggml_set_name(embeddings, "post_ln");
-
-        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
-    }
-
     // llava projector
     {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@@ -1170,39 +1148,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 
     }
 
-    try {
-        vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
-        new_clip->has_class_embedding = true;
-    } catch (const std::exception& e) {
-        new_clip->has_class_embedding = false;
-    }
-
-    try {
-        vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
-        vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
-        new_clip->has_pre_norm = true;
-    } catch (std::exception & e) {
-        new_clip->has_pre_norm = false;
-    }
-
-    try {
-        vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
-        vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
-        new_clip->has_post_norm = true;
-    } catch (std::exception & e) {
-        new_clip->has_post_norm = false;
-    }
-
-    try {
-        vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
-        new_clip->has_patch_bias = true;
-    } catch (std::exception & e) {
-        new_clip->has_patch_bias = false;
-    }
-
     try {
         vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+        vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
         vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
+        vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
+        vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
     } catch(const std::exception& e) {
         LOG_TEE("%s: failed to load vision model tensors\n", __func__);
     }
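
This loader hunk is the heart of the revert: the moondream patch probed each tensor in its own try/catch and recorded a has_* flag so models missing it could still load, while the restored code fetches everything in one block and logs a failure if anything is absent. A hypothetical helper condensing the removed pattern (load_optional is my label, not a name in the codebase; get_tensor is clip.cpp's existing helper, which throws when the named tensor is missing; assumes clip.cpp's existing includes such as <string> and "ggml.h"):

// hypothetical sketch of the pattern the revert removes: probe one tensor,
// flag its presence instead of failing the whole load
static bool load_optional(struct ggml_context * ctx_data, const std::string & name,
                          struct ggml_tensor ** out) {
    try {
        *out = get_tensor(ctx_data, name); // throws if `name` is not in the GGUF
        return true;
    } catch (const std::exception & /*e*/) {
        *out = nullptr;
        return false;
    }
}

// usage, mirroring one of the reverted blocks:
// new_clip->has_patch_bias = load_optional(new_clip->ctx_data, TN_PATCH_BIAS, &vision_model.patch_bias);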
