diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h
index 8d310fb0271c5..53ac381304765 100644
--- a/examples/llava/clip-impl.h
+++ b/examples/llava/clip-impl.h
@@ -90,8 +90,6 @@
 #define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
 #define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
 #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
-#define TN_GLM_BOI_W "adapter.boi"
-#define TN_GLM_EOI_W "adapter.eoi"
 
 enum projector_type {
     PROJECTOR_TYPE_MLP,
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 4eec4a2646798..9a5ab7c819585 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -244,8 +244,6 @@ struct clip_vision_model {
     //GLMV-Edge projection
     struct ggml_tensor * mm_model_adapter_conv_w = nullptr;
     struct ggml_tensor * mm_model_adapter_conv_b = nullptr;
-    struct ggml_tensor * boi_w = nullptr;
-    struct ggml_tensor * eoi_w = nullptr;
 
     // MobileVLM projection
     struct ggml_tensor * mm_model_mlp_1_w = nullptr;
@@ -1697,8 +1695,6 @@ struct clip_model_loader {
                     vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
                     vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight"));
                     vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
-                    vision_model.boi_w = get_tensor(TN_GLM_BOI_W);
-                    vision_model.eoi_w = get_tensor(TN_GLM_EOI_W);
                 } break;
             case PROJECTOR_TYPE_MERGER:
                 {
@@ -2593,8 +2589,7 @@ void clip_free(clip_ctx * ctx) {
 }
 
 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
-    int extra_tokens = ctx->has_glm_projector ? 2 : 0;
-    return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
 
 size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
@@ -2790,9 +2785,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
     if (ctx->has_glm_projector) {
         GGML_ASSERT(batch_size == 1);
-        ggml_tensor * boi = ctx->vision_model.boi_w;
-        ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi));
-        vec = (float*)(vec+ggml_nelements(boi)); //offset for boi
     }
 
     // build the inference graph
@@ -3001,13 +2993,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     // copy the embeddings to the location passed by the user
     ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
 
-    if (ctx->has_glm_projector) {
-        //eoi
-        ggml_tensor * eoi = ctx->vision_model.eoi_w;
-        int offset = ggml_nelements(embeddings);
-        ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi));
-    }
-
     return true;
 }
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 11ca7b30f1ac6..a994ef0166e6a 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -186,6 +186,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
         marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
 
+    } else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+        // <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
+        marker_modified = "<|begin_of_image|>" + ctx->image_marker + "<|end_of_image|>";
+        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+
     } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
         // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
         marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
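
The mtmd.cpp hunk moves GLM-Edge's image delimiters out of learned boi/eoi embeddings and into plain prompt text: the image marker is wrapped in <|begin_of_image|> / <|end_of_image|> before tokenization. A minimal standalone sketch of that rewrite follows; the replace_all helper, the <__image__> marker value, and the example prompt are illustrative stand-ins, not the mtmd API.

```cpp
#include <iostream>
#include <string>

// Illustrative stand-in for mtmd's internal string_replace_all helper.
static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    if (from.empty()) {
        return;
    }
    size_t pos = 0;
    while ((pos = s.find(from, pos)) != std::string::npos) {
        s.replace(pos, from.size(), to);
        pos += to.size();
    }
}

int main() {
    // Assumed placeholder marker; the real value comes from the mtmd context.
    const std::string image_marker = "<__image__>";
    std::string prompt = "Describe this picture: <__image__>";

    // Same rewrite the new PROJECTOR_TYPE_GLM_EDGE branch performs:
    // <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
    const std::string marker_modified = "<|begin_of_image|>" + image_marker + "<|end_of_image|>";
    replace_all(prompt, image_marker, marker_modified);

    std::cout << prompt << "\n";
    // Prints: Describe this picture: <|begin_of_image|><__image__><|end_of_image|>
    return 0;
}
```

Because the delimiters are now ordinary prompt tokens, clip_embd_nbytes no longer reserves room for two extra embedding rows, which is why the extra_tokens term and the boi/eoi tensor copies are dropped in the clip.cpp hunks above.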