@@ -98,7 +98,6 @@ static std::string format(const char * fmt, ...) {
9898#define TN_POS_EMBD " %s.position_embd.weight"
9999#define TN_CLASS_EMBD " v.class_embd"
100100#define TN_PATCH_EMBD " v.patch_embd.weight"
101- #define TN_PATCH_BIAS " v.patch_embd.bias"
102101#define TN_ATTN_K " %s.blk.%d.attn_k.%s"
103102#define TN_ATTN_Q " %s.blk.%d.attn_q.%s"
104103#define TN_ATTN_V " %s.blk.%d.attn_v.%s"
@@ -420,7 +419,6 @@ struct clip_vision_model {
420419 // embeddings
421420 struct ggml_tensor * class_embedding;
422421 struct ggml_tensor * patch_embeddings;
423- struct ggml_tensor * patch_bias;
424422 struct ggml_tensor * position_embeddings;
425423
426424 struct ggml_tensor * pre_ln_w;
@@ -497,11 +495,6 @@ struct clip_ctx {
497495 bool use_gelu = false ;
498496 int32_t ftype = 1 ;
499497
500- bool has_class_embedding = true ;
501- bool has_pre_norm = true ;
502- bool has_post_norm = false ;
503- bool has_patch_bias = false ;
504-
505498 struct gguf_context * ctx_gguf;
506499 struct ggml_context * ctx_data;
507500
@@ -527,7 +520,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
527520 const int patch_size = hparams.patch_size ;
528521 const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
529522 const int num_patches_per_side = image_size / patch_size; GGML_UNUSED (num_patches_per_side);
530- const int num_positions = num_patches + (ctx-> has_class_embedding ? 1 : 0 ) ;
523+ const int num_positions = num_patches + 1 ;
531524 const int hidden_size = hparams.hidden_size ;
532525 const int n_head = hparams.n_head ;
533526 const int d_head = hidden_size / n_head;
@@ -558,23 +551,17 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
558551 inp = ggml_reshape_3d (ctx0, inp, num_patches, hidden_size, batch_size);
559552 inp = ggml_cont (ctx0, ggml_permute (ctx0, inp, 1 , 0 , 2 , 3 ));
560553
561- if (ctx->has_patch_bias ) {
562- // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
563- inp = ggml_add (ctx0, inp, model.patch_bias );
564- }
565-
566- // concat class_embeddings and patch_embeddings
567- struct ggml_tensor * embeddings = inp;
568- if (ctx->has_class_embedding ) {
569- embeddings = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
570- embeddings = ggml_acc (ctx0, embeddings, model.class_embedding ,
571- embeddings->nb [1 ], embeddings->nb [2 ], embeddings->nb [3 ], 0 );
572- embeddings = ggml_acc (ctx0, embeddings, inp,
573- embeddings->nb [1 ], embeddings->nb [2 ], embeddings->nb [3 ], model.class_embedding ->nb [1 ]);
574- }
554+ // [jart] revert https://github.com/ggerganov/llama.cpp/pull/6899/files
555+ // do not sync these lines until https://github.com/ggerganov/llama.cpp/issues/7060 is resolved
556+ struct ggml_tensor * embeddings = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
575557 ggml_set_name (embeddings, " embeddings" );
576558 ggml_set_input (embeddings);
577559
560+ embeddings = ggml_acc (ctx0, embeddings, model.class_embedding ,
561+ embeddings->nb [1 ], embeddings->nb [2 ], embeddings->nb [3 ], 0 );
562+
563+ embeddings = ggml_acc (ctx0, embeddings, inp,
564+ embeddings->nb [1 ], embeddings->nb [2 ], embeddings->nb [3 ], model.class_embedding ->nb [1 ]);
578565
579566 struct ggml_tensor * positions = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, num_positions);
580567 ggml_set_name (positions, " positions" );
@@ -584,7 +571,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
584571 ggml_add (ctx0, embeddings, ggml_get_rows (ctx0, model.position_embeddings , positions));
585572
586573 // pre-layernorm
587- if (ctx-> has_pre_norm ) {
574+ {
588575 embeddings = ggml_norm (ctx0, embeddings, eps);
589576 ggml_set_name (embeddings, " pre_ln" );
590577
@@ -672,14 +659,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
672659 embeddings = cur;
673660 }
674661
675- // post-layernorm
676- if (ctx->has_post_norm ) {
677- embeddings = ggml_norm (ctx0, embeddings, eps);
678- ggml_set_name (embeddings, " post_ln" );
679-
680- embeddings = ggml_add (ctx0, ggml_mul (ctx0, embeddings, model.post_ln_w ), model.post_ln_b );
681- }
682-
683662 // llava projector
684663 {
685664 embeddings = ggml_reshape_2d (ctx0, embeddings, embeddings->ne [0 ], embeddings->ne [1 ]);
@@ -1162,39 +1141,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
11621141
11631142 }
11641143
1165- try {
1166- vision_model.class_embedding = get_tensor (new_clip->ctx_data , TN_CLASS_EMBD);
1167- new_clip->has_class_embedding = true ;
1168- } catch (const std::exception& e) {
1169- new_clip->has_class_embedding = false ;
1170- }
1171-
1172- try {
1173- vision_model.pre_ln_w = get_tensor (new_clip->ctx_data , format (TN_LN_PRE, " v" , " weight" ));
1174- vision_model.pre_ln_b = get_tensor (new_clip->ctx_data , format (TN_LN_PRE, " v" , " bias" ));
1175- new_clip->has_pre_norm = true ;
1176- } catch (std::exception & e) {
1177- new_clip->has_pre_norm = false ;
1178- }
1179-
1180- try {
1181- vision_model.post_ln_w = get_tensor (new_clip->ctx_data , format (TN_LN_POST, " v" , " weight" ));
1182- vision_model.post_ln_b = get_tensor (new_clip->ctx_data , format (TN_LN_POST, " v" , " bias" ));
1183- new_clip->has_post_norm = true ;
1184- } catch (std::exception & e) {
1185- new_clip->has_post_norm = false ;
1186- }
1187-
1188- try {
1189- vision_model.patch_bias = get_tensor (new_clip->ctx_data , TN_PATCH_BIAS);
1190- new_clip->has_patch_bias = true ;
1191- } catch (std::exception & e) {
1192- new_clip->has_patch_bias = false ;
1193- }
1194-
11951144 try {
11961145 vision_model.patch_embeddings = get_tensor (new_clip->ctx_data , TN_PATCH_EMBD);
1146+ vision_model.class_embedding = get_tensor (new_clip->ctx_data , TN_CLASS_EMBD);
11971147 vision_model.position_embeddings = get_tensor (new_clip->ctx_data , format (TN_POS_EMBD, " v" ));
1148+ vision_model.pre_ln_w = get_tensor (new_clip->ctx_data , format (TN_LN_PRE, " v" , " weight" ));
1149+ vision_model.pre_ln_b = get_tensor (new_clip->ctx_data , format (TN_LN_PRE, " v" , " bias" ));
11981150 } catch (const std::exception& e) {
11991151 LOG_TEE (" %s: failed to load vision model tensors\n " , __func__);
12001152 }
0 commit comments