@@ -108,9 +108,9 @@ struct mtmd_context {
     llama_token tok_ov_img_end   = LLAMA_TOKEN_NULL; // overview image
     llama_token tok_slices_start = LLAMA_TOKEN_NULL; // start of all slices
     llama_token tok_slices_end   = LLAMA_TOKEN_NULL; // end of all slices
-    llama_token tok_sli_bm_start = LLAMA_TOKEN_NULL; // single slice start
-    llama_token tok_sli_bm_end   = LLAMA_TOKEN_NULL; // single slice end
-    llama_token tok_sli_bm_mid   = LLAMA_TOKEN_NULL; // between 2 slices
+    llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice start
+    llama_token tok_sli_img_end   = LLAMA_TOKEN_NULL; // single slice end
+    llama_token tok_sli_img_mid   = LLAMA_TOKEN_NULL; // between 2 slices
     llama_token tok_row_end      = LLAMA_TOKEN_NULL; // end of row
     bool        tok_row_end_trail = false;
     bool        ov_img_first      = false;
@@ -156,8 +156,8 @@ struct mtmd_context {
             tok_ov_img_end     = lookup_token("</image>");
             tok_slices_start   = lookup_token("<slice>");
             tok_slices_end     = lookup_token("</slice>");
-            tok_sli_bm_start   = tok_ov_img_start;
-            tok_sli_bm_end     = tok_ov_img_end;
+            tok_sli_img_start  = tok_ov_img_start;
+            tok_sli_img_end    = tok_ov_img_end;
             tok_row_end        = lookup_token("\n");
             tok_row_end_trail  = false; // no trailing end-of-row token
             ov_img_first       = true;
@@ -168,8 +168,8 @@ struct mtmd_context {
             slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;
             tok_ov_img_start  = lookup_token("<image>");
             tok_ov_img_end    = lookup_token("</image>");
-            tok_sli_bm_start  = lookup_token("<slice>");
-            tok_sli_bm_end    = lookup_token("</slice>");
+            tok_sli_img_start = lookup_token("<slice>");
+            tok_sli_img_end   = lookup_token("</slice>");
             tok_row_end       = lookup_token("\n");
             tok_row_end_trail = false; // no trailing end-of-row token
             ov_img_first      = true;
@@ -186,7 +186,7 @@ struct mtmd_context {
             // <|image_end|>
             slice_tmpl        = MTMD_SLICE_TMPL_LLAMA4;
             tok_ov_img_start  = lookup_token("<|image|>");
-            tok_sli_bm_mid    = lookup_token("<|tile_x_separator|>");
+            tok_sli_img_mid   = lookup_token("<|tile_x_separator|>");
             tok_row_end       = lookup_token("<|tile_y_separator|>");
             tok_row_end_trail = true; // add trailing end-of-row token
             ov_img_first      = false; // overview image is last
@@ -196,6 +196,16 @@ struct mtmd_context {
             // TODO @ngxson : check if model n_mel is 128 or 80
             w_filters = whisper_precalc_filters::get_128_bins();
         }
+
+        // warning messages
+        if (proj == PROJECTOR_TYPE_LLAMA4) {
+            LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
+                    "https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
+        }
+        if (has_audio) {
+            LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
+                    "https://github.com/ggml-org/llama.cpp/pull/13623\n", __func__);
+        }
     }

     ~mtmd_context() {
@@ -441,15 +451,15 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
         for (int y = 0; y < n_row; y++) {
             for (int x = 0; x < n_col; x++) {
                 const bool is_last_in_row = (x == n_col - 1);
-                if (ctx->tok_sli_bm_start != LLAMA_TOKEN_NULL) {
-                    add_text_chunk({ctx->tok_sli_bm_start});
+                if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
+                    add_text_chunk({ctx->tok_sli_img_start});
                 }
                 output->entries.emplace_back(std::move(chunks[y * n_col + x]));
-                if (ctx->tok_sli_bm_end != LLAMA_TOKEN_NULL) {
-                    add_text_chunk({ctx->tok_sli_bm_end});
+                if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
+                    add_text_chunk({ctx->tok_sli_img_end});
                 }
-                if (!is_last_in_row && ctx->tok_sli_bm_mid != LLAMA_TOKEN_NULL) {
-                    add_text_chunk({ctx->tok_sli_bm_mid});
+                if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
+                    add_text_chunk({ctx->tok_sli_img_mid});
                 }
             }
             if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
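
For reference, here is a minimal standalone sketch (not the llama.cpp API) of how the renamed tok_sli_img_* markers and tok_row_end are placed around a grid of slices by the loop above. Plain strings stand in for llama_token, printf stands in for add_text_chunk, and an empty string plays the role of LLAMA_TOKEN_NULL; the two configurations in main() are assumed to match the MiniCPM-V 2.6 and Llama 4 branches shown earlier in this diff.

#include <cstdio>
#include <string>

// Standalone illustration only: mirrors the slice/row marker placement of the loop above.
struct slice_markers {
    std::string sli_img_start; // stands in for ctx->tok_sli_img_start
    std::string sli_img_end;   // stands in for ctx->tok_sli_img_end
    std::string sli_img_mid;   // stands in for ctx->tok_sli_img_mid
    std::string row_end;       // stands in for ctx->tok_row_end
    bool        row_end_trail; // stands in for ctx->tok_row_end_trail
};

static void print_grid_layout(const slice_markers & m, int n_row, int n_col) {
    for (int y = 0; y < n_row; y++) {
        for (int x = 0; x < n_col; x++) {
            const bool is_last_in_row = (x == n_col - 1);
            if (!m.sli_img_start.empty()) {
                printf("%s", m.sli_img_start.c_str());
            }
            printf("[slice %d,%d]", y, x); // the slice's image embedding chunk goes here
            if (!m.sli_img_end.empty()) {
                printf("%s", m.sli_img_end.c_str());
            }
            if (!is_last_in_row && !m.sli_img_mid.empty()) {
                printf("%s", m.sli_img_mid.c_str());
            }
        }
        if ((y != n_row - 1 || m.row_end_trail) && !m.row_end.empty()) {
            printf("%s", m.row_end.c_str());
        }
    }
    printf("\n"); // terminate the printed line; not part of the marker layout
}

int main() {
    // MiniCPM-V 2.6 style: each slice wrapped in <slice>...</slice>, rows separated by "\n"
    print_grid_layout({"<slice>", "</slice>", "", "\n", false}, 2, 2);
    // Llama 4 style: tiles separated by <|tile_x_separator|>, every row (including the last)
    // terminated by <|tile_y_separator|>
    print_grid_layout({"", "", "<|tile_x_separator|>", "<|tile_y_separator|>", true}, 2, 2);
    return 0;
}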