Commit e6416b0
final touch UX
1 parent 111c820 commit e6416b0

5 files changed: +54 -28 lines changed

convert_hf_to_gguf.py

Lines changed: 1 addition & 0 deletions

@@ -309,6 +309,7 @@ def prepare_tensors(self):
                         gguf.MODEL_TENSOR.POSNET_NORM1,
                         gguf.MODEL_TENSOR.POSNET_NORM2,
                         gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
+                        gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
                     )
                 )
                 or not new_name.endswith(".weight")

The new audio position-embedding tensor joins `V_ENC_EMBD_POS` in this tuple, so it gets the same special handling during conversion as its vision counterpart (by all appearances, being kept at full precision rather than quantized).

docs/multimodal.md

Lines changed: 12 additions & 0 deletions

@@ -4,6 +4,8 @@ llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools
 - [llama-mtmd-cli](../tools/mtmd/README.md)
 - [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API
 
+Currently, we support **image** and **audio** input. Audio is highly experimental and may have reduced quality.
+
 To enable it, can use use one of the 2 methods below:
 
 - Use `-hf` option with a supported model (see a list of pre-quantized model below)
@@ -37,6 +39,8 @@ Replaces the `(tool_name)` with the name of binary you want to use. For example,
 
 NOTE: some models may require large context window, for example: `-c 8192`
 
+**Vision models**:
+
 ```sh
 # Gemma 3
 (tool_name) -hf ggml-org/gemma-3-4b-it-GGUF
@@ -78,3 +82,11 @@ NOTE: some models may require large context window, for example: `-c 8192`
 # Llama 4 Scout
 (tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF
 ```
+
+**Audio models**:
+
+```sh
+# Ultravox 0.5
+(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF
+(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF
+```
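
Combined with the `--audio` flag added to `mtmd-cli` below, an end-to-end invocation might look like the following (a sketch only; `sample.mp3` and the prompt are placeholders):

```sh
# fetch an audio-capable model from HF and describe a local clip
llama-mtmd-cli -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF \
    --audio sample.mp3 \
    -p "Describe what you hear in this clip."
```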

tools/mtmd/clip.cpp

Lines changed: 0 additions & 4 deletions

@@ -2199,10 +2199,6 @@ struct clip_model_loader {
         LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str());
         LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
         LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
-
-        if (ctx_clip.proj_type == PROJECTOR_TYPE_LLAMA4) {
-            LOG_WRN("%s: llama 4 vision is known to have degraded quality: https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
-        }
     }
 }

The warning is not lost: it is re-emitted from the `mtmd_context` constructor in `tools/mtmd/mtmd.cpp` below, alongside a new warning for experimental audio input.

tools/mtmd/mtmd-cli.cpp

Lines changed: 17 additions & 10 deletions

@@ -37,10 +37,10 @@ static volatile bool g_is_interrupted = false;
 static void show_additional_info(int /*argc*/, char ** argv) {
     LOG(
         "Experimental CLI for multimodal\n\n"
-        "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
+        "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> --audio <audio> -p <prompt>\n\n"
         "  -m and --mmproj are required\n"
         "  -hf user/repo can replace both -m and --mmproj in most cases\n"
-        "  --image and -p are optional, if NOT provided, the CLI will run in chat mode\n"
+        "  --image, --audio and -p are optional, if NOT provided, the CLI will run in chat mode\n"
         "  to disable using GPU for mmproj model, add --no-mmproj-offload\n",
         argv[0]
     );
@@ -142,7 +142,7 @@ struct mtmd_cli_context {
         );
     }
 
-    bool load_image(const std::string & fname) {
+    bool load_media(const std::string & fname) {
         mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
         if (!bmp.ptr) {
             return false;
@@ -290,7 +290,7 @@ int main(int argc, char ** argv) {
         msg.role = "user";
         msg.content = params.prompt;
         for (const auto & image : params.image) {
-            if (!ctx.load_image(image)) {
+            if (!ctx.load_media(image)) {
                 return 1; // error is already printed by libmtmd
             }
         }
@@ -303,7 +303,12 @@ int main(int argc, char ** argv) {
 
     } else {
         LOG("\n Running in chat mode, available commands:");
-        LOG("\n   /image <path>    load an image");
+        if (mtmd_support_vision(ctx.ctx_vision.get())) {
+            LOG("\n   /image <path>    load an image");
+        }
+        if (mtmd_support_audio(ctx.ctx_vision.get())) {
+            LOG("\n   /audio <path>    load an audio");
+        }
         LOG("\n   /clear           clear the chat history");
         LOG("\n   /quit or /exit   exit the program");
         LOG("\n");
@@ -333,14 +338,16 @@ int main(int argc, char ** argv) {
             continue;
         }
         g_is_generating = true;
-        if (line == "/image" || line.find("/image ") == 0) {
+        bool is_image = line == "/image" || line.find("/image ") == 0;
+        bool is_audio = line == "/audio" || line.find("/audio ") == 0;
+        if (is_image || is_audio) {
             if (line.size() < 8) {
-                LOG_ERR("ERR: Missing image filename\n");
+                LOG_ERR("ERR: Missing media filename\n");
                 continue;
             }
-            std::string image = line.substr(7);
-            if (ctx.load_image(image)) {
-                LOG("Image %s loaded\n", image.c_str());
+            std::string media_path = line.substr(7);
+            if (ctx.load_media(media_path)) {
+                LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
                 content += MTMD_DEFAULT_MEDIA_MARKER;
             }
             // else, error is already printed by libmtmd
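
Two details worth noting in the chat-mode change: `line.substr(7)` can serve both commands only because `/image ` and `/audio ` are the same length (7 characters, guarded by the `line.size() < 8` check), and the success message prints the path before the media type. An illustrative chat-mode exchange (file names are placeholders, not captured output):

```sh
> /image photo.jpg
photo.jpg image loaded
> /audio clip.wav
clip.wav audio loaded
```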

tools/mtmd/mtmd.cpp

Lines changed: 24 additions & 14 deletions

@@ -108,9 +108,9 @@ struct mtmd_context {
     llama_token tok_ov_img_end = LLAMA_TOKEN_NULL; // overview image
     llama_token tok_slices_start = LLAMA_TOKEN_NULL; // start of all slices
     llama_token tok_slices_end = LLAMA_TOKEN_NULL; // end of all slices
-    llama_token tok_sli_bm_start = LLAMA_TOKEN_NULL; // single slice start
-    llama_token tok_sli_bm_end = LLAMA_TOKEN_NULL; // single slice end
-    llama_token tok_sli_bm_mid = LLAMA_TOKEN_NULL; // between 2 slices
+    llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice start
+    llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice end
+    llama_token tok_sli_img_mid = LLAMA_TOKEN_NULL; // between 2 slices
     llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row
     bool tok_row_end_trail = false;
     bool ov_img_first = false;
@@ -156,8 +156,8 @@ struct mtmd_context {
         tok_ov_img_end = lookup_token("</image>");
         tok_slices_start = lookup_token("<slice>");
         tok_slices_end = lookup_token("</slice>");
-        tok_sli_bm_start = tok_ov_img_start;
-        tok_sli_bm_end = tok_ov_img_end;
+        tok_sli_img_start = tok_ov_img_start;
+        tok_sli_img_end = tok_ov_img_end;
         tok_row_end = lookup_token("\n");
         tok_row_end_trail = false; // no trailing end-of-row token
         ov_img_first = true;
@@ -168,8 +168,8 @@ struct mtmd_context {
         slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
         tok_ov_img_start = lookup_token("<image>");
         tok_ov_img_end = lookup_token("</image>");
-        tok_sli_bm_start = lookup_token("<slice>");
-        tok_sli_bm_end = lookup_token("</slice>");
+        tok_sli_img_start = lookup_token("<slice>");
+        tok_sli_img_end = lookup_token("</slice>");
         tok_row_end = lookup_token("\n");
         tok_row_end_trail = false; // no trailing end-of-row token
         ov_img_first = true;
@@ -186,7 +186,7 @@ struct mtmd_context {
         // <|image_end|>
         slice_tmpl = MTMD_SLICE_TMPL_LLAMA4;
         tok_ov_img_start = lookup_token("<|image|>");
-        tok_sli_bm_mid = lookup_token("<|tile_x_separator|>");
+        tok_sli_img_mid = lookup_token("<|tile_x_separator|>");
         tok_row_end = lookup_token("<|tile_y_separator|>");
         tok_row_end_trail = true; // add trailing end-of-row token
         ov_img_first = false; // overview image is last
@@ -196,6 +196,16 @@ struct mtmd_context {
         // TODO @ngxson : check if model n_mel is 128 or 80
         w_filters = whisper_precalc_filters::get_128_bins();
     }
+
+    // warning messages
+    if (proj == PROJECTOR_TYPE_LLAMA4) {
+        LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
+                "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
+    }
+    if (has_audio) {
+        LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
+                "    https://github.com/ggml-org/llama.cpp/pull/13623\n", __func__);
+    }
 }
 
 ~mtmd_context() {
@@ -441,15 +451,15 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
         for (int y = 0; y < n_row; y++) {
             for (int x = 0; x < n_col; x++) {
                 const bool is_last_in_row = (x == n_col - 1);
-                if (ctx->tok_sli_bm_start != LLAMA_TOKEN_NULL) {
-                    add_text_chunk({ctx->tok_sli_bm_start});
+                if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
+                    add_text_chunk({ctx->tok_sli_img_start});
                 }
                 output->entries.emplace_back(std::move(chunks[y * n_col + x]));
-                if (ctx->tok_sli_bm_end != LLAMA_TOKEN_NULL) {
-                    add_text_chunk({ctx->tok_sli_bm_end});
+                if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
+                    add_text_chunk({ctx->tok_sli_img_end});
                 }
-                if (!is_last_in_row && ctx->tok_sli_bm_mid != LLAMA_TOKEN_NULL) {
-                    add_text_chunk({ctx->tok_sli_bm_mid});
+                if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
+                    add_text_chunk({ctx->tok_sli_img_mid});
                 }
             }
             if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
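
For orientation, the renamed `tok_sli_img_*` tokens control how per-slice image chunks are framed in the token stream. With the MiniCPM-V 2.6 tokens configured above (overview image first, `\n` between rows, no trailing end-of-row token), a 2x2 slice grid would be emitted roughly as below; this is a sketch inferred from the loop, not captured output, and the `tok_slices_start`/`tok_slices_end` wrappers, when set, would bracket the whole grid:

```
<image> OVERVIEW </image>
<slice> S(0,0) </slice> <slice> S(0,1) </slice> \n
<slice> S(1,0) </slice> <slice> S(1,1) </slice>
```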
