Skip to content

Commit 89b0bf0

Browse files
authored
llava : use logger in llava-cli (#6797)
This change removes printf() logging so llava-cli is shell-scriptable.
1 parent b97bc39 commit 89b0bf0

File tree

3 files changed

+111
-96
lines changed

3 files changed

+111
-96
lines changed

examples/llava/clip.cpp

Lines changed: 61 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// I'll gradually clean and extend it
44
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
55
#include "clip.h"
6+
#include "log.h"
67
#include "ggml.h"
78
#include "ggml-alloc.h"
89
#include "ggml-backend.h"
@@ -23,7 +24,6 @@
2324
#include <cstdlib>
2425
#include <cstring>
2526
#include <fstream>
26-
#include <iostream>
2727
#include <map>
2828
#include <regex>
2929
#include <stdexcept>
@@ -145,7 +145,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
145145
static int get_key_idx(const gguf_context * ctx, const char * key) {
146146
int i = gguf_find_key(ctx, key);
147147
if (i == -1) {
148-
fprintf(stderr, "key %s not found in file\n", key);
148+
LOG_TEE("key %s not found in file\n", key);
149149
throw std::runtime_error(format("Missing required key: %s", key));
150150
}
151151

@@ -247,7 +247,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
247247

248248
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
249249
size_t tensor_size = ggml_nbytes(tensor);
250-
printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
250+
LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
251251
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
252252
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
253253
}
@@ -265,7 +265,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
265265
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
266266
std::ofstream file(filename, std::ios::binary);
267267
if (!file.is_open()) {
268-
std::cerr << "Failed to open file for writing: " << filename << std::endl;
268+
LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
269269
return;
270270
}
271271

@@ -284,7 +284,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
284284
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
285285
std::ofstream file(filename, std::ios::binary);
286286
if (!file.is_open()) {
287-
std::cerr << "Failed to open file for writing: " << filename << std::endl;
287+
LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
288288
return;
289289
}
290290

@@ -515,7 +515,7 @@ struct clip_ctx {
515515

516516
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
517517
if (!ctx->has_vision_encoder) {
518-
printf("This gguf file seems to have no vision encoder\n");
518+
LOG_TEE("This gguf file seems to have no vision encoder\n");
519519
return nullptr;
520520
}
521521

@@ -879,21 +879,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
879879
const int idx_name = gguf_find_key(ctx, KEY_NAME);
880880
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
881881
const std::string name = gguf_get_val_str(ctx, idx_name);
882-
printf("%s: model name: %s\n", __func__, name.c_str());
882+
LOG_TEE("%s: model name: %s\n", __func__, name.c_str());
883883
}
884-
printf("%s: description: %s\n", __func__, description.c_str());
885-
printf("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
886-
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
887-
printf("%s: n_tensors: %d\n", __func__, n_tensors);
888-
printf("%s: n_kv: %d\n", __func__, n_kv);
889-
printf("%s: ftype: %s\n", __func__, ftype_str.c_str());
890-
printf("\n");
884+
LOG_TEE("%s: description: %s\n", __func__, description.c_str());
885+
LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
886+
LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
887+
LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors);
888+
LOG_TEE("%s: n_kv: %d\n", __func__, n_kv);
889+
LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str());
890+
LOG_TEE("\n");
891891
}
892892
const int n_tensors = gguf_get_n_tensors(ctx);
893893

894894
// kv
895895
const int n_kv = gguf_get_n_kv(ctx);
896-
printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
896+
LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
897897
__func__, n_kv, n_tensors, fname);
898898
{
899899
std::map<enum ggml_type, uint32_t> n_type;
@@ -904,7 +904,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
904904
n_type[type]++;
905905
}
906906

907-
printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
907+
LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
908908
for (int i = 0; i < n_kv; i++) {
909909
const char * name = gguf_get_key(ctx, i);
910910
const enum gguf_type type = gguf_get_kv_type(ctx, i);
@@ -920,7 +920,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
920920
}
921921
replace_all(value, "\n", "\\n");
922922

923-
printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
923+
LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
924924
}
925925

926926
// print type counts
@@ -929,7 +929,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
929929
continue;
930930
}
931931

932-
printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
932+
LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
933933
}
934934
}
935935

@@ -944,7 +944,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
944944
size_t tensor_size = ggml_nbytes(cur);
945945
model_size += tensor_size;
946946
if (verbosity >= 3) {
947-
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
947+
LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
948948
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
949949
}
950950
}
@@ -971,18 +971,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
971971

972972
#ifdef GGML_USE_CUDA
973973
new_clip->backend = ggml_backend_cuda_init(0);
974-
printf("%s: CLIP using CUDA backend\n", __func__);
974+
LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
975975
#endif
976976

977977
#ifdef GGML_USE_METAL
978978
new_clip->backend = ggml_backend_metal_init();
979-
printf("%s: CLIP using Metal backend\n", __func__);
979+
LOG_TEE("%s: CLIP using Metal backend\n", __func__);
980980
#endif
981981

982982

983983
if (!new_clip->backend) {
984984
new_clip->backend = ggml_backend_cpu_init();
985-
printf("%s: CLIP using CPU backend\n", __func__);
985+
LOG_TEE("%s: CLIP using CPU backend\n", __func__);
986986
}
987987

988988
// model size and capabilities
@@ -1006,15 +1006,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
10061006
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
10071007

10081008
if (verbosity >= 1) {
1009-
printf("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
1010-
printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
1011-
printf("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
1012-
printf("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
1013-
printf("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
1009+
LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
1010+
LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
1011+
LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
1012+
LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
1013+
LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
10141014
}
10151015
}
10161016

1017-
printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
1017+
LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
10181018

10191019
// load tensors
10201020
{
@@ -1027,15 +1027,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
10271027

10281028
new_clip->ctx_data = ggml_init(params);
10291029
if (!new_clip->ctx_data) {
1030-
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
1030+
LOG_TEE("%s: ggml_init() failed\n", __func__);
10311031
clip_free(new_clip);
10321032
gguf_free(ctx);
10331033
return nullptr;
10341034
}
10351035

10361036
auto fin = std::ifstream(fname, std::ios::binary);
10371037
if (!fin) {
1038-
printf("cannot open model file for loading tensors\n");
1038+
LOG_TEE("cannot open model file for loading tensors\n");
10391039
clip_free(new_clip);
10401040
gguf_free(ctx);
10411041
return nullptr;
@@ -1057,7 +1057,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
10571057
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
10581058
fin.seekg(offset, std::ios::beg);
10591059
if (!fin) {
1060-
printf("%s: failed to seek for tensor %s\n", __func__, name);
1060+
LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
10611061
clip_free(new_clip);
10621062
gguf_free(ctx);
10631063
return nullptr;
@@ -1128,23 +1128,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
11281128
}
11291129

11301130
if (verbosity >= 2) {
1131-
printf("\n%s: vision model hparams\n", __func__);
1132-
printf("image_size %d\n", hparams.image_size);
1133-
printf("patch_size %d\n", hparams.patch_size);
1134-
printf("v_hidden_size %d\n", hparams.hidden_size);
1135-
printf("v_n_intermediate %d\n", hparams.n_intermediate);
1136-
printf("v_projection_dim %d\n", hparams.projection_dim);
1137-
printf("v_n_head %d\n", hparams.n_head);
1138-
printf("v_n_layer %d\n", hparams.n_layer);
1139-
printf("v_eps %f\n", hparams.eps);
1140-
printf("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
1141-
printf("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
1142-
printf("v_image_grid_pinpoints: ");
1131+
LOG_TEE("\n%s: vision model hparams\n", __func__);
1132+
LOG_TEE("image_size %d\n", hparams.image_size);
1133+
LOG_TEE("patch_size %d\n", hparams.patch_size);
1134+
LOG_TEE("v_hidden_size %d\n", hparams.hidden_size);
1135+
LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate);
1136+
LOG_TEE("v_projection_dim %d\n", hparams.projection_dim);
1137+
LOG_TEE("v_n_head %d\n", hparams.n_head);
1138+
LOG_TEE("v_n_layer %d\n", hparams.n_layer);
1139+
LOG_TEE("v_eps %f\n", hparams.eps);
1140+
LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
1141+
LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
1142+
LOG_TEE("v_image_grid_pinpoints: ");
11431143
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
1144-
printf("%d ", hparams.image_grid_pinpoints[i]);
1144+
LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
11451145
}
1146-
printf("\n");
1147-
printf("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
1146+
LOG_TEE("\n");
1147+
LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
11481148

11491149
}
11501150

@@ -1155,7 +1155,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
11551155
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
11561156
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
11571157
} catch(const std::exception& e) {
1158-
fprintf(stderr, "%s: failed to load vision model tensors\n", __func__);
1158+
LOG_TEE("%s: failed to load vision model tensors\n", __func__);
11591159
}
11601160

11611161
// LLaVA projection
@@ -1184,7 +1184,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
11841184
} catch (std::runtime_error & e) { }
11851185
try {
11861186
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
1187-
// fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__);
1187+
// LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
11881188
} catch (std::runtime_error & e) { }
11891189
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
11901190
// MobileVLM projection
@@ -1264,7 +1264,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
12641264
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
12651265
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
12661266
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
1267-
printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
1267+
LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
12681268
}
12691269

12701270
return new_clip;
@@ -1304,7 +1304,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
13041304
int nx, ny, nc;
13051305
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
13061306
if (!data) {
1307-
fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
1307+
LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
13081308
return false;
13091309
}
13101310
build_clip_img_from_data(data, nx, ny, img);
@@ -1316,7 +1316,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
13161316
int nx, ny, nc;
13171317
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
13181318
if (!data) {
1319-
fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
1319+
LOG_TEE("%s: failed to decode image bytes\n", __func__);
13201320
return false;
13211321
}
13221322
build_clip_img_from_data(data, nx, ny, img);
@@ -1506,7 +1506,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
15061506
int downscaled_height = static_cast<int>(original_height * scale);
15071507
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
15081508
int wasted_resolution = (width * height) - effective_resolution;
1509-
// fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
1509+
// LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
15101510
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
15111511
max_effective_resolution = effective_resolution;
15121512
min_wasted_resolution = wasted_resolution;
@@ -1545,7 +1545,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
15451545
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
15461546
bool pad_to_square = true;
15471547
if (!ctx->has_vision_encoder) {
1548-
printf("This gguf file seems to have no vision encoder\n");
1548+
LOG_TEE("This gguf file seems to have no vision encoder\n");
15491549
return false;
15501550
}
15511551
auto & params = ctx->vision_model.hparams;
@@ -1622,7 +1622,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
16221622
}
16231623

16241624
for (size_t i = 0; i < patches.size(); i++) {
1625-
// printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
1625+
// LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
16261626
clip_image_u8_free(patches[i]);
16271627
}
16281628

@@ -1765,7 +1765,7 @@ int clip_n_patches(const struct clip_ctx * ctx) {
17651765

17661766
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
17671767
if (!ctx->has_vision_encoder) {
1768-
printf("This gguf file seems to have no vision encoder\n");
1768+
LOG_TEE("This gguf file seems to have no vision encoder\n");
17691769
return false;
17701770
}
17711771

@@ -1777,7 +1777,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
17771777

17781778
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
17791779
if (!ctx->has_vision_encoder) {
1780-
printf("This gguf file seems to have no vision encoder\n");
1780+
LOG_TEE("This gguf file seems to have no vision encoder\n");
17811781
return false;
17821782
}
17831783

@@ -1939,7 +1939,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
19391939
new_type = type;
19401940
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
19411941
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
1942-
// fprintf(stderr, "%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
1942+
// LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
19431943
}
19441944
const size_t n_elms = ggml_nelements(cur);
19451945
float * f32_data;
@@ -1958,7 +1958,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
19581958
f32_data = (float *)conv_buf.data();
19591959
break;
19601960
default:
1961-
printf("Please use an input file in f32 or f16\n");
1961+
LOG_TEE("Please use an input file in f32 or f16\n");
19621962
gguf_free(ctx_out);
19631963
return false;
19641964
}
@@ -1985,7 +1985,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
19851985
fout.put(0);
19861986
}
19871987

1988-
printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
1988+
LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
19891989
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
19901990
}
19911991

@@ -2001,8 +2001,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
20012001
gguf_free(ctx_out);
20022002

20032003
{
2004-
printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
2005-
printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
2004+
LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
2005+
LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
20062006
}
20072007

20082008
return true;

0 commit comments

Comments
 (0)