Commit f5feac8

llama : fix data units
ggml-ci
1 parent: 8da4627

File tree

3 files changed: +31 -31 lines changed
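All three diffs below make the same change: byte counts that were divided by 1024*1024 (binary mebibytes) are now divided by 1e6 (decimal megabytes), so the printed numbers match the "MB"/"GB" labels in the log messages. As a quick illustration of how far the two conventions diverge, here is a minimal standalone C++ sketch (not part of the commit; the buffer size is made up):

#include <cstdio>
#include <cstdint>

int main() {
    const uint64_t size = 512ull * 1024 * 1024; // a 512 MiB buffer (536,870,912 bytes)

    // pre-commit convention: binary divisor, printed with an "MB" label
    printf("binary : %.2f MB\n", size / 1024.0 / 1024.0); // 512.00
    // post-commit convention: decimal divisor, consistent with the "MB" label
    printf("decimal: %.2f MB\n", size / 1e6);             // 536.87
    return 0;
}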

ggml-cuda.cu

Lines changed: 2 additions & 2 deletions
@@ -5841,7 +5841,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     }
 #ifdef DEBUG_CUDA_MALLOC
     fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
-            (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
+            (uint32_t)(max_size/1e6), (uint32_t)(tot_size/1e6), (uint32_t)(size/1e6));
 #endif
     void * ptr;
     size_t look_ahead_size = (size_t) (1.05 * size);
@@ -5979,7 +5979,7 @@ void * ggml_cuda_host_malloc(size_t size) {
         // This can fixed the OOM error in WSL.
         cudaGetLastError();
         fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
-            size/1024.0/1024.0, cudaGetErrorString(err));
+            size/1e6, cudaGetErrorString(err));
         return nullptr;
     }

ggml-metal.m

Lines changed: 9 additions & 9 deletions
@@ -346,9 +346,9 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
     }

     GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
-    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
     if (ctx->device.maxTransferRate != 0) {
-        GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
     } else {
         GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
     }
@@ -541,11 +541,11 @@ bool ggml_metal_add_buffer(
         ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];

         if (ctx->buffers[ctx->n_buffers].metal == nil) {
-            GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
+            GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1e6);
             return false;
         }

-        GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1e6);

         ++ctx->n_buffers;
     } else {
@@ -565,11 +565,11 @@ bool ggml_metal_add_buffer(
             ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];

             if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
+                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1e6);
                 return false;
             }

-            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
+            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1e6, i);
             if (i + size_step < size) {
                 GGML_METAL_LOG_INFO("\n");
             }
@@ -580,16 +580,16 @@ bool ggml_metal_add_buffer(

 #if TARGET_OS_OSX
         GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
-            ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
-            ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+            ctx->device.currentAllocatedSize / 1e6,
+            ctx->device.recommendedMaxWorkingSetSize / 1e6);

         if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
             GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
         } else {
             GGML_METAL_LOG_INFO("\n");
         }
 #else
-        GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1e6);
 #endif
     }
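The ggml-metal.m hunks also touch the allocation summary printed on macOS: the current allocated size and the recommended max working set size are now both reported in decimal MB, with a warning when the former exceeds the latter. Below is a rough C++ analogue of that check (the Metal device properties are replaced by hypothetical byte counts; the real code uses Objective-C properties and the GGML_METAL_LOG_* macros):

#include <cstdio>
#include <cstdint>

// Stand-ins for ctx->device.currentAllocatedSize and
// ctx->device.recommendedMaxWorkingSetSize (both are byte counts).
static void report_allocation(uint64_t current_allocated, uint64_t recommended_max) {
    // Decimal MB, matching the post-commit divisor of 1e6.
    printf(", (%8.2f / %8.2f)\n", current_allocated / 1e6, recommended_max / 1e6);

    if (current_allocated > recommended_max) {
        printf("warning: current allocated size is greater than the recommended max working set size\n");
    }
}

int main() {
    // Hypothetical numbers: ~8.5 GB allocated against an ~11 GB recommended limit.
    report_allocation(8500000000ull, 11000000000ull);
    return 0;
}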

llama.cpp

Lines changed: 20 additions & 20 deletions
@@ -1083,9 +1083,9 @@ enum e_model {
     MODEL_70B,
 };

-static const size_t kB = 1024;
-static const size_t MB = 1024*kB;
-static const size_t GB = 1024*MB;
+static const size_t kB = 1000;
+static const size_t MB = 1000*kB;
+static const size_t GB = 1000*MB;

 struct llama_hparams {
     bool vocab_only;
@@ -1481,7 +1481,7 @@ static bool llama_kv_cache_init(
             vram_kv_cache += ggml_nbytes(cache.k);
         }
         if (vram_kv_cache > 0) {
-            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1e6);
         }
     }
 #endif
@@ -2520,9 +2520,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
     if (ml.n_bytes < GB) {
-        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+        LLAMA_LOG_INFO("%s: model size = %.2f MB (%.2f BPW) \n", __func__, ml.n_bytes/1e6, ml.n_bytes*8.0/ml.n_elements);
     } else {
-        LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+        LLAMA_LOG_INFO("%s: model size = %.2f GB (%.2f BPW) \n", __func__, ml.n_bytes/1e9, ml.n_bytes*8.0/ml.n_elements);
     }

     // general kv
@@ -2558,7 +2558,7 @@ static void llm_load_tensors(

     ml.calc_sizes(ctx_size, mmapped_size);

-    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1e6);

     // create the ggml context
     {
@@ -3207,7 +3207,7 @@ static void llm_load_tensors(
             ctx_size +
             mmapped_size - vram_weights; // weights in VRAM not in memory

-        LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1e6);

 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3226,7 +3226,7 @@ static void llm_load_tensors(
 #endif // GGML_USE_CUBLAS

         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1e6);
 #else
         (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -7878,7 +7878,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_type = tensor->type;
             new_data = tensor->data;
             new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1e6);
         } else {
             const size_t nelements = ggml_nelements(tensor);

@@ -7938,7 +7938,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 workers.clear();
             }

-            LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1e6, new_size/1e6);
             int64_t tot_count = 0;
             for (size_t i = 0; i < hist_cur.size(); i++) {
                 hist_all[i] += hist_cur[i];
@@ -7976,8 +7976,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

     gguf_free(ctx_out);

-    LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
-    LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1e6);
+    LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1e6);

     // print histogram for all tensors
     {
@@ -8478,7 +8478,7 @@ struct llama_context * llama_new_context_with_model(

         {
             const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
-            LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1e6);
         }

         // resized during inference
@@ -8523,7 +8523,7 @@ struct llama_context * llama_new_context_with_model(
             // measure memory requirements for the graph
             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;

-            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1e6);

             // recreate allocator with exact memory requirements
             ggml_allocr_free(ctx->alloc);
@@ -8537,7 +8537,7 @@ struct llama_context * llama_new_context_with_model(
 #endif
 #ifdef GGML_USE_CUBLAS
             ggml_cuda_set_scratch_size(alloc_size);
-            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1e6);

             // calculate total VRAM usage
             auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8558,9 +8558,9 @@ struct llama_context * llama_new_context_with_model(
             size_t total_vram_size = model_vram_size + ctx_vram_size;

             LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
-                total_vram_size / 1024.0 / 1024.0,
-                model_vram_size / 1024.0 / 1024.0,
-                ctx_vram_size / 1024.0 / 1024.0);
+                total_vram_size / 1e6,
+                model_vram_size / 1e6,
+                ctx_vram_size / 1e6);
 #endif
         }

@@ -8581,7 +8581,7 @@ struct llama_context * llama_new_context_with_model(

         const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

-        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1e6);

 #define LLAMA_METAL_CHECK_BUF(result) \
         if (!(result)) { \
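In llama.cpp the kB/MB/GB constants themselves become decimal (SI) multipliers, and llm_load_print_meta switches between an MB and a GB figure at the GB threshold. A small self-contained C++ sketch of that selection logic, mirroring (not copying) the post-commit code:

#include <cstdio>
#include <cstddef>

// Decimal unit constants, as defined in llama.cpp after this commit.
static const size_t kB = 1000;
static const size_t MB = 1000*kB;
static const size_t GB = 1000*MB;

// Print a byte count the way llm_load_print_meta does after the change:
// MB below one GB, GB otherwise (decimal divisors in both branches).
static void print_model_size(size_t n_bytes) {
    if (n_bytes < GB) {
        printf("model size = %.2f MB\n", n_bytes/1e6);
    } else {
        printf("model size = %.2f GB\n", n_bytes/1e9);
    }
}

int main() {
    print_model_size(750*MB); // 750.00 MB
    print_model_size(3*GB);   // 3.00 GB
    return 0;
}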
