Skip to content

Commit b5103f4

Browse files
Nexesenex and ikawrakow
committed
Better model info (ikawrakow#84)
Co-Authored-By: Kawrakow <[email protected]>
1 parent b302561 commit b5103f4

File tree

2 files changed

+103
-13
lines changed

2 files changed

+103
-13
lines changed

examples/quantize-stats/quantize-stats.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <vector>
1818
#include <thread>
1919
#include <mutex>
20+
#include <array>
2021

2122
#if defined(_MSC_VER)
2223
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -222,6 +223,30 @@ static void test_roundtrip_on_layer(
222223
}
223224
}
224225

226+
static void print_fp_stats(const char * msg, const uint64_t * counts) {
227+
printf("===== %s\n", msg);
228+
uint64_t tot = 0; for (int i = 0; i < 32; ++i) tot += counts[i];
229+
double norm = 1./tot;
230+
for (int i = 0; i < 32; ++i) {
231+
if (!counts[i]) continue;
232+
uint16_t val = i << 10;
233+
float f = ggml_fp16_to_fp32(val);
234+
printf("%2d %f %g\n", i, norm*counts[i], f);
235+
}
236+
}
237+
static void analyze_tensor_fp(const ggml_tensor * t, uint64_t * H) {
238+
if (t->type != GGML_TYPE_F16) return;
239+
if (!ggml_is_contiguous(t)) return;
240+
int n = ggml_nelements(t);
241+
const uint16_t * x = (const uint16_t *)t->data;
242+
std::array<uint64_t, 32> counts = {};
243+
for (int j = 0; j < n; ++j) {
244+
++counts[(x[j] >> 10) & 31];
245+
}
246+
for (int i = 0; i < 32; ++i) H[i] += counts[i];
247+
print_fp_stats(t->name, counts.data());
248+
}
249+
225250
int main(int argc, char ** argv) {
226251
ggml_time_init();
227252

@@ -231,6 +256,7 @@ int main(int argc, char ** argv) {
231256

232257
int max_thread = 0;
233258
bool invalid_param = false;
259+
bool analyze_fp = false;
234260
std::string arg;
235261
for (int i = 1; i < argc; i++) {
236262
arg = argv[i];
@@ -244,6 +270,8 @@ int main(int argc, char ** argv) {
244270
params.verbose = true;
245271
} else if (arg == "-p" || arg == "--per-layer-stats") {
246272
params.per_layer_stats = true;
273+
} else if (arg == "-afp" || arg == "--analyze-fp") {
274+
analyze_fp = true;
247275
} else if (arg == "--histogram") {
248276
params.print_histogram = true;
249277
} else if (arg == "-m" || arg == "--model") {
@@ -365,6 +393,22 @@ int main(int argc, char ** argv) {
365393
std::vector<char> quantized_scratch;
366394
std::vector<float> output_scratch;
367395

396+
if (analyze_fp) {
397+
for (const auto& kv_tensor : tensors) {
398+
if (!layer_included(params, kv_tensor.first)) {
399+
continue;
400+
}
401+
if (kv_tensor.second->ne[0] == 1 || kv_tensor.second->ne[1] == 1) {
402+
// we never quantize those
403+
continue;
404+
}
405+
std::array<uint64_t, 32> H = {};
406+
analyze_tensor_fp(kv_tensor.second, H.data());
407+
print_fp_stats("Total", H.data());
408+
}
409+
return 0;
410+
}
411+
368412
// loop through quantization types
369413
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
370414
const ggml_type type = (ggml_type) i;

src/llama.cpp

Lines changed: 59 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6859,22 +6859,68 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
68596859
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
68606860
LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
68616861
if (ml.n_elements >= 1e12) {
6862-
LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
6862+
LLAMA_LOG_INFO("%s: model params = %.3f T\n", __func__, ml.n_elements*1e-12);
68636863
} else if (ml.n_elements >= 1e9) {
6864-
LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
6864+
LLAMA_LOG_INFO("%s: model params = %.3f B\n", __func__, ml.n_elements*1e-9);
68656865
} else if (ml.n_elements >= 1e6) {
6866-
LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6);
6866+
LLAMA_LOG_INFO("%s: model params = %.3f M\n", __func__, ml.n_elements*1e-6);
68676867
} else {
6868-
LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3);
6869-
}
6870-
6871-
LLAMA_LOG_INFO("%s: model size = %.2f Bytes (%.2f BPW) \n", __func__, ml.n_bytes/1.0, ml.n_bytes*8.0/ml.n_elements);
6872-
LLAMA_LOG_INFO("%s: model size = %.2f KB (%.2f BPW) \n", __func__, ml.n_bytes/1000.0, ml.n_bytes*8.0/ml.n_elements);
6873-
LLAMA_LOG_INFO("%s: model size = %.2f KiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0, ml.n_bytes*8.0/ml.n_elements);
6874-
LLAMA_LOG_INFO("%s: model size = %.2f MB (%.2f BPW) \n", __func__, ml.n_bytes/1000.0/1000.0 , ml.n_bytes*8.0/ml.n_elements);
6875-
LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
6876-
LLAMA_LOG_INFO("%s: model size = %.2f GB (%.2f BPW) \n", __func__, ml.n_bytes/1000.0/1000.0/1000.0, ml.n_bytes*8.0/ml.n_elements);
6877-
LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
6868+
LLAMA_LOG_INFO("%s: model params = %.3f K\n", __func__, ml.n_elements*1e-3);
6869+
// LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
6870+
// } else if (ml.n_elements >= 1e9) {
6871+
// LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
6872+
// } else if (ml.n_elements >= 1e6) {
6873+
// LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6);
6874+
// } else {
6875+
// LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3);
6876+
}
6877+
6878+
LLAMA_LOG_INFO("%s: model size = %.2f Bytes (%.3f BPW) \n", __func__, ml.n_bytes/1.0, ml.n_bytes*8.0/ml.n_elements);
6879+
LLAMA_LOG_INFO("%s: model size = %.2f KB (%.3f BPW) \n", __func__, ml.n_bytes/1000.0, ml.n_bytes*8.0/ml.n_elements);
6880+
LLAMA_LOG_INFO("%s: model size = %.2f KiB (%.3f BPW) \n", __func__, ml.n_bytes/1024.0, ml.n_bytes*8.0/ml.n_elements);
6881+
LLAMA_LOG_INFO("%s: model size = %.2f MB (%.3f BPW) \n", __func__, ml.n_bytes/1000.0/1000.0 , ml.n_bytes*8.0/ml.n_elements);
6882+
LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.3f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
6883+
LLAMA_LOG_INFO("%s: model size = %.2f GB (%.3f BPW) \n", __func__, ml.n_bytes/1000.0/1000.0/1000.0, ml.n_bytes*8.0/ml.n_elements);
6884+
LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.3f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
6885+
6886+
// if (ml.n_bytes < GiB) {
6887+
// LLAMA_LOG_INFO("%s: model size = %.3f MiB (%.3f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
6888+
// } else {
6889+
// LLAMA_LOG_INFO("%s: model size = %.3f GiB (%.3f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
6890+
// }
6891+
6892+
{
6893+
auto n_bytes = ml.n_bytes;
6894+
auto n_elements = ml.n_elements;
6895+
auto meta_tke = ml.get_tensor_meta("token_embd.weight");
6896+
auto meta_out = ml.get_tensor_meta("output.weight");
6897+
if (meta_tke && meta_out) {
6898+
n_bytes -= ggml_nbytes(meta_tke);
6899+
n_elements -= ggml_nelements(meta_tke);
6900+
n_bytes -= ggml_nbytes(meta_out);
6901+
n_elements -= ggml_nelements(meta_out);
6902+
6903+
LLAMA_LOG_INFO("%s: repeating layers = %.2f Bytes (%.3f BPW) \n", __func__, n_bytes/1.0, n_bytes*8.0/n_elements);
6904+
LLAMA_LOG_INFO("%s: repeating layers = %.2f KB (%.3f BPW) \n", __func__, n_bytes/1000.0, n_bytes*8.0/n_elements);
6905+
LLAMA_LOG_INFO("%s: repeating layers = %.2f KiB (%.3f BPW) \n", __func__, n_bytes/1024.0, n_bytes*8.0/n_elements);
6906+
LLAMA_LOG_INFO("%s: repeating layers = %.2f MB (%.3f BPW) \n", __func__, n_bytes/1000.0/1000.0, n_bytes*8.0/n_elements);
6907+
LLAMA_LOG_INFO("%s: repeating layers = %.2f MiB (%.3f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
6908+
LLAMA_LOG_INFO("%s: repeating layers = %.2f GB (%.3f BPW) \n", __func__, n_bytes/1000.0/1000.0/1000.0, n_bytes*8.0/n_elements);
6909+
LLAMA_LOG_INFO("%s: repeating layers = %.2f GiB (%.3f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
6910+
6911+
// if (n_bytes < GiB) {
6912+
// LLAMA_LOG_INFO("%s: repeating layers = %.3f MiB (%.3f BPW", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
6913+
// } else {
6914+
// LLAMA_LOG_INFO("%s: repeating layers = %.3f GiB (%.3f BPW", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
6915+
// }
6916+
6917+
if (ml.n_elements >= 1e9) {
6918+
LLAMA_LOG_INFO(", %.3f B parameters)\n", n_elements*1e-9);
6919+
} else {
6920+
LLAMA_LOG_INFO(", %.3f M parameters)\n", n_elements*1e-6);
6921+
}
6922+
}
6923+
}
68786924

68796925
// general kv
68806926
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

0 commit comments

Comments
 (0)