
Commit 42e136a

ggerganov and ngxson authored and committed
llama : llama_perf + option to disable timings during decode (ggml-org#9355)
* llama : llama_perf + option to disable timings during decode
  ggml-ci
* common : add llama_arg
* Update src/llama.cpp
  Co-authored-by: Xuan Son Nguyen <[email protected]>
* perf : separate functions in the API
  ggml-ci
* perf : safer pointer handling + naming update
  ggml-ci
* minor : better local var name
* perf : abort on invalid sampler pointer
  ggml-ci

---------

Co-authored-by: Xuan Son Nguyen <[email protected]>
1 parent: 3252b86 · commit: 42e136a

File tree: 23 files changed, +134 additions, -90 deletions
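
The diffs below replace the single type-tagged llama_perf_print / llama_perf_reset entry points with per-object functions. As a reference, here is a sketch of the new public surface inferred from the call sites in this commit; the exact prototypes and const qualifiers in llama.h may differ:

// Inferred from the call sites in this commit, not copied from llama.h.
void llama_perf_context_print(const struct llama_context * ctx);   // was llama_perf_print(ctx,  LLAMA_PERF_TYPE_CONTEXT)
void llama_perf_context_reset(struct llama_context * ctx);         // was llama_perf_reset(ctx,  LLAMA_PERF_TYPE_CONTEXT)
void llama_perf_sampler_print(const struct llama_sampler * chain); // was llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN)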

common/arg.cpp

Lines changed: 8 additions & 0 deletions
@@ -720,6 +720,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.prompt = value;
         }
     ));
+    add_opt(llama_arg(
+        {"--no-perf"},
+        format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+        [](gpt_params & params) {
+            params.no_perf = true;
+            params.sparams.no_perf = true;
+        }
+    ).set_env("LLAMA_ARG_NO_PERF"));
     add_opt(llama_arg(
         {"-f", "--file"}, "FNAME",
         "a file containing the prompt (default: none)",

common/common.cpp

Lines changed: 2 additions & 1 deletion
@@ -820,7 +820,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
-        llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_reset(lctx);
     }

     iparams.model = model;
@@ -916,6 +916,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn = params.flash_attn;
+    cparams.no_perf = params.no_perf;

     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
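
The renamed reset keeps the warm-up decode performed by llama_init_from_gpt_params out of the reported timings. A hedged sketch of the same pattern for user code that does its own warm-up (the warmup_and_reset helper and its batch argument are illustrative, not part of the library):

// Run a throwaway decode, then clear its effects so the perf counters start at zero.
static void warmup_and_reset(struct llama_context * lctx, struct llama_batch batch) {
    llama_decode(lctx, batch);      // warm-up decode (result intentionally ignored)
    llama_kv_cache_clear(lctx);     // drop the KV cache entries from the warm-up
    llama_synchronize(lctx);        // wait for any in-flight backend work
    llama_perf_context_reset(lctx); // timings now exclude the warm-up
}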

common/common.h

Lines changed: 2 additions & 0 deletions
@@ -124,6 +124,7 @@ struct gpt_sampler_params {
     float mirostat_eta = 0.10f; // learning rate
     bool penalize_nl = false; // consider newlines as a repeatable token
     bool ignore_eos = false;
+    bool no_perf = false; // disable performance metrics

     std::vector<enum gpt_sampler_type> samplers = {
         GPT_SAMPLER_TYPE_TOP_K,
@@ -246,6 +247,7 @@ struct gpt_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
+    bool no_perf = false; // disable performance metrics

     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool logits_all = false; // return logits for all tokens in the batch
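
These common-level flags are forwarded to libllama's own parameter structs (see the common.cpp and sampling.cpp hunks). A minimal sketch of what --no-perf amounts to for a program that configures libllama directly, assuming the no_perf fields introduced in this commit:

#include "llama.h"

// Sketch only: the same effect as passing --no-perf through the common arg parser.
int main() {
    llama_context_params       cparams = llama_context_default_params();
    llama_sampler_chain_params sparams = llama_sampler_chain_default_params();

    cparams.no_perf = true; // skip context-level (decode) timing collection
    sparams.no_perf = true; // skip sampler-chain timing collection

    // ... create the model, context and sampler chain with these params as usual ...
    return 0;
}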

common/sampling.cpp

Lines changed: 3 additions & 3 deletions
@@ -142,7 +142,7 @@ std::string gpt_sampler_params::print() const {
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

-    lparams.no_perf = false; // TODO: control via params
+    lparams.no_perf = params.no_perf;

     auto * result = new gpt_sampler {
         /* .params = */ params,
@@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
     // TODO: measure grammar performance

     if (gsmpl) {
-        llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+        llama_perf_sampler_print(gsmpl->chain);
     }
     if (ctx) {
-        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_print(ctx);
     }
 }
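
For callers holding the raw llama objects rather than the gpt_sampler wrapper, the equivalent of gpt_perf_print with the renamed API looks roughly like this (a sketch; per the commit message the sampler function now aborts on an invalid pointer, so only pass a real chain):

// Print sampler timings before context timings, as gpt_perf_print does.
static void print_perf(const struct llama_context * ctx, const struct llama_sampler * chain) {
    if (chain) {
        llama_perf_sampler_print(chain);
    }
    if (ctx) {
        llama_perf_context_print(ctx);
    }
}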

examples/batched-bench/batched-bench.cpp

Lines changed: 1 addition & 1 deletion
@@ -187,7 +187,7 @@ int main(int argc, char ** argv) {
     }

     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);

     llama_batch_free(batch);

examples/batched.swift/Sources/main.swift

Lines changed: 2 additions & 2 deletions
@@ -200,8 +200,8 @@ let t_main_end = ggml_time_us()

 print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")

-llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT)
-llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN)
+llama_perf_sampler_print(smpl)
+llama_perf_context_print(context)

 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
     let utf8Count = text.utf8.count

examples/batched/batched.cpp

Lines changed: 2 additions & 2 deletions
@@ -229,8 +229,8 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

     LOG_TEE("\n");
-    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);

     fprintf(stderr, "\n");

examples/embedding/embedding.cpp

Lines changed: 1 addition & 1 deletion
@@ -306,7 +306,7 @@ int main(int argc, char ** argv) {
     }

     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);

     // clean up
     llama_batch_free(batch);

examples/eval-callback/eval-callback.cpp

Lines changed: 1 addition & 1 deletion
@@ -182,7 +182,7 @@ int main(int argc, char ** argv) {
     }

     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);

     llama_free(ctx);
     llama_free_model(model);

examples/imatrix/imatrix.cpp

Lines changed: 1 addition & 1 deletion
@@ -637,7 +637,7 @@ int main(int argc, char ** argv) {
     g_collector.save_imatrix();

     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_context_print(ctx);

     llama_free(ctx);
     llama_free_model(model);
