
Commit 059fcd3

examples : add idle tool for investigating GPU idle overhead
1 parent 4a75d19 commit 059fcd3

File tree

Makefile
examples/CMakeLists.txt
examples/idle/CMakeLists.txt
examples/idle/README.md
examples/idle/idle.cpp

5 files changed: 143 additions & 0 deletions


Makefile

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ BUILD_TARGETS = \
 	llama-gguf-hash \
 	llama-gguf-split \
 	llama-gritlm \
+	llama-idle \
 	llama-imatrix \
 	llama-infill \
 	llama-llava-cli \

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -30,6 +30,7 @@ else()
     add_subdirectory(gguf-split)
     add_subdirectory(gguf)
     add_subdirectory(gritlm)
+    add_subdirectory(idle)
     add_subdirectory(imatrix)
     add_subdirectory(infill)
     add_subdirectory(llama-bench)

examples/idle/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
set(TARGET llama-idle)
add_executable(${TARGET} idle.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/idle/README.md

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# llama.cpp/example/idle
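The README itself is only a title for now. A minimal build-and-run sketch, assuming the CMake target added above and the flags defined by print_usage in idle.cpp (model.gguf and the CUDA flag are placeholders for whatever GGUF model and GPU backend you actually use):

    # configure with a GPU backend and build just this example
    cmake -B build -DGGML_CUDA=ON
    cmake --build build --target llama-idle

    # decode a single token repeatedly, idling 0..2200 ms before each decode
    ./build/bin/llama-idle -m model.gguf -ngl 99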

examples/idle/idle.cpp

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
#include "llama.h"

#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <string>
#include <thread>
#include <vector>

static void print_usage(int /*argc*/, char ** argv) {
    printf("\nexample usage:\n");
    printf("\n    %s -m model.gguf [-ngl n_gpu_layers]\n", argv[0]);
    printf("\n");
}

int main(int argc, char ** argv) {
    // path to the model gguf file
    std::string model_path;

    // number of layers to offload to the GPU
    int ngl = 99;

    // parse command line arguments

    {
        int i = 1;
        for (; i < argc; i++) {
            if (strcmp(argv[i], "-m") == 0) {
                if (i + 1 < argc) {
                    model_path = argv[++i];
                } else {
                    print_usage(argc, argv);
                    return 1;
                }
            } else if (strcmp(argv[i], "-ngl") == 0) {
                if (i + 1 < argc) {
                    try {
                        ngl = std::stoi(argv[++i]);
                    } catch (...) {
                        print_usage(argc, argv);
                        return 1;
                    }
                } else {
                    print_usage(argc, argv);
                    return 1;
                }
            } else {
                // prompt starts here
                break;
            }
        }

        if (model_path.empty()) {
            print_usage(argc, argv);
            return 1;
        }
    }

    llama_model_params model_params = llama_model_default_params();
    model_params.n_gpu_layers = ngl;

    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
    if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }

    // we need just a dummy token to evaluate
    std::vector<llama_token> prompt_tokens(1, llama_token_bos(model));

    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.n_ctx   = 512;
    ctx_params.n_batch = 512;
    ctx_params.no_perf = false;

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
    if (ctx == NULL) {
        fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
        return 1;
    }

    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());

    const int n_iters = 10;

    // warm-up
    llama_decode(ctx, batch);
    llama_kv_cache_clear (ctx);
    llama_kv_cache_update(ctx);
    llama_synchronize    (ctx);

    for (int64_t t_pause_ms = 0; t_pause_ms <= 2200; t_pause_ms += 200) {
        double t_sum_us  = 0.0;
        double t_sum2_us = 0.0;

        for (int i = 0; i < n_iters; i++) {
            // this pause is important - it simulates "idle GPU"
            std::this_thread::sleep_for(std::chrono::milliseconds(t_pause_ms));

            const int64_t t_start_us = llama_time_us();

            // this should take constant time
            llama_decode(ctx, batch);
            llama_synchronize(ctx);

            const int64_t t_end_us = llama_time_us();

            const double t_cur_us = t_end_us - t_start_us;

#if 0
            // print individual decode times
            printf(" - decode time: %8.2f ms\n", t_cur_us / 1000);
#endif

            t_sum_us  += t_cur_us;
            t_sum2_us += t_cur_us * t_cur_us;

            llama_kv_cache_clear (ctx);
            llama_kv_cache_update(ctx);
            llama_synchronize    (ctx); // just in case
        }

        const double t_avg_us = t_sum_us / n_iters;
        const double t_dev_us = sqrt((t_sum2_us / (n_iters - 1)) - (t_avg_us * t_avg_us * n_iters) / (n_iters - 1));

        printf("iters: %4d, pause: %5d ms, avg decode time: %8.2f +/- %4.2f ms\n", n_iters, (int) t_pause_ms, t_avg_us / 1000, t_dev_us / 1000);
        fflush(stdout);
    }

    llama_free(ctx);
    llama_free_model(model);

    return 0;
}
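As a reading aid for the statistics above: t_avg_us and t_dev_us are the sample mean and the unbiased sample standard deviation of the n_iters decode times, computed from the two running sums via the usual identity

    \bar{t} = \frac{1}{n}\sum_{i=1}^{n} t_i,
    \qquad
    s = \sqrt{\frac{1}{n-1}\sum_{i=1}^{n}\bigl(t_i - \bar{t}\bigr)^2}
      = \sqrt{\frac{\sum_{i=1}^{n} t_i^2 - n\,\bar{t}^{\,2}}{n-1}}

with n = n_iters, \sum t_i = t_sum_us and \sum t_i^2 = t_sum2_us.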
