
Add "-e"/"--eval-threads" to distinguish thread counts for single-token eval and prompt eval #744


Draft: wants to merge 7 commits into master
14 changes: 14 additions & 0 deletions examples/common.cpp
@@ -67,6 +67,7 @@ int32_t get_num_physical_cores() {
}

bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
bool ethreads_set = false;
bool invalid_param = false;
std::string arg;
gpt_params default_params;
@@ -86,6 +87,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.n_threads = std::stoi(argv[i]);
} else if (arg == "-e" || arg == "--eval-threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_ethreads = std::stoi(argv[i]);
ethreads_set = true;
} else if (arg == "-p" || arg == "--prompt") {
if (++i >= argc) {
invalid_param = true;
@@ -303,6 +311,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
exit(1);
}
}

// if -e/--eval-threads was not given explicitly, fall back to the -t/--threads value
if (!ethreads_set) {
params.n_ethreads = params.n_threads;
}

if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, default_params);
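The defaulting above means -e/--eval-threads silently tracks -t/--threads unless set explicitly. A minimal sketch of the expected behavior, not part of the PR, assuming this PR's examples/common.h is on the include path:

#include "common.h" // gpt_params, gpt_params_parse
#include <cassert>

int main() {
    {
        gpt_params params;
        const char * args[] = { "main", "-t", "8" }; // no -e given
        gpt_params_parse(3, const_cast<char **>(args), params);
        assert(params.n_threads == 8 && params.n_ethreads == 8); // -e falls back to -t
    }
    {
        gpt_params params;
        const char * args[] = { "main", "-t", "8", "-e", "2" }; // explicit -e
        gpt_params_parse(5, const_cast<char **>(args), params);
        assert(params.n_threads == 8 && params.n_ethreads == 2); // explicit value kept
    }
    return 0;
}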
1 change: 1 addition & 0 deletions examples/common.h
@@ -18,6 +18,7 @@ int32_t get_num_physical_cores();
struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();  // number of threads for prompt eval
int32_t n_ethreads = get_num_physical_cores(); // number of threads for single-token eval
int32_t n_predict = -1; // new tokens to predict
int32_t n_parts = -1; // number of model parts (-1 = determine from model dimensions)
int32_t n_ctx = 512; // context size
2 changes: 1 addition & 1 deletion examples/embedding/embedding.cpp
@@ -85,7 +85,7 @@ int main(int argc, char ** argv) {

if (params.embedding){
if (embd_inp.size() > 0) {
if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads, params.n_ethreads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
6 changes: 3 additions & 3 deletions examples/main/main.cpp
@@ -143,12 +143,12 @@ int main(int argc, char ** argv) {
if (params.mem_test) {
{
const std::vector<llama_token> tmp(params.n_batch, 0);
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads, params.n_ethreads);
}

{
const std::vector<llama_token> tmp = { 0, };
llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads, params.n_ethreads);
}

llama_print_timings(ctx);
@@ -373,7 +373,7 @@ int main(int argc, char ** argv) {
if (n_eval > params.n_batch) {
n_eval = params.n_batch;
}
if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads, params.n_ethreads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
3 changes: 2 additions & 1 deletion examples/perplexity/perplexity.cpp
@@ -41,10 +41,11 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
std::vector<float> logits;
int num_batches = (params.n_ctx + params.n_batch - 1) / params.n_batch;
auto start_t = std::chrono::high_resolution_clock::now();

for (int j = 0; j < num_batches; ++j) {
int batch_start = start + j * params.n_batch;
int batch_size = std::min(end - batch_start, params.n_batch);
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * params.n_batch, params.n_threads)) {
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * params.n_batch, params.n_threads, params.n_ethreads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return;
}
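The loop above walks the context window in fixed-size chunks; num_batches is a ceiling division, so a partial final chunk is still evaluated. A standalone sketch with hypothetical values (n_ctx = 512, n_batch = 128), not part of the PR:

#include <algorithm>
#include <cstdio>

int main() {
    const int n_ctx = 512, n_batch = 128; // hypothetical values
    const int start = 0, end = start + n_ctx;
    const int num_batches = (n_ctx + n_batch - 1) / n_batch; // ceiling division -> 4
    for (int j = 0; j < num_batches; ++j) {
        const int batch_start = start + j * n_batch;
        const int batch_size  = std::min(end - batch_start, n_batch);
        // the real loop calls llama_eval(..., batch_size, j * n_batch, n_threads, n_ethreads) here
        printf("batch %d: tokens [%d, %d)\n", j, batch_start, batch_start + batch_size);
    }
    return 0;
}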
11 changes: 7 additions & 4 deletions llama.cpp
@@ -1042,13 +1042,15 @@ static bool llama_model_load(
// - tokens: new batch of tokens to process
// - n_past: the context size so far
// - n_threads: number of threads to use for prompt eval
// - n_ethreads: number of threads to use for single-token eval
//
static bool llama_eval_internal(
llama_context & lctx,
const llama_token * tokens,
const int n_tokens,
const int n_past,
const int n_threads) {
const int n_threads,
const int n_ethreads) {
const int64_t t_start_us = ggml_time_us();

const int N = n_tokens;
@@ -1081,7 +1083,7 @@ static bool llama_eval_internal(
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads spin-lock waiting for the BLAS calls and degrade performance
ggml_cgraph gf = {};
gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : (N == 1 ? n_ethreads : n_threads);

struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
memcpy(embd->data, tokens, N*ggml_element_size(embd));
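The updated gf.n_threads expression folds the new single-token case into the existing BLAS special case. The same logic written out as a helper, purely for illustration (hypothetical; the PR keeps the inline ternary):

// equivalent to the gf.n_threads expression above
static int pick_n_threads(int N, bool has_blas, bool has_gpublas,
                          int n_threads, int n_ethreads) {
    // big prompt + CPU BLAS: use one thread so the others don't spin-wait on BLAS
    if (N >= 32 && has_blas && !has_gpublas) {
        return 1;
    }
    // single-token eval uses -e/--eval-threads, prompt eval uses -t/--threads
    return N == 1 ? n_ethreads : n_threads;
}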
@@ -2650,8 +2652,9 @@ int llama_eval(
const llama_token * tokens,
int n_tokens,
int n_past,
int n_threads) {
if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
int n_threads,
int n_ethreads) {
if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, n_ethreads)) {
fprintf(stderr, "%s: failed to eval\n", __func__);
return 1;
}
3 changes: 2 additions & 1 deletion llama.h
@@ -152,7 +152,8 @@ extern "C" {
const llama_token * tokens,
int n_tokens,
int n_past,
int n_threads);
int n_threads,
int n_ethreads);

// Convert the provided text into tokens.
// The tokens pointer must be large enough to hold the resulting tokens.
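With this change both thread counts travel through every llama_eval call, and the library picks between them from n_tokens. A hypothetical caller under the new signature, not part of the PR (ctx and the token values are assumed to come from the usual setup and sampling code):

#include "llama.h"
#include <cstdio>
#include <vector>

// sketch: evaluate a prompt, then one generated token
static bool eval_once(llama_context * ctx, const std::vector<llama_token> & prompt,
                      llama_token next_tok, int n_threads, int n_ethreads) {
    // prompt eval: n_tokens > 1, so llama_eval_internal uses n_threads
    if (llama_eval(ctx, prompt.data(), (int) prompt.size(), 0, n_threads, n_ethreads)) {
        fprintf(stderr, "failed to eval prompt\n");
        return false;
    }
    // single-token eval: n_tokens == 1, so llama_eval_internal uses n_ethreads
    if (llama_eval(ctx, &next_tok, 1, (int) prompt.size(), n_threads, n_ethreads)) {
        fprintf(stderr, "failed to eval token\n");
        return false;
    }
    return true;
}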