From 9ea43d4d9124d6a05ba1027dd05d65c5ffdfeae7 Mon Sep 17 00:00:00 2001
From: Gary Linscott
Date: Wed, 22 Mar 2023 12:09:42 -0700
Subject: [PATCH 1/6] Add support to batch size for perplexity

---
 main.cpp | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/main.cpp b/main.cpp
index fbb43a8cca15b..da42645f37e40 100644
--- a/main.cpp
+++ b/main.cpp
@@ -90,17 +90,26 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
     int count = 0;
     double nll = 0.0;
     int seq_count = tokens.size() / params.n_ctx;
+    int n_vocab = llama_n_vocab(ctx);
 
-    fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
+    fprintf(stderr, "%s : calculating perplexity over %d chunks, batch_size=%d\n", __func__, seq_count, params.n_batch);
 
     for (int i = 0; i < seq_count; ++i) {
         int start = i * params.n_ctx;
         int end = start + params.n_ctx - 1;
-        std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
+
+        std::vector<float> logits;
+        int num_batches = (params.n_ctx + params.n_batch - 1) / params.n_batch;
         auto start_t = std::chrono::high_resolution_clock::now();
-        if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return;
+        for (int j = 0; j < num_batches; ++j) {
+            int batch_start = start + j * params.n_batch;
+            int batch_size = std::min(end - batch_start, params.n_batch);
+            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * params.n_batch, params.n_threads)) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                return;
+            }
+            auto batch_logits = llama_get_logits(ctx);
+            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
         }
         auto end_t = std::chrono::high_resolution_clock::now();
         if (i == 0) {
@@ -120,13 +129,11 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
         // last 256 tokens. Then, we split the input up into context window size chunks to
         // process the entire prompt.
 
-        auto logits = llama_get_logits(ctx);
         for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
             // Calculate probability of next token, given the previous ones.
-            int n_vocab = llama_n_vocab(ctx);
             std::vector<float> tok_logits(
-                logits + j * n_vocab,
-                logits + (j + 1) * n_vocab);
+                logits.begin() + j * n_vocab,
+                logits.begin() + (j + 1) * n_vocab);
             double prob = softmax(tok_logits)[tokens[start + j + 1]];
             nll += -std::log(prob);
             ++count;
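A note on the batching scheme introduced in PATCH 1/6: instead of evaluating a whole n_ctx window with a single llama_eval call, the window is split into ceil(n_ctx / n_batch) chunks. Each chunk is evaluated with n_past = j * n_batch, so the KV cache still gives the model the earlier chunks as context, and the per-chunk logits are appended to one flat vector so the scoring loop can index it exactly as before. The following standalone sketch only reproduces that bookkeeping and prints the values each iteration would hand to llama_eval; n_ctx and n_batch are illustrative values, not taken from the patch.

// Sketch of the chunking arithmetic used by the batched perplexity loop.
#include <algorithm>
#include <cstdio>

int main() {
    const int n_ctx   = 512;  // tokens per scored window (illustrative)
    const int n_batch = 200;  // tokens per llama_eval call (illustrative)
    const int start   = 0;    // offset of this window in the token stream
    const int end     = start + n_ctx - 1;

    const int num_batches = (n_ctx + n_batch - 1) / n_batch;  // ceiling division
    for (int j = 0; j < num_batches; ++j) {
        const int batch_start = start + j * n_batch;
        const int batch_size  = std::min(end - batch_start, n_batch);
        const int n_past      = j * n_batch;  // KV cache already holds the earlier chunks
        printf("eval %d: batch_start=%d batch_size=%d n_past=%d\n",
               j, batch_start, batch_size, n_past);
    }
    return 0;
}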
From 57dc4dc68a385712e254a65804b73cd6cd6e75d7 Mon Sep 17 00:00:00 2001
From: Gary Linscott
Date: Thu, 23 Mar 2023 18:44:48 -0700
Subject: [PATCH 2/6] Revert "Fix memory allocation issues and seg faults"

This reverts commit 4870e455b3653f7d7769fa5772b2c90ffad088df.
---
 llama.cpp | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index cf796cce396a5..d55219256932a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -102,9 +102,6 @@ struct llama_context {
     // decode output (2-dimensional array: [n_tokens][n_vocab])
     std::vector<float> logits;
     bool logits_all = false;
-
-    // work buffer for transformer evaluation
-    std::vector<uint8_t> buf_eval;
 };
 
 struct llama_context_params llama_context_default_params() {
@@ -630,19 +627,27 @@ static bool llama_eval_internal(
     const int n_rot = hparams.n_embd/hparams.n_head;
 
     auto & mem_per_token = lctx.mem_per_token;
-    auto & buf_eval = lctx.buf_eval;
 
-    if (mem_per_token*(n_past + N + 16) > buf_eval.size()) {
-        const size_t buf_size_new = 1.618*buf_eval.size();
+    // TODO: fix this hardcoded size
+    static size_t buf_size = 512u*1024*1024;
+    static void * buf = malloc(buf_size);
 
-        //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_eval.size(), buf_size_new);
+    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
+        const size_t buf_size_new = 1.3*(mem_per_token*N); // add 30% to account for ggml object overhead
+        //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
 
-        buf_eval.resize(buf_size_new);
+        // reallocate
+        buf_size = buf_size_new;
+        buf = realloc(buf, buf_size);
+        if (buf == nullptr) {
+            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
+            return false;
+        }
     }
 
     struct ggml_init_params params = {
-        /*.mem_size =*/ buf_eval.size(),
-        /*.mem_buffer =*/ buf_eval.data(),
+        /*.mem_size =*/ buf_size,
+        /*.mem_buffer =*/ buf,
     };
 
     struct ggml_context * ctx0 = ggml_init(params);
@@ -827,11 +832,10 @@ static bool llama_eval_internal(
         memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
     }
 
-    if (N == 1) {
-        mem_per_token = ggml_used_mem(ctx0)/(n_past + N);
+    if (mem_per_token == 0) {
+        mem_per_token = ggml_used_mem(ctx0)/N;
     }
-
-    //fprintf(stderr, "\nused_mem = %zu, %zu MB\n", ggml_used_mem(ctx0), ggml_used_mem(ctx0)/1024/1024);
+    //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0));
 
     ggml_free(ctx0);
 
@@ -1412,8 +1416,6 @@ struct llama_context * llama_init_from_file(
         return nullptr;
     }
 
-    ctx->buf_eval.resize(512u*1024u*1024u);
-
     return ctx;
 }
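A note on the memory strategy in PATCH 2/6: reverting commit 4870e455 drops the per-context buf_eval vector and returns llama_eval_internal to a function-local scratch buffer that starts at a hard-coded 512 MB and is realloc'd to 1.3 * (mem_per_token * N) whenever the first-eval estimate of memory per token predicts that a batch of N tokens will not fit (the extra 30% is headroom for ggml object overhead). The sketch below reproduces only that grow-on-demand policy with made-up sizes; it is an illustration of the pattern, not code from llama.cpp.

// Grow-on-demand scratch buffer: reallocate with ~30% headroom when the
// estimated requirement for the next batch exceeds the current size.
#include <cstdio>
#include <cstdlib>

static size_t buf_size = 512u * 1024 * 1024;  // initial scratch size
static void * buf      = malloc(buf_size);

// Returns false on allocation failure, mirroring the eval path above.
static bool ensure_scratch(size_t mem_per_token, int N) {
    if (mem_per_token > 0 && mem_per_token * N > buf_size) {
        buf_size = 1.3 * (mem_per_token * N);  // add ~30% for object overhead
        buf = realloc(buf, buf_size);
        if (buf == nullptr) {
            fprintf(stderr, "failed to allocate %zu bytes\n", buf_size);
            return false;
        }
    }
    return true;
}

int main() {
    const size_t mem_per_token = 1024 * 1024;  // 1 MiB per token, made-up demo value
    const int batch_sizes[] = {8, 256, 1024};
    for (int N : batch_sizes) {
        if (!ensure_scratch(mem_per_token, N)) return 1;
        printf("N=%4d -> scratch buffer is %zu MiB\n", N, buf_size / (1024 * 1024));
    }
    free(buf);
    return 0;
}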
From 7392ad629d60ee81a44dfdac6289a1b87d7f404f Mon Sep 17 00:00:00 2001
From: Gary Linscott
Date: Sat, 25 Mar 2023 13:30:40 -0700
Subject: [PATCH 3/6] update from merge

---
 examples/perplexity/perplexity.cpp | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index f617ba365dd05..91f0bf6b9b6b4 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -26,17 +26,26 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
     int count = 0;
     double nll = 0.0;
     int seq_count = tokens.size() / params.n_ctx;
+    int n_vocab = llama_n_vocab(ctx);
 
-    fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
+    fprintf(stderr, "%s : calculating perplexity over %d chunks, batch_size=%d\n", __func__, seq_count, params.n_batch);
 
     for (int i = 0; i < seq_count; ++i) {
         int start = i * params.n_ctx;
         int end = start + params.n_ctx - 1;
-        std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
+
+        std::vector<float> logits;
+        int num_batches = (params.n_ctx + params.n_batch - 1) / params.n_batch;
         auto start_t = std::chrono::high_resolution_clock::now();
-        if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return;
+        for (int j = 0; j < num_batches; ++j) {
+            int batch_start = start + j * params.n_batch;
+            int batch_size = std::min(end - batch_start, params.n_batch);
+            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * params.n_batch, params.n_threads)) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                return;
+            }
+            auto batch_logits = llama_get_logits(ctx);
+            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
         }
         auto end_t = std::chrono::high_resolution_clock::now();
         if (i == 0) {
@@ -56,13 +65,11 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
         // last 256 tokens. Then, we split the input up into context window size chunks to
         // process the entire prompt.
 
-        auto logits = llama_get_logits(ctx);
         for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
             // Calculate probability of next token, given the previous ones.
-            int n_vocab = llama_n_vocab(ctx);
             std::vector<float> tok_logits(
-                logits + j * n_vocab,
-                logits + (j + 1) * n_vocab);
+                logits.begin() + j * n_vocab,
+                logits.begin() + (j + 1) * n_vocab);
             double prob = softmax(tok_logits)[tokens[start + j + 1]];
             nll += -std::log(prob);
             ++count;
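A note on the metric itself, since PATCH 3/6 carries the same change into examples/perplexity: for each window, only the second half of the tokens is scored (so every scored token has at least n_ctx/2 tokens of context), the negative log-likelihood of each observed next token is accumulated, and the running perplexity exp(nll / count) is printed after every chunk; lower is better. A tiny self-contained sketch of that final reduction, with made-up probabilities standing in for softmax(logits)[next_token]:

// Running perplexity = exp(average negative log-likelihood).
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    // Made-up per-token probabilities of the observed next token.
    const std::vector<double> probs = {0.21, 0.05, 0.62, 0.33, 0.08};

    double nll   = 0.0;
    int    count = 0;
    for (double p : probs) {
        nll += -std::log(p);  // negative log-likelihood of this token
        ++count;
        printf("[%d] %.4f\n", count, std::exp(nll / count));  // running perplexity
    }
    return 0;
}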
From 43523220a4d0718adc406b3845d54b28d96bd120 Mon Sep 17 00:00:00 2001
From: Gary Linscott
Date: Sat, 25 Mar 2023 13:33:42 -0700
Subject: [PATCH 4/6] Remove perplexity from main

---
 examples/main/main.cpp | 80 ------------------------------------------
 1 file changed, 80 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index d4cd3fbad8aa6..7bb2b6bc466a8 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -74,86 +74,6 @@ void set_console_state(console_state new_st) {
     }
 }
 
-std::vector<float> softmax(const std::vector<float>& logits) {
-    std::vector<float> probs(logits.size());
-    float max_logit = logits[0];
-    for (float v : logits) max_logit = std::max(max_logit, v);
-    double sum_exp = 0.0;
-    for (size_t i = 0; i < logits.size(); i++) {
-        // Subtract the maximum logit value from the current logit value for numerical stability
-        float logit = logits[i] - max_logit;
-        double exp_logit = std::exp(logit);
-        sum_exp += exp_logit;
-        probs[i] = exp_logit;
-    }
-    for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
-    return probs;
-}
-
-void perplexity(llama_context * ctx, const gpt_params & params) {
-    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
-    // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
-    // Output: `perplexity: 13.5106 [114/114]`
-    auto tokens = ::llama_tokenize(ctx, params.prompt, true);
-
-    int count = 0;
-    double nll = 0.0;
-    int seq_count = tokens.size() / params.n_ctx;
-    int n_vocab = llama_n_vocab(ctx);
-
-    fprintf(stderr, "%s : calculating perplexity over %d chunks, batch_size=%d\n", __func__, seq_count, params.n_batch);
-
-    for (int i = 0; i < seq_count; ++i) {
-        int start = i * params.n_ctx;
-        int end = start + params.n_ctx - 1;
-
-        std::vector<float> logits;
-        int num_batches = (params.n_ctx + params.n_batch - 1) / params.n_batch;
-        auto start_t = std::chrono::high_resolution_clock::now();
-        for (int j = 0; j < num_batches; ++j) {
-            int batch_start = start + j * params.n_batch;
-            int batch_size = std::min(end - batch_start, params.n_batch);
-            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * params.n_batch, params.n_threads)) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
-                return;
-            }
-            auto batch_logits = llama_get_logits(ctx);
-            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
-        }
-        auto end_t = std::chrono::high_resolution_clock::now();
-        if (i == 0) {
-            double seconds = std::chrono::duration<double>(end_t - start_t).count();
-            printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
-        }
-        // We get the logits for all the tokens in the context window (params.n_ctx)
-        // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
-        // calculate the perplexity over the last half the window (so the model always has
-        // some context to predict the token).
-        //
-        // We rely on the fact that attention in the forward pass only looks at previous
-        // tokens here, so the logits returned for each token are an accurate representation
-        // of what the model would have predicted at that point.
-        //
-        // Example, we have a context window of 512, we will compute perplexity for each of the
-        // last 256 tokens. Then, we split the input up into context window size chunks to
-        // process the entire prompt.
-
-        for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
-            // Calculate probability of next token, given the previous ones.
-            std::vector<float> tok_logits(
-                logits.begin() + j * n_vocab,
-                logits.begin() + (j + 1) * n_vocab);
-            double prob = softmax(tok_logits)[tokens[start + j + 1]];
-            nll += -std::log(prob);
-            ++count;
-        }
-        // perplexity is e^(average negative log-likelihood)
-        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
-        fflush(stdout);
-    }
-    printf("\n");
-}
-
 static bool is_interacting = false;
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
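A note on the softmax helper deleted from main.cpp in PATCH 4/6 (the perplexity code now lives in examples/perplexity/perplexity.cpp): subtracting max_logit before exponentiating is what keeps the computation finite for large logit magnitudes, and the shift cancels out in the normalization, so the probabilities are unchanged. A minimal demonstration with arbitrary values:

// Naive softmax overflows for large logits; shifting by the maximum does not.
#include <cmath>
#include <cstdio>

int main() {
    const float logits[]  = {1000.0f, 998.0f, 995.0f};  // arbitrary large logits
    const float max_logit = 1000.0f;

    // Naive: exp(1000) overflows to inf, so p[0] = inf/inf = nan.
    double naive_sum = 0.0;
    for (float l : logits) naive_sum += std::exp(l);
    printf("naive  p[0] = %f\n", std::exp(logits[0]) / naive_sum);

    // Stable: exp(logit - max) stays in (0, 1]; the result is the true softmax.
    double stable_sum = 0.0;
    for (float l : logits) stable_sum += std::exp(l - max_logit);
    printf("stable p[0] = %f\n", std::exp(logits[0] - max_logit) / stable_sum);  // ~0.876
    return 0;
}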
From 864dcb26fb2f74453f223a9cc100768530e7a391 Mon Sep 17 00:00:00 2001
From: Gary Linscott
Date: Sun, 2 Apr 2023 20:16:15 -0700
Subject: [PATCH 5/6] updates

---
 examples/perplexity/perplexity.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index d8195a1f84400..eb456fcfa67a7 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -34,7 +34,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
 
     for (int i = 0; i < seq_count; ++i) {
         int start = i * params.n_ctx;
-        int end = start + params.n_ctx - 1;
+        int end = start + params.n_ctx;
 
         std::vector<float> logits;
         int num_batches = (params.n_ctx + params.n_batch - 1) / params.n_batch;
@@ -66,8 +66,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
         // Example, we have a context window of 512, we will compute perplexity for each of the
         // last 256 tokens. Then, we split the input up into context window size chunks to
         // process the entire prompt.
-
-        for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
+        for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
             // Calculate probability of next token, given the previous ones.
             std::vector<float> tok_logits(
                 logits.begin() + j * n_vocab,

From 23fd782d35c3b7f5a3069785b7a2c89031b9d5f3 Mon Sep 17 00:00:00 2001
From: Gary Linscott
Date: Thu, 13 Apr 2023 08:20:54 -0700
Subject: [PATCH 6/6] Update batch size for efficiency

---
 examples/perplexity/perplexity.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index c36c597c96fd4..38e3643b1ca5e 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -86,11 +86,13 @@ int main(int argc, char ** argv) {
     gpt_params params;
     params.model = "models/llama-7B/ggml-model.bin";
+    params.n_batch = 512;
 
     if (gpt_params_parse(argc, argv, params) == false) {
         return 1;
     }
 
     params.perplexity = true;
+    params.n_batch = std::min(params.n_batch, params.n_ctx);
 
     if (params.n_ctx > 2048) {
         fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"