Commit 3c31292

try fix memory for perplexity
1 parent 424281a commit 3c31292

2 files changed, +6 -4 lines changed

llama.cpp

Lines changed: 1 addition & 1 deletion
@@ -632,7 +632,7 @@ static bool llama_eval_internal(
     auto & mem_at_token1 = lctx.mem_at_token1;

     // TODO: fix this hardcoded size
-    static size_t buf_size = size_t(n_ctx)*1024*1024;
+    static size_t buf_size = size_t(n_ctx)*size_t(N)*128*1024;
     static void * buf = malloc(buf_size);

     const size_t C0 = mem_at_token0; // ~base
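The change scales the scratch buffer with the batch size N as well as n_ctx, instead of a flat n_ctx megabytes; the TODO notes it is still a heuristic. The surrounding names (mem_at_token0, mem_at_token1, and C0 as "~base"), together with the "2 evals ... base and growth" comment in main.cpp below, suggest a linear memory model. A minimal sketch of that idea, assuming the two calibration evals record used memory after processing n_tokens and then 2*n_tokens tokens; estimate_buf_size and its parameters are hypothetical names for illustration, not the repo's API:

#include <cstddef>

// Hypothetical helper: extrapolate buffer size from two measurements taken
// after evals of n_tokens and 2*n_tokens tokens (base + per-token growth).
static size_t estimate_buf_size(size_t mem_at_token0, size_t mem_at_token1,
                                size_t n_tokens, size_t n_total) {
    const size_t C0     = mem_at_token0;                              // ~base, per the diff comment
    const size_t growth = (mem_at_token1 - mem_at_token0) / n_tokens; // bytes per extra token
    return C0 + growth * n_total;                                     // linear extrapolation
}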

main.cpp

Lines changed: 5 additions & 3 deletions
@@ -220,12 +220,14 @@ int main(int argc, char ** argv) {
     // TODO: better way to do that
     // TODO(Green-Sky): move to internal and detect first time usage
     {
+        // perplexity uses context size as batch size (?)
+        const auto tmp_batch_size = params.perplexity ? params.n_ctx : params.n_batch;
         // we make 2 evals, of batchsize to take 2 measurements, to determine base and growth
-        std::vector<llama_token> tmp(params.n_batch*2, 2);
+        std::vector<llama_token> tmp(tmp_batch_size*2, 2);
         tmp[0] = llama_token_bos();

-        llama_eval(ctx, tmp.data(), params.n_batch, 0, params.n_threads);
-        llama_eval(ctx, tmp.data()+params.n_batch, params.n_batch, params.n_batch, params.n_threads);
+        llama_eval(ctx, tmp.data(), tmp_batch_size, 0, params.n_threads);
+        llama_eval(ctx, tmp.data()+tmp_batch_size, tmp_batch_size, tmp_batch_size, params.n_threads);
     }

     if (params.perplexity) {
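The calibration now matches the largest batch the run will actually submit: in perplexity mode the prompt is scored in n_ctx-token chunks, so llama_eval sees batches of n_ctx rather than n_batch. A rough illustration of why the ternary picks params.n_ctx, assuming chunked evaluation with a fresh window per chunk; 'tokens' stands for the tokenized evaluation text, and the loop is illustrative, not the repo's exact perplexity code:

// Illustrative only: score the text in chunks of n_ctx tokens, so each
// llama_eval call submits a batch of size n_ctx.
for (size_t start = 0; start + params.n_ctx <= tokens.size(); start += params.n_ctx) {
    // n_past = 0: every chunk starts a fresh context window
    llama_eval(ctx, tokens.data() + start, params.n_ctx, 0, params.n_threads);
    // ...accumulate log-probabilities from the returned logits here...
}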
