@@ -391,7 +391,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         hs_data[i].context = prompt_lines[idx*6];
         hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
         for (size_t j=0; j < 4; j++) {
-            hs_data[i].ending[j] = " " + prompt_lines[idx*6+2+j];
+            hs_data[i].ending[j] = prompt_lines[idx*6+2+j];
         }

         // Delete the selected random example from the prompt
@@ -415,7 +415,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {

         // Do the 1st ending
         // In this case we include the context when evaluating
-        auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
+        auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + " " + hs_data[task_idx].ending[0], add_bos);
         auto query_size = query_embd.size();
         //printf("First query: %d\n",(int)query_size);

@@ -462,11 +462,11 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {

             // Tokenize the query
-            // SPM tokenizer: Do not tokenize the starting space in the ending since it is always added by the tokenizer
+            // SPM tokenizer: Do not prepend a space since the tokenizer always do that
             if (is_spm) {
-                query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx].substr(1, hs_data[task_idx].ending[ending_idx].size()-1), false);
-            } else {
                 query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
+            } else {
+                query_embd = ::llama_tokenize(ctx, " " + hs_data[task_idx].ending[ending_idx], false);
             }

             query_size = query_embd.size();