llama : fix typo in <|im_end|> token text (#6745)

ggerganov · ggerganov · commit 8960fe86ae07 · 2024-04-22T15:41:11.000+03:00
diff --git a/llama.cpp b/llama.cpp
@@ -4340,7 +4340,7 @@ static void llm_load_vocab(
             }
         }
 
-        // find EOT token: "<|eot_id|>", "<|im_emd|>", "<end_of_turn>", etc.
+        // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
         //
         // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
         //       for now, we apply this workaround to find the EOT token based on its text
@@ -4351,7 +4351,7 @@ static void llm_load_vocab(
                         //       need to fix convert script
                         //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
                         (t.first == "<|eot_id|>" ||
-                         t.first == "<|im_emd|>" ||
+                         t.first == "<|im_end|>" ||
                          t.first == "<end_of_turn>"
                         )
                    ) {

Original file line number	Diff line number	Diff line change
`@@ -4340,7 +4340,7 @@ static void llm_load_vocab(`
`4340`	`4340`	`}`
`4341`	`4341`	`}`
`4342`	`4342`
`4343`		`- // find EOT token: "<\|eot_id\|>", "<\|im_emd\|>", "<end_of_turn>", etc.`
	`4343`	`+ // find EOT token: "<\|eot_id\|>", "<\|im_end\|>", "<end_of_turn>", etc.`
`4344`	`4344`	`//`
`4345`	`4345`	`// TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID`
`4346`	`4346`	`// for now, we apply this workaround to find the EOT token based on its text`
`@@ -4351,7 +4351,7 @@ static void llm_load_vocab(`
`4351`	`4351`	`// need to fix convert script`
`4352`	`4352`	`//vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&`
`4353`	`4353`	`(t.first == "<\|eot_id\|>" \|\|`
`4354`		`- t.first == "<\|im_emd\|>" \|\|`
	`4354`	`+ t.first == "<\|im_end\|>" \|\|`
`4355`	`4355`	`t.first == "<end_of_turn>"`
`4356`	`4356`	`)`
`4357`	`4357`	`) {`