llama : fix tokenizer to use llama_char_to_byte

ggerganov · ggerganov · commit 7b6ae8904178 · 2023-08-17T12:27:26.000+03:00
diff --git a/llama.cpp b/llama.cpp
@@ -2303,6 +2303,18 @@ static uint8_t llama_byte_to_char(const llama_vocab & vocab, uint8_t byte) {
     return false;
 }
 
+static uint8_t llama_char_to_byte(const llama_vocab & vocab, uint8_t ch) {
+    if (llama_vocab_type(vocab) == "spm") {
+        return ch + 3;
+    }
+
+    if (llama_vocab_type(vocab) == "bpe") {
+        return ch - 32;
+    }
+
+    return false;
+}
+
 static std::string llama_escape_whitespace(const std::string& text) {
     std::string result;
     bool escaping = false;
@@ -2439,7 +2451,7 @@ struct llama_tokenizer {
         if (p == rev_merge.end()) {
             // output any symbols that did not form tokens as bytes.
             for (int j = 0; j < (int)symbol.n; ++j) {
-                llama_vocab::id token_id = llama_byte_to_char(vocab_, symbol.text[j]);
+                llama_vocab::id token_id = llama_char_to_byte(vocab_, symbol.text[j]);
                 output.push_back(token_id);
             }
             return;
@@ -4871,8 +4883,8 @@ int llama_token_to_str_with_model(const struct llama_model * model, llama_token
     return 0;
 }
 
-int llama_token_to_str(const struct llama_context * ctx, llama_token token, char * str, int length) {
-    return llama_token_to_str_with_model(&ctx->model, token, str, length);
+int llama_token_to_str(const struct llama_context * ctx, llama_token token, char * buf, int length) {
+    return llama_token_to_str_with_model(&ctx->model, token, buf, length);
 }
 
 std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
@@ -4889,13 +4901,13 @@ std::string llama_token_to_str(const struct llama_context * ctx, llama_token tok
     return std::string(result.data(), result.size());
 }
 
-int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token, char * str, int length) {
+int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token, char * buf, int length) {
     if (0 <= token && token < llama_n_vocab_from_model(&ctx->model)) {
         std::string result = ctx->model.vocab.id_to_token[token].tok;
         if (length < (int) result.length()) {
             return -result.length();
         }
-        memcpy(str, result.c_str(), result.length());
+        memcpy(buf, result.c_str(), result.length());
         return result.length();
     }
     return 0;
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
@@ -89,6 +89,8 @@ int main(int argc, char **argv) {
         return 2;
     }
 
+    bool success = true;
+
     for (const auto & test_kv : k_tests()) {
         std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true);
         fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
@@ -103,7 +105,8 @@ int main(int argc, char **argv) {
         }
 
         if (!correct) {
-            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
+            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
+            fprintf(stderr, "%s : detokenized to: '%s'\n", __func__, unescape_whitespace(ctx, test_kv.second).c_str());
             fprintf(stderr, "%s : expected tokens: ", __func__);
             for (const auto & t : test_kv.second) {
                 fprintf(stderr, "%6d, ", t);
@@ -115,9 +118,7 @@ int main(int argc, char **argv) {
             }
             fprintf(stderr, "\n");
 
-            llama_free_model(model);
-            llama_free(ctx);
-            return 3;
+            success = false;
         }
     }
 
@@ -126,5 +127,5 @@ int main(int argc, char **argv) {
 
     llama_backend_free();
 
-    return 0;
+    return success ? 0 : 3;
 }

Original file line number	Diff line number	Diff line change
`@@ -89,6 +89,8 @@ int main(int argc, char **argv) {`
`89`	`89`	`return 2;`
`90`	`90`	`}`
`91`	`91`
	`92`	`+ bool success = true;`
	`93`	`+`
`92`	`94`	`for (const auto & test_kv : k_tests()) {`
`93`	`95`	`std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true);`
`94`	`96`	`fprintf(stderr, "%s : '%s' tokenized to '%s'\n",`
`@@ -103,7 +105,8 @@ int main(int argc, char **argv) {`
`103`	`105`	`}`
`104`	`106`
`105`	`107`	`if (!correct) {`
`106`		`- fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());`
	`108`	`+ fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());`
	`109`	`+ fprintf(stderr, "%s : detokenized to: '%s'\n", __func__, unescape_whitespace(ctx, test_kv.second).c_str());`
`107`	`110`	`fprintf(stderr, "%s : expected tokens: ", __func__);`
`108`	`111`	`for (const auto & t : test_kv.second) {`
`109`	`112`	`fprintf(stderr, "%6d, ", t);`
`@@ -115,9 +118,7 @@ int main(int argc, char **argv) {`
`115`	`118`	`}`
`116`	`119`	`fprintf(stderr, "\n");`
`117`	`120`
`118`		`- llama_free_model(model);`
`119`		`- llama_free(ctx);`
`120`		`- return 3;`
	`121`	`+ success = false;`
`121`	`122`	`}`
`122`	`123`	`}`
`123`	`124`
`@@ -126,5 +127,5 @@ int main(int argc, char **argv) {`
`126`	`127`
`127`	`128`	`llama_backend_free();`
`128`	`129`
`129`		`- return 0;`
	`130`	`+ return success ? 0 : 3;`
`130`	`131`	`}`