
Commit ac793a2: Fix for ggml-org#2023
Parent: 0db14fe

4 files changed: 110 additions, 14 deletions


convert.py (4 additions, 13 deletions)
@@ -231,19 +231,10 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) ->
     def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.sentencepiece_tokenizer
         for i in range(tokenizer.vocab_size()):
-            text: bytes
-            if tokenizer.is_unknown(i):
-                text = " \u2047 ".encode("utf-8")
-            elif tokenizer.is_control(i):
-                text = b""
-            elif tokenizer.is_byte(i):
-                piece = tokenizer.id_to_piece(i)
-                if len(piece) != 6:
-                    raise Exception(f"Invalid token: {piece}")
-                byte_value = int(piece[3:-1], 16)
-                text = struct.pack("B", byte_value)
-            else:
-                text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            # TODO: How do we want to support is_unknown, is_control, is_byte and is_unused(i)?
+            piece = tokenizer.id_to_piece(i)
+            text: bytes = piece.encode("utf-8")
+
             score: float = tokenizer.get_score(i)
             yield text, score
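
Note on what the removed branch did: byte tokens in a SentencePiece vocab are pieces of the exact form <0xNN>, and the old code unpacked them into single raw bytes. A minimal C++ sketch of that decoding (the helper name decode_byte_piece is hypothetical, not part of the commit):

#include <cstdint>
#include <stdexcept>
#include <string>

// Decode a SentencePiece byte piece such as "<0x0A>" into the raw byte it
// represents. Mirrors the removed Python branch: the piece must be exactly
// 6 characters, and piece[3:-1] (offsets 3..4 here) holds two hex digits.
static uint8_t decode_byte_piece(const std::string & piece) {
    if (piece.size() != 6) {
        throw std::runtime_error("invalid byte piece: " + piece);
    }
    return static_cast<uint8_t>(std::stoi(piece.substr(3, 2), nullptr, 16));
}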

llama.cpp (2 additions, 1 deletion)
@@ -1805,7 +1805,8 @@ struct llama_tokenizer {
         size_t offs = 0;
         while (offs < text.size()) {
             llama_sp_symbol sym;
-            size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
+            assert(utf8_len(text[offs]) <= text.size() - offs);
+            size_t char_len = utf8_len(text[offs]);
             sym.text = text.c_str() + offs;
             sym.n = char_len;
             offs += char_len;
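
This change trusts utf8_len() outright instead of clamping it with std::min, and asserts the invariant that a multi-byte sequence never runs past the end of the buffer. The definition of utf8_len() is not part of this hunk; a typical lead-byte lookup, consistent with how the assert uses it, looks like this (a sketch, not necessarily llama.cpp's exact code):

#include <cstddef>
#include <cstdint>

// Length of a UTF-8 sequence from its first byte: the top four bits of the
// lead byte determine the byte count (1 for ASCII and for stray continuation
// bytes, so malformed input still advances; 2, 3, or 4 for multi-byte leads).
static size_t utf8_len(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}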

tests/CMakeLists.txt (1 addition, 0 deletions)
@@ -11,5 +11,6 @@ llama_add_test(test-quantize-fns.cpp)
 llama_add_test(test-quantize-perf.cpp)
 llama_add_test(test-sampling.cpp)
 llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
+llama_add_test(test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
 llama_add_test(test-grad0.c) # SLOW
 # llama_add_test(test-opt.c) # SLOW

tests/test-tokenizer-1.cpp (103 additions, 0 deletions)
@@ -0,0 +1,103 @@
+#include "llama.h"
+
+#include <cassert>
+#include <cstdio>
+#include <string>
+#include <codecvt>
+#include <map>
+#include <vector>
+
+std::string detokenize(llama_context * ctx, llama_token * tokens, int count) {
+    std::string result;
+    for (int i = 0; i < count; ++i) {
+        result += llama_token_to_str(ctx, tokens[i]);
+        if (i < count - 1) {
+            result += "_";
+        }
+    }
+    return result;
+}
+
+int main(int argc, char **argv) {
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+        return 1;
+    }
+
+    const std::string fname = argv[1];
+
+    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
+
+    llama_model * model;
+    llama_context * ctx;
+
+    llama_backend_init(false);
+
+    // load the vocab
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.vocab_only = true;
+
+        model = llama_load_model_from_file(fname.c_str(), lparams);
+
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            return 1;
+        }
+
+        ctx = llama_new_context_with_model(model, lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_free_model(model);
+            return 1;
+        }
+    }
+
+    const int n_vocab = llama_n_vocab(ctx);
+
+    if (n_vocab != 32000) {
+        fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
+        llama_free_model(model);
+        llama_free(ctx);
+        return 2;
+    }
+
+    for (int i = 0; i < n_vocab; ++i) {
+        const char * forward = llama_token_to_str(ctx, i);
+        llama_token tokens[strlen(forward)];
+        auto n = llama_tokenize(ctx, forward, tokens, strlen(forward), false);
+        if (n == 1) {
+            if (i != tokens[0]) {
+                const char* backward = llama_token_to_str(ctx, tokens[0]);
+                fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns token %d %s\n", __func__, i, forward, tokens[0], backward);
+            }
+        } else {
+            if (i <= 258) {
+                fprintf(stderr, "%s : info: token %d is string %s and tokenize() returns tokens %s\n", __func__, i, forward, detokenize(ctx, tokens, n).c_str());
+            } else {
+                fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns tokens %s\n", __func__, i, forward, detokenize(ctx, tokens, n).c_str());
+            }
+        }
+    }
+
+    std::wstring string_to_convert;
+    std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter;
+    for (wchar_t ch = 0x0000; ch < 0xffff; ++ch) {
+        std::wstring wstr(1, ch);
+        std::string str = converter.to_bytes(wstr);
+        llama_token tokens[strlen(str.c_str())];
+        auto n = llama_tokenize(ctx, str.c_str(), tokens, str.length(), false);
+        if (n == 1) {
+            fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
+        }
+    }
+
+    llama_free_model(model);
+    llama_free(ctx);
+
+    llama_backend_free();
+
+    return 0;
+}
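
One portability note on the new test: llama_token tokens[strlen(forward)] is a variable-length array, a GCC/Clang extension rather than standard C++ (MSVC rejects it). A portable sketch of the same idea using std::vector, assuming the llama_tokenize signature used in the test; the negative-return convention for an undersized buffer is an assumption, labeled as such in the comment:

#include <cstring>
#include <vector>

#include "llama.h"

// Portable replacement for the test's VLA: size a std::vector from the input
// length, which bounds the token count when no BOS token is added.
static std::vector<llama_token> tokenize_portable(llama_context * ctx, const char * text) {
    std::vector<llama_token> tokens(strlen(text));
    const int n = llama_tokenize(ctx, text, tokens.data(), (int) tokens.size(), false);
    // Assumption: a negative return signals an undersized buffer; treat as empty.
    tokens.resize(n < 0 ? 0 : n);
    return tokens;
}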
