Skip to content

Commit 0e74a72

Browse files
committed
Added whitespace escaping and unescaping
Now we see some resemblance to the Meta-Tokenizer, I think. The only remaining problem is how to integrate this into the `llama.cpp` kernel.
1 parent 94a0ee1 commit 0e74a72

File tree

4 files changed

+94
-35
lines changed

4 files changed

+94
-35
lines changed

convert.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -233,12 +233,7 @@ def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
233233
for i in range(tokenizer.vocab_size()):
234234
# TODO: How do we want to support is_unknown, is_control, is_byte and is_unused?
235235
piece = tokenizer.id_to_piece(i)
236-
text: bytes
237-
if tokenizer.is_unknown(i) or tokenizer.is_control(i) or tokenizer.is_byte(i):
238-
text: bytes = piece.encode("utf-8")
239-
else:
240-
text = piece.replace("\u2581", " ").encode("utf-8")
241-
236+
text: bytes = piece.encode("utf-8")
242237
score: float = tokenizer.get_score(i)
243238
yield text, score
244239

llama.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1832,13 +1832,13 @@ struct llama_tokenizer {
18321832
llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}
18331833

18341834
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
1835-
// split string into utf8 chars
1835+
// split string into utf8 chars / token?
18361836
int index = 0;
18371837
size_t offs = 0;
18381838
while (offs < text.size()) {
18391839
llama_sp_symbol sym;
1840-
// size_t len = utf8_len(text[offs]);
1841-
size_t len = llama_trie_find(vocab_.trie, text, offs);
1840+
size_t len = utf8_len(text[offs]);
1841+
// size_t len = llama_trie_find(vocab_.trie, text, offs);
18421842
if (len == 0) {
18431843
len = utf8_len(text[offs]);
18441844
}

tests/test-tokenizer-0.cpp

Lines changed: 46 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,26 +5,59 @@
55
#include <map>
66
#include <vector>
77

8-
std::string detokenize(llama_context * ctx, const llama_token * tokens, int count) {
8+
static std::string escape_whitespace(const std::string& text) {
99
std::string result;
10-
for (int i = 0; i < count; ++i) {
11-
result += llama_token_to_str(ctx, tokens[i]);
12-
if (i < count - 1) {
13-
result += "_";
10+
bool escaping = false;
11+
result += char(0xe2);
12+
result += char(0x96);
13+
result += char(0x81);
14+
for (size_t offs = 0; offs < text.length(); ++offs) {
15+
if (text[offs] == ' ' || text[offs] == '\t' || text[offs] == '\n') {
16+
if (!escaping) {
17+
result += char(0xe2);
18+
result += char(0x96);
19+
result += char(0x81);
20+
escaping = true;
21+
}
22+
}
23+
else {
24+
escaping = false;
25+
result += text[offs];
1426
}
1527
}
1628
return result;
1729
}
1830

31+
static std::string unescape_whitespace(llama_context* ctx, llama_token token) {
32+
const char* word = llama_token_to_str(ctx, token);
33+
if (strlen(word) >= 3 &&
34+
word[0] == char(0xe2) &&
35+
word[1] == char(0x96) &&
36+
word[2] == char(0x81)) {
37+
return std::string(" ") + (word + 3);
38+
}
39+
return word;
40+
}
41+
42+
static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
43+
std::string result;
44+
for (int i = 0; i < count; ++i) {
45+
result += unescape_whitespace(ctx, tokens[i]);
46+
}
47+
return result;
48+
}
49+
1950
static const std::map<std::string, std::vector<llama_token>> & k_tests()
2051
{
2152
static std::map<std::string, std::vector<llama_token>> _k_tests = {
22-
{ "Hello World", { 1, 10994, 2787, }, },
23-
{ " Hello World", { 1, 15043, 2787, }, },
24-
{ " Hello World!", { 1, 15043, 2787, 29991, }, },
25-
{ " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
26-
{ "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
27-
{ "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, },
53+
{ "Hello world", { 1, 15043, 3186, }, },
54+
{ " Hello world", { 1, 29871, 15043, 3186, }, },
55+
{ "Hello World", { 1, 15043, 2787, }, },
56+
{ " Hello World", { 1, 29871, 15043, 2787, }, },
57+
{" Hello World!", { 1, 29871, 15043, 2787, 29991, }, },
58+
{" this is 🦙.cpp", { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
59+
{"w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
60+
{"нещо на Български", { 1, 1538, 4851, 665, 1386, 29713, 1305, }, },
2861
};
2962
return _k_tests;
3063
};
@@ -77,9 +110,9 @@ int main(int argc, char **argv) {
77110

78111
for (const auto & test_kv : k_tests()) {
79112
std::vector<llama_token> res(test_kv.first.size());
80-
const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
113+
const int n = llama_tokenize(ctx, escape_whitespace(test_kv.first.c_str()).c_str(), res.data(), int(res.size()), true);
81114
fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
82-
__func__, test_kv.first.c_str(), detokenize(ctx, res.data(), n).c_str());
115+
__func__, test_kv.first.c_str(), unescape_whitespace(ctx, res.data(), n).c_str());
83116
res.resize(n);
84117

85118
bool correct = res.size() == test_kv.second.size();

tests/test-tokenizer-1.cpp

Lines changed: 44 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,48 @@
88
#include <map>
99
#include <vector>
1010

11-
std::string detokenize(llama_context * ctx, const llama_token * tokens, int count) {
11+
static std::string escape_whitespace(const std::string& text) {
1212
std::string result;
13-
for (int i = 0; i < count; ++i) {
14-
result += llama_token_to_str(ctx, tokens[i]);
15-
if (i < count - 1) {
16-
result += "_";
13+
bool escaping = false;
14+
result += char(0xe2);
15+
result += char(0x96);
16+
result += char(0x81);
17+
for (size_t offs = 0; offs < text.length(); ++offs) {
18+
if (text[offs] == ' ' || text[offs] == '\t' || text[offs] == '\n') {
19+
if (!escaping) {
20+
result += char(0xe2);
21+
result += char(0x96);
22+
result += char(0x81);
23+
escaping = true;
24+
}
25+
}
26+
else {
27+
escaping = false;
28+
result += text[offs];
1729
}
1830
}
1931
return result;
2032
}
2133

34+
static std::string unescape_whitespace(llama_context* ctx, llama_token token) {
35+
const char* word = llama_token_to_str(ctx, token);
36+
if (strlen(word) >= 3 &&
37+
word[0] == char(0xe2) &&
38+
word[1] == char(0x96) &&
39+
word[2] == char(0x81)) {
40+
return std::string(" ") + (word + 3);
41+
}
42+
return word;
43+
}
44+
45+
static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
46+
std::string result;
47+
for (int i = 0; i < count; ++i) {
48+
result += unescape_whitespace(ctx, tokens[i]);
49+
}
50+
return result;
51+
}
52+
2253
int main(int argc, char **argv) {
2354
if (argc < 2) {
2455
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
@@ -66,22 +97,22 @@ int main(int argc, char **argv) {
6697
}
6798

6899
for (int i = 0; i < n_vocab; ++i) {
69-
const char * forward = llama_token_to_str(ctx, i);
70-
std::vector<llama_token> tokens(strlen(forward));
71-
auto n = llama_tokenize(ctx, forward, tokens.data(), strlen(forward), false);
100+
std::string forward = llama_token_to_str(ctx, i);
101+
std::vector<llama_token> tokens(forward.length());
102+
int n = llama_tokenize(ctx, forward.c_str(), tokens.data(), forward.length(), false);
72103
if (n == 1) {
73104
if (i != tokens[0]) {
74-
const char* backward = llama_token_to_str(ctx, tokens[0]);
105+
std::string backward = unescape_whitespace(ctx, tokens[0]);
75106
fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns token %d %s\n",
76-
__func__, i, forward, tokens[0], backward);
107+
__func__, i, unescape_whitespace(ctx, i).c_str(), tokens[0], backward.c_str());
77108
}
78109
} else {
79110
if (i <= 258) {
80111
fprintf(stderr, "%s : info: token %d is string %s and tokenize() returns tokens %s\n",
81-
__func__, i, forward, detokenize(ctx, tokens.data(), n).c_str());
112+
__func__, i, unescape_whitespace(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
82113
} else {
83114
fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns tokens %s\n",
84-
__func__, i, forward, detokenize(ctx, tokens.data(), n).c_str());
115+
__func__, i, unescape_whitespace(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
85116
}
86117
}
87118
}
@@ -91,7 +122,7 @@ int main(int argc, char **argv) {
91122
std::wstring wstr(1, ch);
92123
std::string str = converter.to_bytes(wstr);
93124
std::vector<llama_token> tokens(strlen(str.c_str()));
94-
auto n = llama_tokenize(ctx, str.c_str(), tokens.data(), str.length(), false);
125+
auto n = llama_tokenize(ctx, escape_whitespace(str).c_str(), tokens.data(), str.length(), false);
95126
if (n == 1) {
96127
fprintf(stderr, "%s : info: %s tokenized to %d \n",
97128
__func__, str.c_str(), tokens[0]);

0 commit comments

Comments
 (0)