Fix for ggml-org#2310

goerch · goerch · commit e6b1a5003e94 · 2023-07-23T18:17:32.000+02:00
Waiting for the fallout ...
diff --git a/examples/common.cpp b/examples/common.cpp
@@ -564,7 +564,7 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
 // TODO: not great allocating this every time
 std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
     // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int) add_bos);
+    std::vector<llama_token> res(text.size() + (int) add_bos + 1);
     const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
     assert(n >= 0);
     res.resize(n);
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
         }
         fprintf(stderr, "\n");
     }
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
@@ -196,10 +196,6 @@ int main(int argc, char ** argv) {
 
     // tokenize the prompt
     std::vector<llama_token> embd_inp;
-
-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
         embd_inp = ::llama_tokenize(ctx, params.prompt, true);
     } else {
@@ -283,22 +279,22 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
         }
 
         if (ctx_guidance) {
             fprintf(stderr, "\n");
             fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
             fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
             for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
+                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
             }
         }
 
         if (params.n_keep > 0) {
         fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
+                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
             }
             fprintf(stderr, "'\n");
         }
@@ -636,7 +632,7 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo) {
             for (auto id : embd) {
-                printf("%s", llama_token_to_str(ctx, id));
+                printf("%s", llama_token_to_str(ctx, id).c_str());
             }
             fflush(stdout);
         }
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
@@ -91,7 +91,7 @@ int main(int argc, char ** argv) {
         auto next_token_str = llama_token_to_str(ctx, next_token);
         last_n_tokens_data.push_back(next_token);
 
-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
         if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             llama_free(ctx);
@@ -151,7 +151,7 @@ int main(int argc, char ** argv) {
         auto next_token_str = llama_token_to_str(ctx2, next_token);
         last_n_tokens_data.push_back(next_token);
 
-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
         if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             llama_free(ctx2);
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
@@ -102,7 +102,7 @@ int main(int argc, char ** argv)
 
     for( auto id : tokens_list )
     {
-        printf( "%s" , llama_token_to_str( ctx , id ) );
+        printf( "%s" , llama_token_to_str( ctx , id ).c_str() );
     }
 
     fflush(stdout);
@@ -162,7 +162,7 @@ int main(int argc, char ** argv)
         }
 
         // Print the new token :
-        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
+        printf( "%s" , llama_token_to_str( ctx , new_token_id ).c_str() );
         fflush( stdout );
 
         // Push this new token for next evaluation :
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1959,7 +1959,7 @@ void print_matrix(struct ggml_tensor * probs) {
 
 
 void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_to_str(ctx, token));
+    printf("%s", llama_token_to_str(ctx, token).c_str()); // llama_token_to_str returns std::string after this change, so pass its C string to %s
 }
 
 void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@@ -2198,17 +2198,17 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
         const char * in  = buf.data();
         const char * end = buf.data() + buf.size();
         for (int i = 0; i < (int) out.size(); ++i) {
-            const char * s = llama_token_to_str(lctx, out[i]);
-            int len = strlen(s);
+            std::string s = llama_token_to_str(lctx, out[i]);
+            int len = s.length();
             if (in >= end) {
                 printf("%s: unexpected end of original text.\n", __func__);
                 break;
             }
-            const bool matches = (strncmp(in, s, len) == 0);
+            const bool matches = (strncmp(in, s.c_str(), len) == 0);
             if (matches) {
                 in += len;
             } else {
-                printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s);
+                printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s.c_str());
             }
         }
     }
diff --git a/llama.cpp b/llama.cpp
@@ -242,13 +242,6 @@ struct llama_kv_cache {
     }
 };
 
-struct llama_trie {
-    std::unordered_map<std::string, llama_trie> map;
-};
-
-void llama_trie_insert(struct llama_trie& trie, const std::string& text, size_t offs);
-size_t llama_trie_find(const struct llama_trie& trie, const std::string& text, size_t offs);
-
 struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
@@ -260,7 +253,6 @@ struct llama_vocab {
 
     std::unordered_map<token, id> token_to_id;
     std::vector<token_score> id_to_token;
-    struct llama_trie trie;
 };
 
 struct llama_model {
@@ -524,13 +516,12 @@ struct llama_file_loader {
             float score = 0.0f;
             file.read_raw(&score, sizeof(score));
 
+            assert(vocab.token_to_id.find(word) == vocab.token_to_id.end());
             vocab.token_to_id[word] = i;
 
             auto & tok_score = vocab.id_to_token[i];
             tok_score.tok = word;
             tok_score.score = score;
-
-            llama_trie_insert(vocab.trie, word, 0);
         }
     }
     void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
@@ -1804,26 +1795,37 @@ struct llama_sp_bigram {
     size_t size;
 };
 
-void llama_trie_insert(struct llama_trie& trie, const std::string& text, size_t offs) {
-    if (offs < text.size()) {
-        size_t char_len = utf8_len(text[offs]);
-        std::string key = text.substr(offs, char_len);
-        if (trie.map.find(key) == trie.map.end()) {
-            trie.map[key] = llama_trie();
+static std::string llama_escape_whitespace(const std::string& text) { // returns text with a leading marker and each run of ' ' replaced by one marker; marker = bytes E2 96 81 (UTF-8 for U+2581)
+    std::string result;
+    bool escaping = false; // true while inside a run of spaces that has already produced its marker
+    result += char(0xe2); // unconditional leading marker, emitted even when text is empty
+    result += char(0x96);
+    result += char(0x81);
+    for (size_t offs = 0; offs < text.length(); ++offs) {
+        if (text[offs] == ' ') {
+            if (!escaping) { // only the first space of a run emits a marker, so consecutive spaces collapse into one
+                result += char(0xe2);
+                result += char(0x96);
+                result += char(0x81);
+                escaping = true;
+            }
+        }
+        else {
+            escaping = false;
+            result += text[offs]; // every non-space byte is copied through unchanged
         }
-        llama_trie_insert(trie.map.at(key), text, offs + char_len);
     }
+    return result;
 }
 
-size_t llama_trie_find(const struct llama_trie& trie, const std::string & text, size_t offs) {
-    if (offs < text.size()) {
-        size_t char_len = utf8_len(text[offs]);
-        std::string key = text.substr(offs, char_len);
-        if (trie.map.find(key) != trie.map.end()) {
-            return char_len + llama_trie_find(trie.map.at(key), text, offs + char_len);
-        }
-    }
-    return 0;
+static std::string llama_unescape_whitespace(const std::string& word) { // maps one leading marker (bytes E2 96 81) back to a single ' '
+    if (word.length() >= 3 &&
+        word[0] == char(0xe2) &&
+        word[1] == char(0x96) &&
+        word[2] == char(0x81)) {
+        return std::string(" ") + word.substr(3); // only the first three bytes are examined; markers later in word are left as-is
+    } 
+    return word; // no leading marker: returned unchanged
 }
 
 // original implementation:
@@ -1832,13 +1834,12 @@ struct llama_tokenizer {
     llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
-        // split string into utf8 chars / token?
+        // split string into utf8 chars
         int index = 0;
         size_t offs = 0;
         while (offs < text.size()) {
             llama_sp_symbol sym;
             size_t len = utf8_len(text[offs]);
-            // size_t len = llama_trie_find(vocab_.trie, text, offs);
             if (len == 0) {
                 len = utf8_len(text[offs]);
             }
@@ -1908,7 +1909,7 @@ struct llama_tokenizer {
 
         if (p == rev_merge.end()) {
             // output any symbols that did not form tokens as bytes.
-            for (int j = 0; j < (int) symbol.n; ++j) {
+            for (int j = 0; j < (int)symbol.n; ++j) {
                 llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
                 output.push_back(token_id);
             }
@@ -1954,18 +1955,25 @@ struct llama_tokenizer {
     std::map<std::string, std::pair<int, int> > rev_merge;
 };
 
-static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
+static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) { // escape: run raw_text through llama_escape_whitespace before tokenizing
     llama_tokenizer tokenizer(vocab);
     std::vector<llama_vocab::id> output;
 
-    if (text.empty()) {
+    if (raw_text.empty()) { // empty input yields no tokens at all; the BOS push below is skipped too
         return output;
     }
 
     if (bos) {
         output.push_back(llama_token_bos());
     }
 
+    std::string text;
+    if (escape) {
+        text = llama_escape_whitespace(raw_text);
+    } else {
+        text = raw_text; // tokenized exactly as given
+    }
+
     tokenizer.tokenize(text, output);
     return output;
 }
@@ -3620,7 +3628,7 @@ int llama_tokenize_with_model(
                  llama_token * tokens,
                          int   n_max_tokens,
                         bool   add_bos) {
-    auto res = llama_tokenize(model->vocab, text, add_bos);
+    auto res = llama_tokenize(model->vocab, text, add_bos, true);
 
     if (n_max_tokens < (int) res.size()) {
         fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3643,6 +3651,27 @@ int llama_tokenize(
     return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
 }
 
+int llama_tokenize_bpe( // tokenizes text with escape=false, i.e. without llama_escape_whitespace; returns the number of tokens written
+        struct llama_context * ctx,
+                  const char * text,
+                 llama_token * tokens,
+                         int   n_max_tokens,
+                        bool   add_bos) {
+    auto res = llama_tokenize(ctx->model.vocab, text, add_bos, false);
+
+    if (n_max_tokens < (int) res.size()) {
+        fprintf(stderr, "%s: too many tokens\n", __func__);
+        return -((int) res.size()); // buffer too small: returns the negated token count and writes nothing to tokens
+    }
+
+    for (size_t i = 0; i < res.size(); i++) {
+        tokens[i] = res[i];
+    }
+
+    return res.size();
+}
+
+
 int llama_n_vocab_from_model(const struct llama_model * model) {
     return model->vocab.id_to_token.size();
 }
@@ -3696,18 +3725,26 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
 
-const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
+std::string llama_token_to_str_with_model(const struct llama_model * model, llama_token token) { // text of token with a leading whitespace marker turned back into ' '
-    if (token >= llama_n_vocab_from_model(model)) {
-        return nullptr;
+    if (token < 0 || token >= llama_n_vocab_from_model(model)) { // reject ids outside [0, n_vocab) before indexing id_to_token
+        return std::string(); // was `return nullptr`, which builds a std::string from a null pointer (undefined behavior)
     }
 
-    return model->vocab.id_to_token[token].tok.c_str();
+    return llama_unescape_whitespace(model->vocab.id_to_token[token].tok);
 }
 
-const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) { // forwards to llama_token_to_str_with_model using ctx->model
     return llama_token_to_str_with_model(&ctx->model, token);
 }
 
+std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token) { // raw vocabulary entry for token, with no whitespace unescaping
+    if (token < 0 || token >= llama_n_vocab_from_model(&ctx->model)) { // reject ids outside [0, n_vocab) before indexing id_to_token
+        return std::string(); // was `return nullptr`, which builds a std::string from a null pointer (undefined behavior)
+    }
+
+    return ctx->model.vocab.id_to_token[token].tok;
+}
+
 llama_token llama_token_bos() {
     return 1;
 }
diff --git a/llama.h b/llama.h
@@ -11,6 +11,7 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
+#include <string>
 
 #ifdef LLAMA_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
@@ -278,6 +279,13 @@ extern "C" {
                              int   n_max_tokens,
                             bool   add_bos);
 
+    LLAMA_API int llama_tokenize_bpe(
+            struct llama_context * ctx,
+                      const char * text,
+                     llama_token * tokens,
+                             int   n_max_tokens,
+                            bool   add_bos);
+
     LLAMA_API int llama_tokenize_with_model(
         const struct llama_model * model,
                       const char * text,
@@ -319,11 +327,15 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(
+    LLAMA_API std::string llama_token_to_str(
+            const struct llama_context * ctx,
+                           llama_token   token);
+
+    LLAMA_API std::string llama_token_to_str_bpe(
             const struct llama_context * ctx,
                            llama_token   token);
 
-    LLAMA_API const char * llama_token_to_str_with_model(
+    LLAMA_API std::string llama_token_to_str_with_model(
               const struct llama_model * model,
                            llama_token   token);
 
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
diff --git a/tests/test-tokenizer-1.cpp b/tests/test-tokenizer-1.cpp

Original file line number	Diff line number	Diff line change
`@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {`
`67`	`67`	`fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());`
`68`	`68`	`fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());`
`69`	`69`	`for (int i = 0; i < (int) embd_inp.size(); i++) {`
`70`		`- fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));`
	`70`	`+ fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());`
`71`	`71`	`}`
`72`	`72`	`fprintf(stderr, "\n");`
`73`	`73`	`}`
Original file line number	Diff line number	Diff line change
`@@ -102,7 +102,7 @@ int main(int argc, char ** argv)`
`102`	`102`
`103`	`103`	`for( auto id : tokens_list )`
`104`	`104`	`{`
`105`		`- printf( "%s" , llama_token_to_str( ctx , id ) );`
	`105`	`+ printf( "%s" , llama_token_to_str( ctx , id ).c_str() );`
`106`	`106`	`}`
`107`	`107`
`108`	`108`	`fflush(stdout);`
`@@ -162,7 +162,7 @@ int main(int argc, char ** argv)`
`162`	`162`	`}`
`163`	`163`
`164`	`164`	`// Print the new token :`
`165`		`- printf( "%s" , llama_token_to_str( ctx , new_token_id ) );`
	`165`	`+ printf( "%s" , llama_token_to_str( ctx , new_token_id ).c_str() );`
`166`	`166`	`fflush( stdout );`
`167`	`167`
`168`	`168`	`// Push this new token for next evaluation :`
Original file line number	Diff line number	Diff line change
`@@ -1959,7 +1959,7 @@ void print_matrix(struct ggml_tensor * probs) {`
`1959`	`1959`
`1960`	`1960`
`1961`	`1961`	`void print_token(struct llama_context * ctx, llama_token token) {`
`1962`		`- printf("%s", llama_token_to_str(ctx, token));`
	`1962`	`+ printf("%s", llama_token_to_str(ctx, token).c_str());`
`1963`	`1963`	`}`
`1964`	`1964`
`1965`	`1965`	`void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {`
`@@ -2198,17 +2198,17 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto`
`2198`	`2198`	`const char * in = buf.data();`
`2199`	`2199`	`const char * end = buf.data() + buf.size();`
`2200`	`2200`	`for (int i = 0; i < (int) out.size(); ++i) {`
`2201`		`- const char * s = llama_token_to_str(lctx, out[i]);`
`2202`		`- int len = strlen(s);`
	`2201`	`+ std::string s = llama_token_to_str(lctx, out[i]);`
	`2202`	`+ int len = s.length();`
`2203`	`2203`	`if (in >= end) {`
`2204`	`2204`	`printf("%s: unexpected end of original text.\n", __func__);`
`2205`	`2205`	`break;`
`2206`	`2206`	`}`
`2207`		`- const bool matches = (strncmp(in, s, len) == 0);`
	`2207`	`+ const bool matches = (strncmp(in, s.c_str(), len) == 0);`
`2208`	`2208`	`if (matches) {`
`2209`	`2209`	`in += len;`
`2210`	`2210`	`} else {`
`2211`		`- printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s);`
	`2211`	`+ printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s.c_str());`
`2212`	`2212`	`}`
`2213`	`2213`	`}`
`2214`	`2214`	`}`