
Commit a93297c

thement authored and jxhor committed
Implement non-greedy tokenizer that tries to maximize token lengths (ggml-org#242)
* Implement non-greedy tokenizer that tries to maximize token lengths

* Insert single space in front of the prompt - this is to match original llama tokenizer behavior

---------

Co-authored-by: Jakub Horak <[email protected]>
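For context, a minimal sketch of the idea the commit title describes: a forward pass that scores every vocabulary match and prefers segmentations built from longer tokens, followed by a backward pass that recovers the chosen tokens. Only the `token_to_id` map appears in the diff below; the function name, the quadratic scoring rule, the empty-result fallback, and the placement of the prepended space are illustrative assumptions, not the file's actual code.

#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical sketch of a length-maximizing tokenizer in the spirit of
// this commit; everything except the token_to_id lookup is an assumption.
std::vector<int> tokenize_longest(
        const std::unordered_map<std::string, int> & token_to_id,
        std::string text) {
    text.insert(0, 1, ' ');              // single leading space, matching
                                         // the original llama tokenizer
    const int len = (int) text.size();

    std::vector<int> score(len + 1, -1); // best score for text[0..i)
    std::vector<int> prev(len + 1, -1);  // start of the token ending at i
    score[0] = 0;

    // Forward pass: at each reachable position, try every substring and
    // keep the segmentation that favors longer tokens (quadratic reward,
    // so one long token beats several short ones over the same span).
    for (int i = 0; i < len; i++) {
        if (score[i] < 0) continue;      // position i is unreachable
        for (int sub_len = 1; sub_len <= len - i; sub_len++) {
            auto it = token_to_id.find(text.substr(i, sub_len));
            if (it == token_to_id.end()) continue;
            const int s = score[i] + sub_len * sub_len;
            if (s > score[i + sub_len]) {
                score[i + sub_len] = s;
                prev[i + sub_len] = i;
            }
        }
    }

    if (score[len] < 0) return {};       // no full segmentation found

    // Backward pass: walk prev[] from the end to recover the token ids.
    std::vector<int> ids;
    for (int i = len; i > 0; i = prev[i]) {
        ids.push_back(token_to_id.at(text.substr(prev[i], i - prev[i])));
    }
    return {ids.rbegin(), ids.rend()};
}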
1 parent 7b0382e - commit a93297c

File tree

1 file changed: +1, -1 lines changed

utils.cpp

Lines changed: 1 addition & 1 deletion
@@ -302,7 +302,7 @@ std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::st
     // Forward pass
     for (int i = 0; i < len; i++) {
         int max_len = std::min(len - i, MAX_TOKEN_LEN);
-        for (int sub_len = 1; sub_len <= max_len; sub_len++) {
+        for (int sub_len = 1; sub_len <= len - i; sub_len++) {
             auto sub = text.substr(i, sub_len);
             auto token = vocab.token_to_id.find(sub);
             if (token != vocab.token_to_id.end()) {
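Net effect of the one-line change: the inner loop previously capped candidate substrings at MAX_TOKEN_LEN characters, and now scans every substring from position i to the end of the text. A vocabulary entry longer than MAX_TOKEN_LEN can therefore still be matched, at the cost of a longer inner loop (and, within this hunk, max_len is apparently left unused).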

0 commit comments
