From a34ace9f525ba2d90a2cddf151da8e1df15cc8ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?DAN=E2=84=A2?=
Date: Wed, 1 May 2024 21:17:08 -0400
Subject: [PATCH 1/3] Add BPE pre-tokenization for Command-R.

---
 convert-hf-to-gguf-update.py | 1 +
 convert-hf-to-gguf.py        | 4 ++++
 llama.cpp                    | 8 ++++++++
 llama.h                      | 1 +
 4 files changed, 14 insertions(+)

diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index b019c1e3dc59f..f4774003787ae 100644
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -56,6 +56,7 @@ class TOKENIZER_TYPE(IntEnum):
     { "name": "mpt",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
     { "name": "starcoder",  "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
     { "name": "gpt-2",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+    { "name": "command-r",  "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
 ]
 
 # make directory "models/tokenizers" if it doesn't exist
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 2f146d7302a78..9ce88d2b3f70e 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -263,6 +263,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
     # NOTE: this function is generated by convert-hf-to-gguf-update.py
     # do not modify it manually!
     # ref: https://github.com/ggerganov/llama.cpp/pull/6920
+
     def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
         # is specific for the BPE pre-tokenizer used by the model
@@ -306,6 +307,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
             # ref: https://huggingface.co/openai-community/gpt2
             res = "gpt-2"
+        if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
+            # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
+            res = "command-r"
 
         if res is None:
             print("\n")
diff --git a/llama.cpp b/llama.cpp
index 18d6297ce1dfd..cafb46071047b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4383,6 +4383,9 @@ static void llm_load_vocab(
             } else if (
                     tokenizer_pre == "gpt-2") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else if (
+                    tokenizer_pre == "command-r") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -12240,6 +12243,11 @@ struct llm_tokenizer_bpe {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 });
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+                word_collection = unicode_regex_split(text, {
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                });
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 word_collection = unicode_regex_split(text, {
diff --git a/llama.h b/llama.h
index 059d78f115c6d..62afe9b512e4b 100644
--- a/llama.h
+++ b/llama.h
@@ -79,6 +79,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_MPT       = 5,
         LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
         LLAMA_VOCAB_PRE_TYPE_GPT2      = 7,
+        LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 8,
     };
 
     // note: these values should be synchronized with ggml_rope
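
A note on the mechanics of patch 1, for readers following along. The converter
identifies the pre-tokenizer by encoding a fixed probe string and hashing the
resulting token IDs (the chkhsh values above), and the new
LLAMA_VOCAB_PRE_TYPE_COMMAND_R case reuses the GPT-2-style split regex. The
Python sketch below is a simplified reconstruction under those assumptions, not
the exact code from convert-hf-to-gguf-update.py; the probe text is a
placeholder, so it will not reproduce the 9c2227e4... hash above, and loading
the Cohere tokenizer may require accepting the model license on Hugging Face.

    import regex  # third-party "regex" package; unlike "re" it supports \p{...} classes
    from hashlib import sha256
    from transformers import AutoTokenizer

    # The GPT-2-style pattern reused for LLAMA_VOCAB_PRE_TYPE_COMMAND_R
    # (single backslashes here; the C++ string literal doubles them).
    pre_tokenize = regex.compile(
        r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)"
    )
    print(pre_tokenize.findall("Hello world, it's 2024!"))
    # -> ['Hello', ' world', ',', ' it', "'s", ' 2024', '!']

    # chkhsh-style fingerprint: hash the token IDs produced for a probe text.
    tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
    chktok = tokenizer.encode("placeholder probe text")  # not the script's real probe string
    chkhsh = sha256(str(chktok).encode()).hexdigest()
    print(chkhsh)

Hashing token IDs rather than comparing tokenizer.json files exercises the
pre-tokenizer directly: two models can ship near-identical vocabularies yet
split text differently, which is exactly what this check needs to distinguish.
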
From 8242447b7bdea4f4f37a35468114ae905f4748c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?DAN=E2=84=A2?=
Date: Thu, 2 May 2024 07:17:05 -0400
Subject: [PATCH 2/3] Support handling of LFS for download.

---
 convert-hf-to-gguf-update.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index f4774003787ae..4ad4d8672fbd7 100644
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -95,6 +95,14 @@ def download_file_with_auth(url, token, save_path):
     save_path = f"models/tokenizers/{name}/tokenizer.json"
     download_file_with_auth(url, token, save_path)
 
+    # if the downloaded file is less than 1KB, it is likely an LFS pointer; download the actual file instead
+    if os.path.getsize(save_path) < 1024:
+        # remove the pointer file
+        os.remove(save_path)
+        url = f"{repo}/resolve/main/tokenizer.json"
+        save_path = f"models/tokenizers/{name}/tokenizer.json"
+        download_file_with_auth(url, token, save_path)
+
     if tokt == TOKENIZER_TYPE.SPM:
         url = f"{repo}/resolve/main/tokenizer.model"
         save_path = f"models/tokenizers/{name}/tokenizer.model"

From 9cbad1b2cf4852fc6cd7ff8eab3c41734cea6e07 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?DAN=E2=84=A2?=
Date: Thu, 2 May 2024 07:21:11 -0400
Subject: [PATCH 3/3] Add test for command-r tokenizer.

---
 tests/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index d23e7f771d054..edcbf5054f1ad 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -81,6 +81,7 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE
 #llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt        ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder   ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r   ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
 
 # build test-tokenizer-1-bpe target once and add many tests
 add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
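
A design note on the LFS handling in patch 2: for LFS-tracked files, Hugging
Face's raw/ endpoint returns only the small Git LFS pointer record stored in
the repository, while resolve/ follows the pointer to the actual object, which
is presumably why the retry switches to the resolve/ URL. The <1KB size test
works because pointer files are on the order of 130 bytes, but a content check
is more direct, since every LFS pointer begins with a fixed spec line. A
minimal sketch of such a check, offered as an alternative to (not part of) the
patch:

    def is_lfs_pointer(path: str) -> bool:
        # Git LFS pointer files start with a fixed identifier line,
        # "version https://git-lfs.github.com/spec/v1", followed by the
        # object's oid and size, so the header is a reliable signature.
        with open(path, "rb") as f:
            return f.read(64).startswith(b"version https://git-lfs.github.com/spec/")

In the patched loop this would stand in for the os.path.getsize(save_path) < 1024
test before removing the file and retrying the download.
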