From a34ace9f525ba2d90a2cddf151da8e1df15cc8ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?DAN=E2=84=A2?=
Date: Wed, 1 May 2024 21:17:08 -0400
Subject: [PATCH 1/3] Add BPE pre-tokenization for Command-R.

---
 convert-hf-to-gguf-update.py | 1 +
 convert-hf-to-gguf.py        | 4 ++++
 llama.cpp                    | 8 ++++++++
 llama.h                      | 1 +
 4 files changed, 14 insertions(+)

diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index b019c1e3dc59f..f4774003787ae 100644
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -56,6 +56,7 @@ class TOKENIZER_TYPE(IntEnum):
     { "name": "mpt",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
     { "name": "starcoder",  "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
     { "name": "gpt-2",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+    { "name": "command-r",  "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
 ]
 
 # make directory "models/tokenizers" if it doesn't exist
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 2f146d7302a78..9ce88d2b3f70e 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -263,6 +263,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
     # NOTE: this function is generated by convert-hf-to-gguf-update.py
     # do not modify it manually!
     # ref: https://github.com/ggerganov/llama.cpp/pull/6920
+
     def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
         # is specific for the BPE pre-tokenizer used by the model
@@ -306,6 +307,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
             # ref: https://huggingface.co/openai-community/gpt2
             res = "gpt-2"
+        if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
+            # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
+            res = "command-r"
 
         if res is None:
             print("\n")
diff --git a/llama.cpp b/llama.cpp
index 18d6297ce1dfd..cafb46071047b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4383,6 +4383,9 @@ static void llm_load_vocab(
             } else if (
                     tokenizer_pre == "gpt-2") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else if (
+                    tokenizer_pre == "command-r") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -12240,6 +12243,11 @@ struct llm_tokenizer_bpe {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 });
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+                word_collection = unicode_regex_split(text, {
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                });
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 word_collection = unicode_regex_split(text, {
diff --git a/llama.h b/llama.h
index 059d78f115c6d..62afe9b512e4b 100644
--- a/llama.h
+++ b/llama.h
@@ -79,6 +79,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_MPT       = 5,
         LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
         LLAMA_VOCAB_PRE_TYPE_GPT2      = 7,
+        LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 8,
     };
 
     // note: these values should be synchronized with ggml_rope
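
A note on the mechanics of patch 1, for readers following along. The converter
identifies the pre-tokenizer by encoding a fixed probe string and hashing the
resulting token IDs (the chkhsh values above), and the new
LLAMA_VOCAB_PRE_TYPE_COMMAND_R case reuses the GPT-2-style split regex. The
Python sketch below is a simplified reconstruction under those assumptions, not
the exact code from convert-hf-to-gguf-update.py; the probe text is a
placeholder, so it will not reproduce the 9c2227e4... hash above, and loading
the Cohere tokenizer may require accepting the model license on Hugging Face.

    import regex  # third-party "regex" package; unlike "re" it supports \p{...} classes
    from hashlib import sha256
    from transformers import AutoTokenizer

    # The GPT-2-style pattern reused for LLAMA_VOCAB_PRE_TYPE_COMMAND_R
    # (single backslashes here; the C++ string literal doubles them).
    pre_tokenize = regex.compile(
        r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)"
    )
    print(pre_tokenize.findall("Hello world, it's 2024!"))
    # -> ['Hello', ' world', ',', ' it', "'s", ' 2024', '!']

    # chkhsh-style fingerprint: hash the token IDs produced for a probe text.
    tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
    chktok = tokenizer.encode("placeholder probe text")  # not the script's real probe string
    chkhsh = sha256(str(chktok).encode()).hexdigest()
    print(chkhsh)

Hashing token IDs rather than comparing tokenizer.json files exercises the
pre-tokenizer directly: two models can ship near-identical vocabularies yet
split text differently, which is exactly what this check needs to distinguish.
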
From 8242447b7bdea4f4f37a35468114ae905f4748c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?DAN=E2=84=A2?=
Date: Thu, 2 May 2024 07:17:05 -0400
Subject: [PATCH 2/3] Support handling of LFS for download.

---
 convert-hf-to-gguf-update.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index f4774003787ae..4ad4d8672fbd7 100644
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -95,6 +95,14 @@ def download_file_with_auth(url, token, save_path):
     save_path = f"models/tokenizers/{name}/tokenizer.json"
     download_file_with_auth(url, token, save_path)
 
+    # if the downloaded file is less than 1KB, it is likely an LFS pointer; download the actual file instead
+    if os.path.getsize(save_path) < 1024:
+        # remove the pointer file
+        os.remove(save_path)
+        url = f"{repo}/resolve/main/tokenizer.json"
+        save_path = f"models/tokenizers/{name}/tokenizer.json"
+        download_file_with_auth(url, token, save_path)
+
     if tokt == TOKENIZER_TYPE.SPM:
         url = f"{repo}/resolve/main/tokenizer.model"
         save_path = f"models/tokenizers/{name}/tokenizer.model"

From 9cbad1b2cf4852fc6cd7ff8eab3c41734cea6e07 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?DAN=E2=84=A2?=
Date: Thu, 2 May 2024 07:21:11 -0400
Subject: [PATCH 3/3] Add test for command-r tokenizer.

---
 tests/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index d23e7f771d054..edcbf5054f1ad 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -81,6 +81,7 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE
 #llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt        ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder   ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r   ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
 
 # build test-tokenizer-1-bpe target once and add many tests
 add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
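
A design note on the LFS handling in patch 2: for LFS-tracked files, Hugging
Face's raw/ endpoint returns only the small Git LFS pointer record stored in
the repository, while resolve/ follows the pointer to the actual object, which
is presumably why the retry switches to the resolve/ URL. The <1KB size test
works because pointer files are on the order of 130 bytes, but a content check
is more direct, since every LFS pointer begins with a fixed spec line. A
minimal sketch of such a check, offered as an alternative to (not part of) the
patch:

    def is_lfs_pointer(path: str) -> bool:
        # Git LFS pointer files start with a fixed identifier line,
        # "version https://git-lfs.github.com/spec/v1", followed by the
        # object's oid and size, so the header is a reliable signature.
        with open(path, "rb") as f:
            return f.read(64).startswith(b"version https://git-lfs.github.com/spec/")

In the patched loop this would stand in for the os.path.getsize(save_path) < 1024
test before removing the file and retrying the download.
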