diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index b019c1e3dc59f..4ad4d8672fbd7 100644 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -56,6 +56,7 @@ class TOKENIZER_TYPE(IntEnum): { "name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, { "name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, { "name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", }, + { "name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", }, ] # make directory "models/tokenizers" if it doesn't exist @@ -94,6 +95,14 @@ def download_file_with_auth(url, token, save_path): save_path = f"models/tokenizers/{name}/tokenizer.json" download_file_with_auth(url, token, save_path) + # if downloaded file is less than 1KB, we likely need to download an LFS instead + if os.path.getsize(save_path) < 1024: + # remove the file + os.remove(save_path) + url = f"{repo}/resolve/main/tokenizer.json" + save_path = f"models/tokenizers/{name}/tokenizer.json" + download_file_with_auth(url, token, save_path) + if tokt == TOKENIZER_TYPE.SPM: url = f"{repo}/resolve/main/tokenizer.model" save_path = f"models/tokenizers/{name}/tokenizer.model" diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 2f146d7302a78..9ce88d2b3f70e 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -263,6 +263,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: # NOTE: this function is generated by convert-hf-to-gguf-update.py # do not modify it manually! # ref: https://github.com/ggerganov/llama.cpp/pull/6920 + def get_vocab_base_pre(self, tokenizer) -> str: # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that # is specific for the BPE pre-tokenizer used by the model @@ -306,6 +307,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": # ref: https://huggingface.co/openai-community/gpt2 res = "gpt-2" + if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": + # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 + res = "command-r" if res is None: print("\n") diff --git a/llama.cpp b/llama.cpp index 18d6297ce1dfd..cafb46071047b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4383,6 +4383,9 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "gpt-2") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2; + } else if ( + tokenizer_pre == "command-r") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } @@ -12240,6 +12243,11 @@ struct llm_tokenizer_bpe { "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", }); break; + case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: + word_collection = unicode_regex_split(text, { + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + }); + break; default: // default regex for BPE tokenization pre-processing word_collection = unicode_regex_split(text, { diff --git a/llama.h b/llama.h index 059d78f115c6d..62afe9b512e4b 100644 --- a/llama.h +++ b/llama.h @@ -79,6 +79,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_MPT = 5, LLAMA_VOCAB_PRE_TYPE_STARCODER = 6, LLAMA_VOCAB_PRE_TYPE_GPT2 = 7, + LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 8, }; // note: these values should be synchronized with ggml_rope diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d23e7f771d054..edcbf5054f1ad 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -81,6 +81,7 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE #llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf) # build test-tokenizer-1-bpe target once and add many tests add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)