From a808370c5813ae41aed0028e5b298f608df0538f Mon Sep 17 00:00:00 2001
From: Anisse Astier
Date: Mon, 29 Apr 2024 21:57:07 +0200
Subject: [PATCH] examples : new program to verify gguf tokenizer parameters

This program verifies that a given gguf model file can tokenize all
potentially valid characters. Since llama.cpp currently raises an
exception when tokenization is not possible[1], this tool helps verify
that valid ascii and utf-8 input will always be tokenized properly.

[1] https://github.com/ggerganov/llama.cpp/issues/2580
---
 examples/CMakeLists.txt                       |  1 +
 examples/tokenizer-verifier/CMakeLists.txt    |  5 ++
 .../tokenizer-verifier/tokenizer-verifier.cpp | 82 ++++++++++++++++++++
 3 files changed, 88 insertions(+)
 create mode 100644 examples/tokenizer-verifier/CMakeLists.txt
 create mode 100644 examples/tokenizer-verifier/tokenizer-verifier.cpp

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index f421769cc2f0a..a917dbfda9d36 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -31,6 +31,7 @@ else()
 endif()
 add_subdirectory(main)
 add_subdirectory(tokenize)
+add_subdirectory(tokenizer-verifier)
 add_subdirectory(parallel)
 add_subdirectory(perplexity)
 add_subdirectory(quantize)
diff --git a/examples/tokenizer-verifier/CMakeLists.txt b/examples/tokenizer-verifier/CMakeLists.txt
new file mode 100644
index 0000000000000..b29bb36c44a67
--- /dev/null
+++ b/examples/tokenizer-verifier/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET tokenizer-verifier)
+add_executable(${TARGET} tokenizer-verifier.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/tokenizer-verifier/tokenizer-verifier.cpp b/examples/tokenizer-verifier/tokenizer-verifier.cpp
new file mode 100644
index 0000000000000..eb2f1ba4a658e
--- /dev/null
+++ b/examples/tokenizer-verifier/tokenizer-verifier.cpp
@@ -0,0 +1,82 @@
+#include "common.h"
+#include "llama.h"
+
+#include <cstdio>
+#include <string>
+#include <vector>
+
+static int unicode_to_utf8(int codepoint, char *dest) {
+  // https://stackoverflow.com/a/4609989 — who needs iconv?
+  if (codepoint < 0x80) {
+    *dest++ = codepoint;
+  } else if (codepoint < 0x800) {
+    *dest++ = 192 + codepoint / 64, *dest++ = 128 + codepoint % 64;
+    // we also support reserved utf-16 surrogates 0xd800 - 0xdfff for simplicity
+  } else if (codepoint < 0x10000) {
+    *dest++ = 224 + codepoint / 4096, *dest++ = 128 + codepoint / 64 % 64,
+    *dest++ = 128 + codepoint % 64;
+  } else if (codepoint < 0x110000) {
+    *dest++ = 240 + codepoint / 262144, *dest++ = 128 + codepoint / 4096 % 64,
+    *dest++ = 128 + codepoint / 64 % 64, *dest++ = 128 + codepoint % 64;
+  } else {
+    return 1;
+  }
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  if (argc < 2) {
+    printf("usage: %s MODEL_PATH\n", argv[0]);
+    return 1;
+  }
+
+  const char *model_path = argv[1];
+
+  llama_backend_init();
+
+  llama_model_params model_params = llama_model_default_params();
+  model_params.vocab_only = true;
+  llama_model *model = llama_load_model_from_file(model_path, model_params);
+  if (model == NULL) {
+    fprintf(stderr, "error: failed to load model '%s'\n", model_path);
+    return 1;
+  }
+
+  std::vector<llama_token> tokens;
+
+  int failed_ascii = 0;
+  int ascii_max = 127;
+  for (int c = 0; c <= ascii_max; c++) {
+    const char prompt[] = {(char)c, '\0'};
+    try {
+      tokens = ::llama_tokenize(model, prompt, false, true);
+    } catch (...) {
+      printf("%#x -> Tokenization failed for char '%c'\n", c, (char)c);
+      failed_ascii += 1;
+      continue;
+    }
+  }
+  printf("%d/%d 7-bit ascii characters could not be tokenized\n", failed_ascii, ascii_max + 1);
+
+  int failed_unicode = 0;
+  int utf8_max = 0x10FFFF;
+  // Now let's do all potential codepoints
+  for (int cp = 0; cp <= utf8_max; cp++) {
+    char buf[5] = {};
+    if (unicode_to_utf8(cp, buf)) {
+      printf("Impossible to encode codepoint %#x\n", cp);
+      continue;
+    }
+    try {
+      tokens = ::llama_tokenize(model, buf, false, true);
+    } catch (...) {
+      // printf("%#x -> Tokenization failed for codepoint '%s'\n", cp, buf);
+      failed_unicode += 1;
+      continue;
+    }
+  }
+  printf("%d/%d potential unicode codepoints not tokenized\n", failed_unicode,
+         utf8_max + 1);
+
+  return (failed_ascii != 0 || failed_unicode != 0);
+}
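
A minimal usage sketch, assuming the standard llama.cpp CMake build
layout; the model path below is a placeholder for any gguf file that
contains a vocabulary:

    # configure and build only this example's target
    cmake -B build
    cmake --build build --target tokenizer-verifier

    # run against a gguf file; only its vocabulary is read
    ./build/bin/tokenizer-verifier path/to/model.gguf

Because model_params.vocab_only is set, only the tokenizer data is
loaded from the file, so no tensor weights are needed. The program
exits with a non-zero status if any ascii character or unicode
codepoint failed to tokenize, which also makes it usable as a CI check.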