Skip to content

Commit b8092c9

Browse files
ngxsonhodlen
authored and committed
llama : add llama_chat_apply_template() (ggml-org#5538)
* llama: add llama_chat_apply_template * test-chat-template: remove redundant vector * chat_template: do not use std::string for buffer * add clarification for llama_chat_apply_template * llama_chat_apply_template: add zephyr template * llama_chat_apply_template: correct docs * llama_chat_apply_template: use term "chat" everywhere * llama_chat_apply_template: change variable name to "tmpl"
1 parent 2d020d8 commit b8092c9

File tree

5 files changed

+211
-0
lines changed

5 files changed

+211
-0
lines changed

Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -867,3 +867,7 @@ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o te
867867
tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
868868
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
869869
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
870+
871+
tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
872+
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
873+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama.cpp

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12508,6 +12508,123 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
1250812508
return 0;
1250912509
}
1251012510

12511+
// Return a copy of "str" with leading and trailing whitespace removed.
// Whitespace is whatever isspace() reports; interior whitespace is kept.
// A string made only of whitespace (or an empty string) yields "".
static std::string trim(const std::string & str) {
    // isspace() is undefined for negative arguments other than EOF, and plain
    // char may be signed: bytes >= 0x80 (e.g. UTF-8 text) must be converted
    // to unsigned char before the call
    const auto is_space = [](char c) {
        return isspace(static_cast<unsigned char>(c)) != 0;
    };
    size_t start = 0;
    size_t end = str.size();
    while (start < end && is_space(str[start])) {
        start += 1;
    }
    while (end > start && is_space(str[end - 1])) {
        end -= 1;
    }
    return str.substr(start, end - start);
}
12523+
12524+
// Simple version of "llama_apply_chat_template" that only works with strings
12525+
// This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
12526+
static int32_t llama_chat_apply_template_internal(
12527+
const std::string & tmpl,
12528+
const std::vector<const llama_chat_message *> & chat,
12529+
std::string & dest, bool add_ass) {
12530+
// Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
12531+
std::stringstream ss;
12532+
if (tmpl.find("<|im_start|>") != std::string::npos) {
12533+
// chatml template
12534+
for (auto message : chat) {
12535+
ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
12536+
}
12537+
if (add_ass) {
12538+
ss << "<|im_start|>assistant\n";
12539+
}
12540+
} else if (tmpl.find("[INST]") != std::string::npos) {
12541+
// llama2 template and its variants
12542+
// [variant] support system message
12543+
bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
12544+
// [variant] space before + after response
12545+
bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
12546+
// [variant] add BOS inside history
12547+
bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
12548+
// [variant] trim spaces from the input message
12549+
bool strip_message = tmpl.find("content.strip()") != std::string::npos;
12550+
// construct the prompt
12551+
bool is_inside_turn = true; // skip BOS at the beginning
12552+
ss << "[INST] ";
12553+
for (auto message : chat) {
12554+
std::string content = strip_message ? trim(message->content) : message->content;
12555+
std::string role(message->role);
12556+
if (!is_inside_turn) {
12557+
is_inside_turn = true;
12558+
ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
12559+
}
12560+
if (role == "system") {
12561+
if (support_system_message) {
12562+
ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
12563+
} else {
12564+
// if the model does not support system message, we still include it in the first message, but without <<SYS>>
12565+
ss << content << "\n";
12566+
}
12567+
} else if (role == "user") {
12568+
ss << content << " [/INST]";
12569+
} else {
12570+
ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
12571+
is_inside_turn = false;
12572+
}
12573+
}
12574+
// llama2 templates seem to not care about "add_generation_prompt"
12575+
} else if (tmpl.find("<|user|>") != std::string::npos) {
12576+
// zephyr template
12577+
for (auto message : chat) {
12578+
ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
12579+
}
12580+
if (add_ass) {
12581+
ss << "<|assistant|>\n";
12582+
}
12583+
} else {
12584+
// template not supported
12585+
return -1;
12586+
}
12587+
dest = ss.str();
12588+
return dest.size();
12589+
}
12590+
12591+
// Format "chat" with a known chat template and copy the result into "buf".
//
// model   : only used when "tmpl" is nullptr, to look up the
//           "tokenizer.chat_template" metadata value; must not be nullptr in that case
// tmpl    : template text to use; takes precedence over the model's template
// chat    : array of "n_msg" messages
// add_ass : forwarded to llama_chat_apply_template_internal
// buf     : destination buffer; may be nullptr to only query the required size
// length  : capacity of "buf" in bytes
//
// Returns the size in bytes of the formatted prompt, or a negative value if
// the template is not supported. At most "length" bytes are copied with
// strncpy(), so when the returned size is >= "length" the content of "buf" is
// truncated and not NUL-terminated; the caller should grow the buffer and call again.
LLAMA_API int32_t llama_chat_apply_template(
    const struct llama_model * model,
    const char * tmpl,
    const struct llama_chat_message * chat,
    size_t n_msg,
    bool add_ass,
    char * buf,
    int32_t length) {
    std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
    if (tmpl == nullptr) {
        GGML_ASSERT(model != nullptr);
        // load template from model
        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
        std::string template_key = "tokenizer.chat_template";
        // the size passed must be the capacity of the buffer being filled;
        // curr_tmpl is still empty at this point, so its size would always be 0
        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
        if (res < 0) {
            // worst case: there is no information about template, we will use chatml by default
            curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
        } else {
            curr_tmpl = std::string(model_template.data(), model_template.size());
        }
    }
    // format the chat to string
    std::vector<const llama_chat_message *> chat_vec;
    chat_vec.resize(n_msg);
    for (size_t i = 0; i < n_msg; i++) {
        chat_vec[i] = &chat[i];
    }
    std::string formatted_chat;
    int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
    if (res < 0) {
        return res;
    }
    // skip the copy when there is nowhere to write: a null buffer would be
    // dereferenced, and a negative length would convert to a huge size_t
    if (buf != nullptr && length > 0) {
        strncpy(buf, formatted_chat.c_str(), length);
    }
    return res;
}
12627+
1251112628
struct llama_timings llama_get_timings(struct llama_context * ctx) {
1251212629
struct llama_timings result = {
1251312630
/*.t_start_ms =*/ 1e-3 * ctx->t_start_us,

llama.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,12 @@ extern "C" {
305305
int32_t n_eval;
306306
};
307307

308+
// One message of a chat, as consumed by llama_chat_apply_template()
struct llama_chat_message {
    const char * role;    // who is speaking, e.g. "system", "user" or "assistant"
    const char * content; // text of the message
};
typedef struct llama_chat_message llama_chat_message;
313+
308314
// Helpers for getting default parameters
309315
LLAMA_API struct llama_model_params llama_model_default_params(void);
310316
LLAMA_API struct llama_context_params llama_context_default_params(void);
@@ -699,6 +705,25 @@ extern "C" {
699705
char * buf,
700706
int32_t length);
701707

708+
/// Apply chat template. Inspired by hf apply_chat_template() on python.
709+
/// Both "model" and "tmpl" are optional, but at least one is required. "tmpl" has higher precedence than "model"
710+
/// NOTE: This function only supports some known jinja templates. It is not a jinja parser.
711+
/// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
712+
/// @param chat Pointer to a list of multiple llama_chat_message
713+
/// @param n_msg Number of llama_chat_message in this chat
714+
/// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
715+
/// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
716+
/// @param length The size of the allocated buffer
717+
/// @return The total number of bytes of the formatted prompt. If it is larger than the size of the buffer, you may need to re-alloc it and then re-apply the template.
718+
LLAMA_API int32_t llama_chat_apply_template(
719+
const struct llama_model * model,
720+
const char * tmpl,
721+
const struct llama_chat_message * chat,
722+
size_t n_msg,
723+
bool add_ass,
724+
char * buf,
725+
int32_t length);
726+
702727
//
703728
// Grammar
704729
//

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ endfunction()
2828
llama_build_and_test_executable(test-quantize-fns.cpp)
2929
llama_build_and_test_executable(test-quantize-perf.cpp)
3030
llama_build_and_test_executable(test-sampling.cpp)
31+
llama_build_and_test_executable(test-chat-template.cpp)
3132

3233
llama_build_executable(test-tokenizer-0-llama.cpp)
3334
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)

tests/test-chat-template.cpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#include <iostream>
2+
#include <string>
3+
#include <vector>
4+
#include <sstream>
5+
6+
#undef NDEBUG
7+
#include <cassert>
8+
9+
#include "llama.h"
10+
11+
// Smoke test for llama_chat_apply_template(): an unknown template must be
// rejected, and each known template must render the sample conversation so
// that the output contains the expected fragment.
int main(void) {
    llama_chat_message conversation[] = {
        {"system", "You are a helpful assistant"},
        {"user", "Hello"},
        {"assistant", "Hi there"},
        {"user", "Who are you"},
        {"assistant", " I am an assistant "},
        {"user", "Another question"},
    };
    // derived from the array so it cannot drift when messages are added or removed
    const size_t message_count = sizeof(conversation) / sizeof(conversation[0]);
    std::vector<std::string> templates = {
        // teknium/OpenHermes-2.5-Mistral-7B
        "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
        // mistralai/Mistral-7B-Instruct-v0.2
        "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
        // TheBloke/FusionNet_34Bx2_MoE-AWQ
        "{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <<SYS>>\\\\n' + messages[idx]['content'] + '\\\\n<</SYS>>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}",
        // bofenghuang/vigogne-2-70b-chat
        "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\\\n' + system_message + '\\\\n<</SYS>>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\\\n' + content.strip() + '\\\\n<</SYS>>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
    };
    std::vector<std::string> expected_substr = {
        "<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant",
        "[/INST]Hi there</s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
        "</s><s>[INST] Who are you [/INST] I am an assistant </s><s>[INST] Another question [/INST]",
        "[/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
    };
    // the two lists are indexed in parallel below
    assert(templates.size() == expected_substr.size());

    std::vector<char> formatted_chat(1024);
    int32_t res;

    // formats the sample conversation into the current formatted_chat buffer
    const auto apply = [&](const char * tmpl) {
        return llama_chat_apply_template(
            nullptr,
            tmpl,
            conversation,
            message_count,
            true,
            formatted_chat.data(),
            static_cast<int32_t>(formatted_chat.size())
        );
    };

    // test invalid chat template
    res = apply("INVALID TEMPLATE");
    assert(res < 0);

    for (size_t i = 0; i < templates.size(); i++) {
        const std::string & custom_template = templates[i];
        const std::string & substr = expected_substr[i];
        formatted_chat.resize(1024);
        res = apply(custom_template.c_str());
        // a negative result must fail here, not turn into a huge resize() below
        assert(res > 0);
        if (static_cast<size_t>(res) > formatted_chat.size()) {
            // the buffer was too small: grow it and re-apply, as the API documents
            formatted_chat.resize(res);
            res = apply(custom_template.c_str());
            assert(res > 0 && static_cast<size_t>(res) <= formatted_chat.size());
        }
        formatted_chat.resize(res);
        std::string output(formatted_chat.data(), formatted_chat.size());
        std::cout << output << "\n-------------------------\n";
        // expect the "formatted_chat" to contain pre-defined strings
        assert(output.find(substr) != std::string::npos);
    }
    return 0;
}

0 commit comments

Comments
 (0)