@@ -2618,6 +2618,7 @@ std::string llama_chat_apply_template(const struct llama_model * model,
         const std::vector<llama_chat_msg> & msgs,
         bool add_ass) {
     int alloc_size = 0;
+    bool fallback = false; // indicate if we must fall back to default chatml
     std::vector<llama_chat_message> chat;
     for (auto & msg : msgs) {
         chat.push_back({msg.role.c_str(), msg.content.c_str()});
@@ -2630,10 +2631,26 @@ std::string llama_chat_apply_template(const struct llama_model * model,
     // run the first time to get the total output length
     int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());

+    // error: chat template is not supported
+    if (res < 0) {
+        if (ptr_tmpl != nullptr) {
+            // if the custom "tmpl" is not supported, we throw an error
+            // this check is intentionally redundant, since we cannot be sure the user validated the custom template with llama_chat_verify_template()
+            throw std::runtime_error("this custom template is not supported");
+        } else {
+            // if the built-in template is not supported, we default to chatml
+            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+            fallback = true;
+        }
+    }
+
     // if it turns out that our buffer is too small, we resize it
     if ((size_t) res > buf.size()) {
         buf.resize(res);
-        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+        res = llama_chat_apply_template(
+            fallback ? nullptr : model,
+            fallback ? "chatml" : ptr_tmpl,
+            chat.data(), chat.size(), add_ass, buf.data(), buf.size());
     }

     std::string formatted_chat(buf.data(), res);
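For context, a minimal caller-side sketch of how this change behaves (not part of the diff). It assumes the surrounding common.cpp helpers referenced above: llama_chat_msg with role/content string fields, llama_chat_verify_template(), and the std::string-based llama_chat_apply_template() wrapper being patched here. It also assumes that an empty tmpl means "use the model's built-in template", consistent with the ptr_tmpl != nullptr check above; the helper name format_prompt is hypothetical.

// Caller-side sketch (assumption: not part of this diff, helper name is hypothetical)
#include <stdexcept>
#include <string>
#include <vector>

#include "common.h"

static std::string format_prompt(const struct llama_model * model,
                                 const std::string & custom_tmpl,
                                 const std::vector<llama_chat_msg> & msgs) {
    if (!custom_tmpl.empty()) {
        // validate up front so the wrapper's throw path is never reached;
        // the check inside the wrapper is intentionally redundant
        if (!llama_chat_verify_template(custom_tmpl)) {
            throw std::runtime_error("unsupported custom chat template");
        }
        return llama_chat_apply_template(model, custom_tmpl, msgs, /* add_ass */ true);
    }
    // empty template: use the model's built-in one; with this patch an
    // unrecognized built-in template silently falls back to chatml
    // instead of leaving the result length negative
    return llama_chat_apply_template(model, "", msgs, /* add_ass */ true);
}

The net effect of the patch: a bad custom template still fails loudly, while an unrecognized built-in template degrades gracefully to chatml, and the second (post-resize) call re-applies the same fallback choice so both passes format with the same template.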