From c3aac4ef6a3e91cbf54906da706c92783f4195b5 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Sun, 9 Mar 2025 13:50:19 -0400 Subject: [PATCH 01/15] `tool-call`: Phi-4 support - Add system message if needed (per template requirement) - Add tools to system message (req'd by template) - Parse output: -- add tools to response when there is valid JSON between <|tool_call|> and the closing tag -- content outside of tool_call tags is added to the text portion of the response -- if there is no valid JSON, the entire content is added to the text portion of the response --- common/chat.cpp | 186 ++++++++++++++++++ common/chat.h | 3 +- models/templates/README.md | 1 + .../microsoft-Phi-4-mini-instruct.jinja | 1 + tests/test-chat.cpp | 30 +++ 5 files changed, 220 insertions(+), 1 deletion(-) create mode 100644 models/templates/microsoft-Phi-4-mini-instruct.jinja diff --git a/common/chat.cpp b/common/chat.cpp index 62ca26ad7609c..3ddf94545f8ff 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -448,6 +448,7 @@ std::string common_chat_format_name(common_chat_format format) { case COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING: return "Hermes 2 Pro (extract reasoning)"; case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B"; case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING: return "Command R7B (extract reasoning)"; + case COMMON_CHAT_FORMAT_PHI_4: return "Phi-4"; default: throw std::runtime_error("Unknown chat format"); } @@ -1356,6 +1357,184 @@ static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::s return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex); } +static common_chat_params common_chat_params_init_phi_4(const common_chat_template & tmpl, const struct templates_params & inputs) { + // Phi-4 has a unique format that expects tools in the system message with <|tool|> tags + // and returns function calls as a JSON object after <|tool_call|> tag + common_chat_params data; + + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + std::vector<std::string> tool_rules; + std::vector<std::string> tool_call_alts; + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + std::string name = function.at("name"); + auto parameters = function.at("parameters"); + builder.resolve_refs(parameters); + tool_rules.push_back(builder.add_schema(name + "-call", { + {"type", "object"}, + {"properties", { + {"name", {{"const", name}}}, + {"arguments", parameters}, + }}, + {"required", json::array({"name", "arguments"})}, + })); + }); + auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space"); + std::vector<std::string> alt_tags { + any_tool_call, + }; + tool_call_alts.push_back(any_tool_call); + auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | ")); + builder.add_rule("root", inputs.parallel_tool_calls ?
"(" + tool_call + ")+" : tool_call); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool_call|>"}); + data.preserved_tokens = { + "<|tool_call|>", + "", + }; + }); + + // For Phi-4, we need to inject tools into the system message + // because the template expects tools in the system message with <|tool|> tags + if (inputs.tools.empty()) { + // No tools, use normal approach + data.prompt = apply(tmpl, inputs.messages, json::array(), inputs.add_generation_prompt); + } else { + // Make a copy of messages that we can modify + json adjusted_messages = inputs.messages; + + // Extract just the function part of the OpenAI-formatted tools + json phi4_tools = json::array(); + foreach_function(inputs.tools, [&](const json & tool) { + phi4_tools.push_back(tool.at("function")); + }); + + // Phi-4 template expects tools in the system message with <|tool|> tags. + // Find the system message, or add one if it doesn't exist + bool found_system_msg = false; + for (auto & message : adjusted_messages) { + if (message.contains("role") && message["role"] == "system") { + // Add tools to the existing system message and update content to mention tools + message["tools"] = phi4_tools; + + // If the system message doesn't mention tools, append that information + std::string content = message["content"]; + if (content.find("tool") == std::string::npos && + content.find("function") == std::string::npos) { + message["content"] = content + " You have access to some tools."; + } + + found_system_msg = true; + break; + } + } + + // If no system message, add one with tools + if (!found_system_msg && !adjusted_messages.empty()) { + json system_msg = { + {"role", "system"}, + {"content", "You are a helpful assistant with access to tools.\nTo use a tool, respond in this format: <|tool_call|>{\"name\": \"foo\", \"arguments\": {\"a\": 1}}<|/tool_call|>"}, + {"tools", phi4_tools} + }; + // Insert system message at the beginning + adjusted_messages.insert(adjusted_messages.begin(), system_msg); + } + + // Apply template with tools embedded in system message, passing empty tools separately + data.prompt = apply(tmpl, adjusted_messages, json(), inputs.add_generation_prompt); + } + + data.format = COMMON_CHAT_FORMAT_PHI_4; + return data; +} + +static common_chat_msg common_chat_parse_phi_4(const std::string & input) { + common_chat_msg result; + result.role = "assistant"; + + std::string final_content = ""; + + const std::string opening_tag = "<|tool_call|>"; + const std::string closing_tag = ""; + + size_t start_pos = 0; + while (true) { + // Find next tool call + size_t tool_start = input.find(opening_tag, start_pos); + if (tool_start == std::string::npos) { + // No more tool calls. + + // Is start_pos within string bounds? + if (start_pos < input.length()) { + // Add the rest of the string to final_content + final_content += input.substr(start_pos); + } + break; + } + + // Add content before the tool call to final_content + final_content += input.substr(start_pos, tool_start - start_pos); + + // Find closing tag + size_t content_start = tool_start + opening_tag.length(); + size_t tool_end = input.find(closing_tag, content_start); + + if (tool_end == std::string::npos) { + // No closing tag found, so just include the rest of the string as tool. 
+ tool_end = input.length(); + } + + // Extract tool call content + std::string tool_content = input.substr( + content_start, + tool_end - content_start + ); + + // Try to parse the tool call + try { + auto tool_call = json::parse(tool_content); + + // Verify the required fields exist + if (!tool_call.contains("name")) { + throw std::runtime_error("Missing 'name' field in tool call"); + } + + if (!tool_call.contains("arguments")) { + throw std::runtime_error("Missing 'arguments' field in tool call"); + } + + std::string name = tool_call["name"].get(); + + std::string arguments; + try { + arguments = tool_call["arguments"].dump(); + } catch (const std::exception & e) { + LOG_ERR("Failed to serialize arguments: %s\n", e.what()); + arguments = "{}"; + } + + result.tool_calls.push_back({ + name, + arguments, + /* id= */ "", + }); + } catch (const std::exception & e) { + // If parsing fails, include the entire tool call in the content + final_content += input.substr( + tool_start, + tool_end + closing_tag.length() - tool_start + ); + } + + // Move past this tool call for next iteration + start_pos = tool_end + closing_tag.length(); + } + + result.content = final_content; + return result; +} + + static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; // (content)?({"name": "foo", "arguments": {"a": 1}})* @@ -1642,6 +1821,11 @@ static common_chat_params common_chat_templates_apply_jinja( return common_chat_params_init_firefunction_v2(tmpl, params); } + // Phi-4 mini. + if (src.find("<|tool|>") != std::string::npos) { + return common_chat_params_init_phi_4(tmpl, params); + } + // Plain handler (no tools) if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) { return common_chat_params_init_without_tools(tmpl, params); @@ -1773,6 +1957,8 @@ common_chat_msg common_chat_parse(const std::string & input, common_chat_format return common_chat_parse_command_r7b(input, /* extract_reasoning= */ false); case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING: return common_chat_parse_command_r7b(input, /* extract_reasoning= */ true); + case COMMON_CHAT_FORMAT_PHI_4: + return common_chat_parse_phi_4(input); default: throw std::runtime_error("Unsupported format: " + common_chat_format_name(format)); } diff --git a/common/chat.h b/common/chat.h index 9aad84e880448..5e804b6d0ef35 100644 --- a/common/chat.h +++ b/common/chat.h @@ -56,7 +56,8 @@ enum common_chat_format { COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING, COMMON_CHAT_FORMAT_COMMAND_R7B, COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING, - + COMMON_CHAT_FORMAT_PHI_4, + COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats }; diff --git a/models/templates/README.md b/models/templates/README.md index e4fd104fc9fe6..66b56e4679f4a 100644 --- a/models/templates/README.md +++ b/models/templates/README.md @@ -19,4 +19,5 @@ These templates can be updated with the following commands: ./scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use > models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja ./scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use > models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja ./scripts/get_chat_template.py Qwen/Qwen2.5-7B-Instruct > models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +./scripts/get_chat_template.py microsoft/Phi-4-mini-instruct > models/templates/microsoft-Phi-4-mini-instruct.jinja ``` \ No newline at end of 
file diff --git a/models/templates/microsoft-Phi-4-mini-instruct.jinja b/models/templates/microsoft-Phi-4-mini-instruct.jinja new file mode 100644 index 0000000000000..a9c00dd9bbd97 --- /dev/null +++ b/models/templates/microsoft-Phi-4-mini-instruct.jinja @@ -0,0 +1 @@ +{% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %} \ No newline at end of file diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index a1034b1a41b12..e47580274ea9b 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -820,6 +820,36 @@ static void test_template_output_parsers() { test_templates(tmpls.get(), end_tokens, message_assist_call, tools, "{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}"); } + { + auto tmpls = read_templates("models/templates/microsoft-Phi-4-mini-instruct.jinja"); + std::vector end_tokens{ "<|end|>" }; + + assert_equals(COMMON_CHAT_FORMAT_PHI_4, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + + // Test normal message without tools + test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); + + // Test with content before tool call + assert_msg_equals( + common_chat_msg{"assistant", "I'll help with that.", {}, tool_calls, "", "", ""}, + common_chat_parse( + "I'll help with that.<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}", + COMMON_CHAT_FORMAT_PHI_4)); + + // Test with content after tool call + assert_msg_equals( + common_chat_msg{"assistant", "I'll help with that.", {}, tool_calls, "", "", ""}, + common_chat_parse( + "<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}I'll help with that.", + COMMON_CHAT_FORMAT_PHI_4)); + + // Test with newlines. 
+ assert_msg_equals(message_assist_call, common_chat_parse( + "<|tool_call|>\n" + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + COMMON_CHAT_FORMAT_PHI_4)); + } { auto tmpls = read_templates("models/templates/meetkai-functionary-medium-v3.1.jinja"); std::vector end_tokens{ "<|eom_id|>", "<|eot_id|>" }; From eae5d97dafbaa5ce963e0b9f91220125e4b6e969 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Fri, 14 Mar 2025 15:10:19 -0700 Subject: [PATCH 02/15] Merge pull request #1 from ochafik/Telosnex_phi4_tools_template Fixes for phi-4 support --- common/chat.cpp | 99 ++----------------- docs/function-calling.md | 14 ++- examples/server/server.cpp | 46 +++------ ...ma-cpp-microsoft-Phi-4-mini-instruct.jinja | 35 +++++++ 4 files changed, 68 insertions(+), 126 deletions(-) create mode 100644 models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja diff --git a/common/chat.cpp b/common/chat.cpp index 3ddf94545f8ff..fdc6aded3151b 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -584,10 +584,7 @@ static common_chat_msg parse_json_tool_calls( } if (!result.tool_calls.empty()) { - if (!string_strip(result.content).empty()) { - LOG_WRN("Content found with tool calls: %s\n", result.content.c_str()); - } - result.content = ""; + result.content = string_strip(result.content); } return result; } @@ -1371,14 +1368,15 @@ static common_chat_params common_chat_params_init_phi_4(const common_chat_templa std::string name = function.at("name"); auto parameters = function.at("parameters"); builder.resolve_refs(parameters); - tool_rules.push_back(builder.add_schema(name + "-call", { + auto call_rule = builder.add_schema(name + "-call", { {"type", "object"}, {"properties", { {"name", {{"const", name}}}, {"arguments", parameters}, }}, {"required", json::array({"name", "arguments"})}, - })); + }); + tool_rules.push_back(builder.add_rule(name + "-call", "\"<|tool_call|>\" " + call_rule + " \"<|/tool_call|>\"")); }); auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space"); std::vector alt_tags { @@ -1391,6 +1389,9 @@ static common_chat_params common_chat_params_init_phi_4(const common_chat_templa data.preserved_tokens = { "<|tool_call|>", "", + "<|tool_response|>", + "<|tool|>", + "", }; }); @@ -1449,89 +1450,9 @@ static common_chat_params common_chat_params_init_phi_4(const common_chat_templa } static common_chat_msg common_chat_parse_phi_4(const std::string & input) { - common_chat_msg result; - result.role = "assistant"; - - std::string final_content = ""; - - const std::string opening_tag = "<|tool_call|>"; - const std::string closing_tag = ""; - - size_t start_pos = 0; - while (true) { - // Find next tool call - size_t tool_start = input.find(opening_tag, start_pos); - if (tool_start == std::string::npos) { - // No more tool calls. - - // Is start_pos within string bounds? - if (start_pos < input.length()) { - // Add the rest of the string to final_content - final_content += input.substr(start_pos); - } - break; - } - - // Add content before the tool call to final_content - final_content += input.substr(start_pos, tool_start - start_pos); - - // Find closing tag - size_t content_start = tool_start + opening_tag.length(); - size_t tool_end = input.find(closing_tag, content_start); - - if (tool_end == std::string::npos) { - // No closing tag found, so just include the rest of the string as tool. 
- tool_end = input.length(); - } - - // Extract tool call content - std::string tool_content = input.substr( - content_start, - tool_end - content_start - ); - - // Try to parse the tool call - try { - auto tool_call = json::parse(tool_content); - - // Verify the required fields exist - if (!tool_call.contains("name")) { - throw std::runtime_error("Missing 'name' field in tool call"); - } - - if (!tool_call.contains("arguments")) { - throw std::runtime_error("Missing 'arguments' field in tool call"); - } - - std::string name = tool_call["name"].get(); - - std::string arguments; - try { - arguments = tool_call["arguments"].dump(); - } catch (const std::exception & e) { - LOG_ERR("Failed to serialize arguments: %s\n", e.what()); - arguments = "{}"; - } - - result.tool_calls.push_back({ - name, - arguments, - /* id= */ "", - }); - } catch (const std::exception & e) { - // If parsing fails, include the entire tool call in the content - final_content += input.substr( - tool_start, - tool_end + closing_tag.length() - tool_start - ); - } - - // Move past this tool call for next iteration - start_pos = tool_end + closing_tag.length(); - } - - result.content = final_content; - return result; + static std::regex function_regex("<\\|tool_call\\|>\\s*\\{\\s*\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"arguments\"\\s*:"); + static std::regex close_regex(R"(\}\s*()?)"); + return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex); } diff --git a/docs/function-calling.md b/docs/function-calling.md index c3873c3fa63d1..62261077dd508 100644 --- a/docs/function-calling.md +++ b/docs/function-calling.md @@ -12,11 +12,12 @@ Function calling is supported for all models (see https://github.com/ggml-org/ll - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2 - Functionary v3.1 / v3.2 - Hermes 2/3, Qwen 2.5 - - Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034) + - Qwen 2.5 Coder (#12034) - Mistral Nemo - Firefunction v2 - - Command R7B - - DeepSeek R1 (WIP / seems reluctant to call any tools?) + - Command R7B (#11585) + - DeepSeek R1 (#11607) + - Phi 4 (#12288) - Generic tool call is supported when the template isn't recognized by native format handlers (you'll see `Chat format: Generic` in the logs). 
- Use `--chat-template-file` to override the template when appropriate (see examples below) @@ -297,9 +298,14 @@ llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L \ llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M \ --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja +# Native support for Phi 4 also needs a template override (official template is buggy) + +llama-server --jinja -fa -hf bartowski/microsoft_Phi-4-mini-instruct-GGUF \ + --chat-template-file models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja + # Native support requires the right template for these GGUFs: -llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M +llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M \ --chat-template-file models/templates/meetkai-functionary-medium-v3.2.jinja llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \ diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 71e053b202cd2..585bc2065c1eb 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -386,7 +386,7 @@ struct server_task { trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN; trigger.value = word; trigger.token = token; - params.sampling.grammar_triggers.push_back(std::move(trigger)); + params.sampling.grammar_triggers.push_back(trigger); } else { SRV_DBG("Grammar trigger word: `%s`\n", word.c_str()); params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word}); @@ -751,10 +751,7 @@ struct server_task_result_cmpl_final : server_task_result { {"name", tc.name}, {"arguments", tc.arguments}, }}, - // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo). - // We only generate a random id for the ones that don't generate one by themselves - // (they also won't get to see it as their template likely doesn't use it, so it's all for the client) - {"id", tc.id.empty() ? 
gen_tool_call_id() : tc.id}, + {"id", tc.id}, }); } message["tool_calls"] = tool_calls; @@ -2040,18 +2037,6 @@ struct server_context { return ret; } - bool can_be_detokenized(const struct llama_context * ctx, const std::vector & tokens) { - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - const int32_t n_vocab = llama_vocab_n_tokens(vocab); - for (const auto & token : tokens) { - if (token < 0 || token >= n_vocab) { - return false; - } - } - return true; - } - bool launch_slot_with_task(server_slot & slot, const server_task & task) { slot.reset(); slot.id_task = task.id; @@ -2066,11 +2051,6 @@ struct server_context { slot.lora = task.params.lora; } - bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens); - if (!can_detokenize) { - send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST); - return false; - } SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str()); if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { @@ -2113,7 +2093,7 @@ struct server_context { SRV_DBG("%s", "clearing KV cache\n"); // clear the entire KV cache - llama_kv_self_clear(ctx); + llama_kv_cache_clear(ctx); clean_kv_cache = false; } @@ -2655,8 +2635,8 @@ struct server_context { res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); res->t_start = metrics.t_start; - res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx); - res->kv_cache_used_cells = llama_kv_self_used_cells(ctx); + res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); + res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx); res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; res->t_prompt_processing_total = metrics.t_prompt_processing_total; @@ -2772,7 +2752,7 @@ struct server_context { // Erase token cache const size_t n_erased = slot->cache_tokens.size(); - llama_kv_self_seq_rm(ctx, slot->id, -1, -1); + llama_kv_cache_seq_rm(ctx, slot->id, -1, -1); slot->cache_tokens.clear(); auto res = std::make_unique(); @@ -2840,8 +2820,8 @@ struct server_context { SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); - llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); + llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); if (slot.params.cache_prompt) { for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -3032,8 +3012,8 @@ struct server_context { const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c); - llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift); + llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c); + llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift); for (size_t i = 0; i < n_match; i++) { slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; @@ -3071,9 +3051,9 @@ struct server_context { } // keep only the common part - if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) { + if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) { // could not partially delete (likely using a non-Transformer model) - llama_kv_self_seq_rm(ctx, slot.id, -1, -1); + llama_kv_cache_seq_rm(ctx, slot.id, -1, -1); // there is no common part left slot.n_past = 0; @@ -3313,7 +3293,7 @@ 
struct server_context { slot.cache_tokens.push_back(id); slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); - llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1); + llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1); for (size_t i = 0; i < ids.size(); ++i) { completion_token_output result; diff --git a/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja b/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja new file mode 100644 index 0000000000000..9f19e57f94802 --- /dev/null +++ b/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja @@ -0,0 +1,35 @@ +{%- if messages[0]["role"] == "system" %} + {%- set system_message = messages[0]["content"] %} +{% elif tools is defined -%} + {%- set system_message = "You are a helpful assistant with access to tools." -%} +{% else %} + {%- set system_message = "" -%} +{%- endif %} +{%- if tools is defined -%} + {%- set system_message = system_message + '<|tool|>' + (tools | tojson) + '<|/tool|>' -%} + {%- if '<|tool_call|>' not in system_message -%} + {%- set system_message = system_message + "\nTo use a tool, respond in this format: <|tool_call|>{\"name\": \"foo\", \"arguments\": {\"a\": 1}}<|/tool_call|>" %} + {%- endif %} +{%- endif %} +{%- if system_message is defined -%} + {{- '<|system|>' + system_message + '<|end|>' -}} +{%- endif -%} +{%- for message in messages -%} + {%- if message['role'] == 'tool' -%} + {{- '<|tool_response|>' + (message['content'] | tojson) + '<|/tool_response|>' -}} + {%- elif message['role'] != 'system' -%} + {{- '<|' + message['role'] + '|>' -}} + {%- if message.content -%} + {{- message['content'] -}} + {%- endif -%} + {%- for tool_call in message.tool_calls -%} + {{- '<|tool_call|>' + (tool_call | tojson) + '<|/tool_call|>' -}} + {%- endfor -%} + {{- '<|end|>' -}} + {%- endif -%} +{%- endfor -%} +{%- if add_generation_prompt -%} + {{- '<|assistant|>' -}} +{%- else -%} + {{- eos_token -}} +{%- endif -%} \ No newline at end of file From 32d32effb1f0b8c5a45d3fd52d5011ed612d2cec Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Fri, 14 Mar 2025 23:31:35 -0400 Subject: [PATCH 03/15] Revert some bits I made a mess while merging in Olivier's work, so it ended up merged into one commit in this branch. In this commit, I undo changes that wouldn't have been intended in this commit (ex. server.cpp --- examples/server/server.cpp | 46 +++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 585bc2065c1eb..71e053b202cd2 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -386,7 +386,7 @@ struct server_task { trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN; trigger.value = word; trigger.token = token; - params.sampling.grammar_triggers.push_back(trigger); + params.sampling.grammar_triggers.push_back(std::move(trigger)); } else { SRV_DBG("Grammar trigger word: `%s`\n", word.c_str()); params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word}); @@ -751,7 +751,10 @@ struct server_task_result_cmpl_final : server_task_result { {"name", tc.name}, {"arguments", tc.arguments}, }}, - {"id", tc.id}, + // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo). 
+ // We only generate a random id for the ones that don't generate one by themselves + // (they also won't get to see it as their template likely doesn't use it, so it's all for the client) + {"id", tc.id.empty() ? gen_tool_call_id() : tc.id}, }); } message["tool_calls"] = tool_calls; @@ -2037,6 +2040,18 @@ struct server_context { return ret; } + bool can_be_detokenized(const struct llama_context * ctx, const std::vector & tokens) { + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); + for (const auto & token : tokens) { + if (token < 0 || token >= n_vocab) { + return false; + } + } + return true; + } + bool launch_slot_with_task(server_slot & slot, const server_task & task) { slot.reset(); slot.id_task = task.id; @@ -2051,6 +2066,11 @@ struct server_context { slot.lora = task.params.lora; } + bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens); + if (!can_detokenize) { + send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST); + return false; + } SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str()); if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { @@ -2093,7 +2113,7 @@ struct server_context { SRV_DBG("%s", "clearing KV cache\n"); // clear the entire KV cache - llama_kv_cache_clear(ctx); + llama_kv_self_clear(ctx); clean_kv_cache = false; } @@ -2635,8 +2655,8 @@ struct server_context { res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); res->t_start = metrics.t_start; - res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); - res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx); + res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx); + res->kv_cache_used_cells = llama_kv_self_used_cells(ctx); res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; res->t_prompt_processing_total = metrics.t_prompt_processing_total; @@ -2752,7 +2772,7 @@ struct server_context { // Erase token cache const size_t n_erased = slot->cache_tokens.size(); - llama_kv_cache_seq_rm(ctx, slot->id, -1, -1); + llama_kv_self_seq_rm(ctx, slot->id, -1, -1); slot->cache_tokens.clear(); auto res = std::make_unique(); @@ -2820,8 +2840,8 @@ struct server_context { SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); + llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); + llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); if (slot.params.cache_prompt) { for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -3012,8 +3032,8 @@ struct server_context { const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c); - llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift); + llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c); + llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift); for (size_t i = 0; i < n_match; i++) { slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; @@ -3051,9 +3071,9 @@ struct server_context { } // keep only the common part - if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) { + if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) { // could not partially delete (likely 
using a non-Transformer model) - llama_kv_cache_seq_rm(ctx, slot.id, -1, -1); + llama_kv_self_seq_rm(ctx, slot.id, -1, -1); // there is no common part left slot.n_past = 0; @@ -3293,7 +3313,7 @@ struct server_context { slot.cache_tokens.push_back(id); slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); - llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1); + llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1); for (size_t i = 0; i < ids.size(); ++i) { completion_token_output result; From 32ab329f4efa7d5a57b0fc1b73b96782380e49f3 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Sat, 15 Mar 2025 00:02:33 -0400 Subject: [PATCH 04/15] Fix tokens via https://huggingface.co/microsoft/Phi-4-mini-instruct/blob/main/added_tokens.json { "<|/tool_call|>": 200026, "<|/tool|>": 200024, "<|assistant|>": 200019, "<|end|>": 200020, "<|system|>": 200022, "<|tag|>": 200028, "<|tool_call|>": 200025, "<|tool_response|>": 200027, "<|tool|>": 200023, "<|user|>": 200021 } FWIW tool_response seems to be a role, via https://github.com/kinfey/Phi-3CookBook/blob/main/md/02.Application/07.FunctionCalling/Phi4/Multiagents/Phi_4_mini_multiagent.ipynb --- common/chat.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index fdc6aded3151b..377a18a7b7138 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1388,10 +1388,10 @@ static common_chat_params common_chat_params_init_phi_4(const common_chat_templa data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool_call|>"}); data.preserved_tokens = { "<|tool_call|>", - "", + "<|/tool_call|>", "<|tool_response|>", "<|tool|>", - "", + "<|/tool|>", }; }); From 094f6079963c8fc06bd5e92d65a40e17859f69f5 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Sat, 15 Mar 2025 00:12:20 -0400 Subject: [PATCH 05/15] Tweak tool response Phi-4 tool calling's docs include the HuggingFace pages for Phi-4[^1] and Jupyter notebooks at [^2]. [^1] only has , and the notebook at [^3] is the only resource I've found that demonstrates tool responses. It looks like it's used as a sort of role -- it doesn't show it 100% directly, but the message format used defines tool_response as a role in a structure with text content, identical to the user/assistant messages in the same notebook. Given that, and also because it explains another small mystery to me (why isn't <|/tool_response|> in the reserved tokens?), I'll apply that here.
[^1] https://huggingface.co/microsoft/Phi-4-mini-instruct [^2] https://github.com/microsoft/PhiCookBook/tree/main/md/02.Application/07.FunctionCalling/Phi4 [^3] https://github.com/microsoft/PhiCookBook/blob/main/md/02.Application/07.FunctionCalling/Phi4/Multiagents/Phi_4_mini_multiagent.ipynb --- models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja b/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja index 9f19e57f94802..84e4668536c1b 100644 --- a/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja +++ b/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja @@ -16,7 +16,7 @@ {%- endif -%} {%- for message in messages -%} {%- if message['role'] == 'tool' -%} - {{- '<|tool_response|>' + (message['content'] | tojson) + '<|/tool_response|>' -}} + {{- '<|tool_response|>' + (message['content'] | tojson) + '<|end|>' -}} {%- elif message['role'] != 'system' -%} {{- '<|' + message['role'] + '|>' -}} {%- if message.content -%} From 274ef561409ddcca2d2d20a2f9ff5a79c9ef0dd1 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Sun, 16 Mar 2025 15:30:09 -0400 Subject: [PATCH 06/15] Phi-4 tool calls x template won't fixup messages; add comment re: passing template --- common/chat.cpp | 55 +++++++------------------------------------------ 1 file changed, 7 insertions(+), 48 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 377a18a7b7138..851a52d11354f 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1397,54 +1397,13 @@ static common_chat_params common_chat_params_init_phi_4(const common_chat_templa // For Phi-4, we need to inject tools into the system message // because the template expects tools in the system message with <|tool|> tags - if (inputs.tools.empty()) { - // No tools, use normal approach - data.prompt = apply(tmpl, inputs.messages, json::array(), inputs.add_generation_prompt); - } else { - // Make a copy of messages that we can modify - json adjusted_messages = inputs.messages; - - // Extract just the function part of the OpenAI-formatted tools - json phi4_tools = json::array(); - foreach_function(inputs.tools, [&](const json & tool) { - phi4_tools.push_back(tool.at("function")); - }); - - // Phi-4 template expects tools in the system message with <|tool|> tags. 
- // Find the system message, or add one if it doesn't exist - bool found_system_msg = false; - for (auto & message : adjusted_messages) { - if (message.contains("role") && message["role"] == "system") { - // Add tools to the existing system message and update content to mention tools - message["tools"] = phi4_tools; - - // If the system message doesn't mention tools, append that information - std::string content = message["content"]; - if (content.find("tool") == std::string::npos && - content.find("function") == std::string::npos) { - message["content"] = content + " You have access to some tools."; - } - - found_system_msg = true; - break; - } - } - - // If no system message, add one with tools - if (!found_system_msg && !adjusted_messages.empty()) { - json system_msg = { - {"role", "system"}, - {"content", "You are a helpful assistant with access to tools.\nTo use a tool, respond in this format: <|tool_call|>{\"name\": \"foo\", \"arguments\": {\"a\": 1}}<|/tool_call|>"}, - {"tools", phi4_tools} - }; - // Insert system message at the beginning - adjusted_messages.insert(adjusted_messages.begin(), system_msg); - } - - // Apply template with tools embedded in system message, passing empty tools separately - data.prompt = apply(tmpl, adjusted_messages, json(), inputs.add_generation_prompt); - } - + // The Phi-4 template has issues with tool calls. + // It is advisable to use --chat-template-file models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja + // - It expects tools from the system message (instead of as a global variable as most templates). + // - It does not print tool calls (this is worked around by the Minja + the generic mode, but without the <|tool_call|> syntax) + // - With defaults, it prints tool call results (messages such as {"role": "tool", "name": "foo", "content": "42"}) as <|tool|>42<|end|> which conflicts with the tool description wrapping mechanism. + // - Tool call results are expected to be injected as a message from the <|tool_response|> role. i.e. <|tool_response|>(json.dump())<|end|> data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); data.format = COMMON_CHAT_FORMAT_PHI_4; return data; } From b15b8096dce6074139a7bbf9ee165ce8e4965898 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Sun, 16 Mar 2025 15:35:19 -0400 Subject: [PATCH 07/15] Phi-4 template changes from testing latest version 1. Add tag wrapping individually for each tool 2. Avoid adding empty system message when there's no tools and no system message in input messages. 3. Move instructions to bottom (this worked a lot better when multiple tools added) Re: 1 There's no concrete justification for this other than it *seems* better. Unfortunately, the only material for multiple tool calls x Phi-4 is based on ollama, which is downstream of llama.cpp, and either way obfuscates how it's translated into multiple tools, and is unreliable per user reports.
https://colab.research.google.com/github/kinfey/Phi-3CookBook/blob/main/md/02.Application/07.FunctionCalling/Phi4/Multiagents/Phi_4_mini_multiagent.ipynb#scrollTo=NRgfmCBShDot%20via%20https://github.com/kinfey/Phi-3CookBook%20via%20https://github.com/kinfey/Phi-3CookBook/blob/main/md/02.Application/07.FunctionCalling/Phi4/Multiagents/Phi_4_mini_multiagent.ipynb --- ...ma-cpp-microsoft-Phi-4-mini-instruct.jinja | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja b/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja index 84e4668536c1b..c2a393d75599b 100644 --- a/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja +++ b/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja @@ -1,18 +1,20 @@ {%- if messages[0]["role"] == "system" %} {%- set system_message = messages[0]["content"] %} {% elif tools is defined -%} - {%- set system_message = "You are a helpful assistant with access to tools." -%} -{% else %} {%- set system_message = "" -%} {%- endif %} -{%- if tools is defined -%} - {%- set system_message = system_message + '<|tool|>' + (tools | tojson) + '<|/tool|>' -%} - {%- if '<|tool_call|>' not in system_message -%} - {%- set system_message = system_message + "\nTo use a tool, respond in this format: <|tool_call|>{\"name\": \"foo\", \"arguments\": {\"a\": 1}}<|/tool_call|>" %} - {%- endif %} -{%- endif %} + {%- if system_message is defined -%} - {{- '<|system|>' + system_message + '<|end|>' -}} + {{- '<|system|>' + system_message -}} + {%- if tools is defined -%} + {% for tool in tools %} + {{- '<|tool|>' + (tool['function'] | tojson) + '<|/tool|>' -}} + {% endfor %} + {%- if '<|tool_call|>' not in system_message -%} + {{- 'You are a helpful assistant with some tools.\nTo use a tool, respond in this format: <|tool_call|>{"name": "foo", "arguments": {"a": 1}}<|/tool_call|>' -}} + {%- endif -%} + {%- endif -%} + {{- '<|end|>' -}} {%- endif -%} {%- for message in messages -%} {%- if message['role'] == 'tool' -%} From 3ca03c7cd8ca3cb6b06f9980be04551fef7a4e03 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Sun, 16 Mar 2025 15:50:51 -0400 Subject: [PATCH 08/15] remove unnecessary nl --- common/chat.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/common/chat.cpp b/common/chat.cpp index 851a52d11354f..c4492e09d7a8b 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1414,7 +1414,6 @@ static common_chat_msg common_chat_parse_phi_4(const std::string & input) { return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex); } - static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; // (content)?({"name": "foo", "arguments": {"a": 1}})* From 8ccefe50ac6eab6730c60fac969ac2bc51a0bcde Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Sun, 16 Mar 2025 15:51:03 -0400 Subject: [PATCH 09/15] fix tests (they had incorrect tool call end tag) --- tests/test-chat.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index e47580274ea9b..3acac89aa4943 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -821,7 +821,7 @@ static void test_template_output_parsers() { "{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}"); } { - auto tmpls = 
read_templates("models/templates/microsoft-Phi-4-mini-instruct.jinja"); + auto tmpls = read_templates("models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja"); std::vector end_tokens{ "<|end|>" }; assert_equals(COMMON_CHAT_FORMAT_PHI_4, common_chat_templates_apply(tmpls.get(), inputs_tools).format); @@ -833,21 +833,21 @@ static void test_template_output_parsers() { assert_msg_equals( common_chat_msg{"assistant", "I'll help with that.", {}, tool_calls, "", "", ""}, common_chat_parse( - "I'll help with that.<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}", + "I'll help with that.<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}<|/tool_call|>", COMMON_CHAT_FORMAT_PHI_4)); // Test with content after tool call assert_msg_equals( common_chat_msg{"assistant", "I'll help with that.", {}, tool_calls, "", "", ""}, common_chat_parse( - "<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}I'll help with that.", + "<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}<|/tool_call|>I'll help with that.", COMMON_CHAT_FORMAT_PHI_4)); // Test with newlines. assert_msg_equals(message_assist_call, common_chat_parse( "<|tool_call|>\n" "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" - "", + "<|/tool_call|>", COMMON_CHAT_FORMAT_PHI_4)); } { From c2343b2aa56ba2ba3396dbb17a9702e1144cddd7 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Tue, 18 Mar 2025 18:21:31 -0400 Subject: [PATCH 10/15] Fix trailing whitespace via editorconfig action https://github.com/ggml-org/llama.cpp/actions/runs/13886820859/job/38855413335?pr=12288 --- common/chat.cpp | 2 +- common/chat.h | 2 +- tests/test-chat.cpp | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index c4492e09d7a8b..ada4535e189d4 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1399,7 +1399,7 @@ static common_chat_params common_chat_params_init_phi_4(const common_chat_templa // because the template expects tools in the system message with <|tool|> tags // The Phi-4 template has issues with tool calls. // It is advisable to use --chat-template-file models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja - // - It expects tools from the system message (instead of as a global variable as most templates). + // - It expects tools from the system message (instead of as a global variable as most templates). // - It does not print tool calls (this is worked around by the Minja + the generic mode, but without the <|tool_call|> syntax) // - With defaults, it prints tool call results (messages such as {"role": "tool", "name": "foo", "content": "42"}) as <|tool|>42<|end|> which conflicts with the tool description wrapping mechanism. // - Tool call results are expected to be injected as a message from the <|tool_response|> role. i.e. 
<|tool_response|>(json.dump())<|end|> diff --git a/common/chat.h b/common/chat.h index 5e804b6d0ef35..72215c49eb31d 100644 --- a/common/chat.h +++ b/common/chat.h @@ -57,7 +57,7 @@ enum common_chat_format { COMMON_CHAT_FORMAT_COMMAND_R7B, COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING, COMMON_CHAT_FORMAT_PHI_4, - + COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats }; diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 3acac89aa4943..e13befdb7c179 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -823,12 +823,12 @@ static void test_template_output_parsers() { { auto tmpls = read_templates("models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja"); std::vector end_tokens{ "<|end|>" }; - + assert_equals(COMMON_CHAT_FORMAT_PHI_4, common_chat_templates_apply(tmpls.get(), inputs_tools).format); - + // Test normal message without tools test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); - + // Test with content before tool call assert_msg_equals( common_chat_msg{"assistant", "I'll help with that.", {}, tool_calls, "", "", ""}, From a5d014b6176bfd53fc889e37bb1df2866a4a7e5a Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Wed, 19 Mar 2025 12:25:34 -0400 Subject: [PATCH 11/15] Test coverage in test_tool_call.py --- examples/server/tests/unit/test_tool_call.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/server/tests/unit/test_tool_call.py b/examples/server/tests/unit/test_tool_call.py index 569c2a1f8ea31..320bf68b5804d 100755 --- a/examples/server/tests/unit/test_tool_call.py +++ b/examples/server/tests/unit/test_tool_call.py @@ -138,6 +138,8 @@ def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", PYTHON_TOOL, "code"), ("fireworks-ai-llama-3-firefunction-v2", TEST_TOOL, "success"), # ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"), + ("microsoft-Phi-4-mini-instruct", TEST_TOOL, "success"), + ("microsoft-Phi-4-mini-instruct", PYTHON_TOOL, "code"), ]) def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None): global server @@ -164,6 +166,9 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + (TEST_TOOL, "success", "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), + (PYTHON_TOOL, "code", "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), + (TEST_TOOL, "success", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", "chatml"), @@ -306,6 +311,8 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), + ("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None), ("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", "chatml"), @@ -385,6 +392,7 @@ def do_test_weather(server: ServerProcess, **kwargs): 
@pytest.mark.slow @pytest.mark.parametrize("result_override,n_predict,hf_repo,template_override", [ (None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + (None, 128, "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), (None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None), (None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"), (None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), @@ -394,6 +402,7 @@ def do_test_weather(server: ServerProcess, **kwargs): (None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), (None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), (None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + (None, 128, "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", None), ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)), # TODO: fix these (wrong results, either didn't respect decimal instruction or got wrong value) @@ -535,6 +544,8 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), + ("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)), ("bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"), From 09b795def52d3e95622f16a27b46b1963005e71a Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Wed, 19 Mar 2025 14:16:44 -0400 Subject: [PATCH 12/15] Fix template expansion --- models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja b/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja index c2a393d75599b..dd3a199fe78e9 100644 --- a/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja +++ b/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja @@ -25,7 +25,7 @@ {{- message['content'] -}} {%- endif -%} {%- for tool_call in message.tool_calls -%} - {{- '<|tool_call|>' + (tool_call | tojson) + '<|/tool_call|>' -}} + {{- '<|tool_call|>' + (tool_call['function'] | tojson) + '<|/tool_call|>' -}} {%- endfor -%} {{- '<|end|>' -}} {%- endif -%} From 61ff59ea669a0dd9bacf418df4b9c21ef4c9b6fe Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Wed, 19 Mar 2025 14:31:55 -0400 Subject: [PATCH 13/15] Ensure both <|tool|> and <|tool_response|> are in template before deciding its phi-4 --- common/chat.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/common/chat.cpp b/common/chat.cpp index ada4535e189d4..ee5b5bcd24090 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1702,7 +1702,11 @@ static common_chat_params common_chat_templates_apply_jinja( // Phi-4 mini. if (src.find("<|tool|>") != std::string::npos) { - return common_chat_params_init_phi_4(tmpl, params); + if (src.find("<|tool_response|>") != std::string::npos) { + return common_chat_params_init_phi_4(tmpl, params); + } else { + LOG_WRN("[%s] Invalid legacy Phi 4 template detected: switching to Generic tool call format. 
To enable native support, please restart with `--chat-template-file models/template/microsoft-Phi-4-mini-instruct.jinja`", __func__); + } } // Plain handler (no tools) From 65c254169fbb456d352ad6a37e181bf97366ec97 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Wed, 19 Mar 2025 14:32:45 -0400 Subject: [PATCH 14/15] test_tool_call.py tests: chatml; every test block w/Phi-3.5 has a Phi-4 equivalent --- examples/server/tests/unit/test_tool_call.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/server/tests/unit/test_tool_call.py b/examples/server/tests/unit/test_tool_call.py index 320bf68b5804d..a8675161fe2a9 100755 --- a/examples/server/tests/unit/test_tool_call.py +++ b/examples/server/tests/unit/test_tool_call.py @@ -168,6 +168,7 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, (TEST_TOOL, "success", "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), (PYTHON_TOOL, "code", "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), + (PYTHON_TOOL, "code", "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", "chatml"), (TEST_TOOL, "success", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None), @@ -312,6 +313,7 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), ("bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), + ("bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", "chatml"), ("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None), ("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", "chatml"), @@ -392,7 +394,9 @@ def do_test_weather(server: ServerProcess, **kwargs): @pytest.mark.slow @pytest.mark.parametrize("result_override,n_predict,hf_repo,template_override", [ (None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), - (None, 128, "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), + # Answers using text, not tools, complaining it wants to measure from the positive Z-axis not x-axis. 
+ # (None, 128, "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), + (None, 128, "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", "chatml"), (None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None), (None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"), (None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), @@ -545,6 +549,7 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), ("bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), + ("bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", "chatml"), ("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)), ("bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"), From ff78c90cc18cc9adbd9e509875ac40857558c5f4 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Wed, 19 Mar 2025 17:44:04 -0400 Subject: [PATCH 15/15] fix: trailing whitespace --- common/chat.cpp | 2 +- examples/server/tests/unit/test_tool_call.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index ee5b5bcd24090..df28190d5eb26 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1706,7 +1706,7 @@ static common_chat_params common_chat_templates_apply_jinja( return common_chat_params_init_phi_4(tmpl, params); } else { LOG_WRN("[%s] Invalid legacy Phi 4 template detected: switching to Generic tool call format. To enable native support, please restart with `--chat-template-file models/template/microsoft-Phi-4-mini-instruct.jinja`", __func__); - } + } } // Plain handler (no tools) diff --git a/examples/server/tests/unit/test_tool_call.py b/examples/server/tests/unit/test_tool_call.py index a8675161fe2a9..206d087ff1e3e 100755 --- a/examples/server/tests/unit/test_tool_call.py +++ b/examples/server/tests/unit/test_tool_call.py @@ -394,7 +394,7 @@ def do_test_weather(server: ServerProcess, **kwargs): @pytest.mark.slow @pytest.mark.parametrize("result_override,n_predict,hf_repo,template_override", [ (None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), - # Answers using text, not tools, complaining it wants to measure from the positive Z-axis not x-axis. + # Answers using text, not tools, complaining it wants to measure from the positive Z-axis not x-axis. # (None, 128, "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), (None, 128, "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", "chatml"), (None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None),