From c3aac4ef6a3e91cbf54906da706c92783f4195b5 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Sun, 9 Mar 2025 13:50:19 -0400 Subject: [PATCH 01/15] `tool-call`: Phi-4 support - Add system message if needed (per template requirement) - Add tools to system message (req'd by template) - Parse output: -- add tools to response when there is valid JSON between <|tool_call|> and the closing tag -- content outside of tool_call tags is added to the text portion of the response -- if there is no valid JSON, the entire content is added to the text portion of the response --- common/chat.cpp | 186 ++++++++++++++++++ common/chat.h | 3 +- models/templates/README.md | 1 + .../microsoft-Phi-4-mini-instruct.jinja | 1 + tests/test-chat.cpp | 30 +++ 5 files changed, 220 insertions(+), 1 deletion(-) create mode 100644 models/templates/microsoft-Phi-4-mini-instruct.jinja diff --git a/common/chat.cpp b/common/chat.cpp index 62ca26ad7609c..3ddf94545f8ff 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -448,6 +448,7 @@ std::string common_chat_format_name(common_chat_format format) { case COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING: return "Hermes 2 Pro (extract reasoning)"; case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B"; case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING: return "Command R7B (extract reasoning)"; + case COMMON_CHAT_FORMAT_PHI_4: return "Phi-4"; default: throw std::runtime_error("Unknown chat format"); } @@ -1356,6 +1357,184 @@ static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::s return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex); } +static common_chat_params common_chat_params_init_phi_4(const common_chat_template & tmpl, const struct templates_params & inputs) { + // Phi-4 has a unique format that expects tools in the system message with <|tool|> tags + // and returns function calls as a JSON object after <|tool_call|> tag + common_chat_params data; + + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + std::vector<std::string> tool_rules; + std::vector<std::string> tool_call_alts; + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + std::string name = function.at("name"); + auto parameters = function.at("parameters"); + builder.resolve_refs(parameters); + tool_rules.push_back(builder.add_schema(name + "-call", { + {"type", "object"}, + {"properties", { + {"name", {{"const", name}}}, + {"arguments", parameters}, + }}, + {"required", json::array({"name", "arguments"})}, + })); + }); + auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space"); + std::vector<std::string> alt_tags { + any_tool_call, + }; + tool_call_alts.push_back(any_tool_call); + auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | ")); + builder.add_rule("root", inputs.parallel_tool_calls ?
"(" + tool_call + ")+" : tool_call); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool_call|>"}); + data.preserved_tokens = { + "<|tool_call|>", + "", + }; + }); + + // For Phi-4, we need to inject tools into the system message + // because the template expects tools in the system message with <|tool|> tags + if (inputs.tools.empty()) { + // No tools, use normal approach + data.prompt = apply(tmpl, inputs.messages, json::array(), inputs.add_generation_prompt); + } else { + // Make a copy of messages that we can modify + json adjusted_messages = inputs.messages; + + // Extract just the function part of the OpenAI-formatted tools + json phi4_tools = json::array(); + foreach_function(inputs.tools, [&](const json & tool) { + phi4_tools.push_back(tool.at("function")); + }); + + // Phi-4 template expects tools in the system message with <|tool|> tags. + // Find the system message, or add one if it doesn't exist + bool found_system_msg = false; + for (auto & message : adjusted_messages) { + if (message.contains("role") && message["role"] == "system") { + // Add tools to the existing system message and update content to mention tools + message["tools"] = phi4_tools; + + // If the system message doesn't mention tools, append that information + std::string content = message["content"]; + if (content.find("tool") == std::string::npos && + content.find("function") == std::string::npos) { + message["content"] = content + " You have access to some tools."; + } + + found_system_msg = true; + break; + } + } + + // If no system message, add one with tools + if (!found_system_msg && !adjusted_messages.empty()) { + json system_msg = { + {"role", "system"}, + {"content", "You are a helpful assistant with access to tools.\nTo use a tool, respond in this format: <|tool_call|>{\"name\": \"foo\", \"arguments\": {\"a\": 1}}<|/tool_call|>"}, + {"tools", phi4_tools} + }; + // Insert system message at the beginning + adjusted_messages.insert(adjusted_messages.begin(), system_msg); + } + + // Apply template with tools embedded in system message, passing empty tools separately + data.prompt = apply(tmpl, adjusted_messages, json(), inputs.add_generation_prompt); + } + + data.format = COMMON_CHAT_FORMAT_PHI_4; + return data; +} + +static common_chat_msg common_chat_parse_phi_4(const std::string & input) { + common_chat_msg result; + result.role = "assistant"; + + std::string final_content = ""; + + const std::string opening_tag = "<|tool_call|>"; + const std::string closing_tag = ""; + + size_t start_pos = 0; + while (true) { + // Find next tool call + size_t tool_start = input.find(opening_tag, start_pos); + if (tool_start == std::string::npos) { + // No more tool calls. + + // Is start_pos within string bounds? + if (start_pos < input.length()) { + // Add the rest of the string to final_content + final_content += input.substr(start_pos); + } + break; + } + + // Add content before the tool call to final_content + final_content += input.substr(start_pos, tool_start - start_pos); + + // Find closing tag + size_t content_start = tool_start + opening_tag.length(); + size_t tool_end = input.find(closing_tag, content_start); + + if (tool_end == std::string::npos) { + // No closing tag found, so just include the rest of the string as tool. 
+ tool_end = input.length(); + } + + // Extract tool call content + std::string tool_content = input.substr( + content_start, + tool_end - content_start + ); + + // Try to parse the tool call + try { + auto tool_call = json::parse(tool_content); + + // Verify the required fields exist + if (!tool_call.contains("name")) { + throw std::runtime_error("Missing 'name' field in tool call"); + } + + if (!tool_call.contains("arguments")) { + throw std::runtime_error("Missing 'arguments' field in tool call"); + } + + std::string name = tool_call["name"].get(); + + std::string arguments; + try { + arguments = tool_call["arguments"].dump(); + } catch (const std::exception & e) { + LOG_ERR("Failed to serialize arguments: %s\n", e.what()); + arguments = "{}"; + } + + result.tool_calls.push_back({ + name, + arguments, + /* id= */ "", + }); + } catch (const std::exception & e) { + // If parsing fails, include the entire tool call in the content + final_content += input.substr( + tool_start, + tool_end + closing_tag.length() - tool_start + ); + } + + // Move past this tool call for next iteration + start_pos = tool_end + closing_tag.length(); + } + + result.content = final_content; + return result; +} + + static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; // (content)?({"name": "foo", "arguments": {"a": 1}})* @@ -1642,6 +1821,11 @@ static common_chat_params common_chat_templates_apply_jinja( return common_chat_params_init_firefunction_v2(tmpl, params); } + // Phi-4 mini. + if (src.find("<|tool|>") != std::string::npos) { + return common_chat_params_init_phi_4(tmpl, params); + } + // Plain handler (no tools) if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) { return common_chat_params_init_without_tools(tmpl, params); @@ -1773,6 +1957,8 @@ common_chat_msg common_chat_parse(const std::string & input, common_chat_format return common_chat_parse_command_r7b(input, /* extract_reasoning= */ false); case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING: return common_chat_parse_command_r7b(input, /* extract_reasoning= */ true); + case COMMON_CHAT_FORMAT_PHI_4: + return common_chat_parse_phi_4(input); default: throw std::runtime_error("Unsupported format: " + common_chat_format_name(format)); } diff --git a/common/chat.h b/common/chat.h index 9aad84e880448..5e804b6d0ef35 100644 --- a/common/chat.h +++ b/common/chat.h @@ -56,7 +56,8 @@ enum common_chat_format { COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING, COMMON_CHAT_FORMAT_COMMAND_R7B, COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING, - + COMMON_CHAT_FORMAT_PHI_4, + COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats }; diff --git a/models/templates/README.md b/models/templates/README.md index e4fd104fc9fe6..66b56e4679f4a 100644 --- a/models/templates/README.md +++ b/models/templates/README.md @@ -19,4 +19,5 @@ These templates can be updated with the following commands: ./scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use > models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja ./scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use > models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja ./scripts/get_chat_template.py Qwen/Qwen2.5-7B-Instruct > models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +./scripts/get_chat_template.py microsoft/Phi-4-mini-instruct > models/templates/microsoft-Phi-4-mini-instruct.jinja ``` \ No newline at end of 
file diff --git a/models/templates/microsoft-Phi-4-mini-instruct.jinja b/models/templates/microsoft-Phi-4-mini-instruct.jinja new file mode 100644 index 0000000000000..a9c00dd9bbd97 --- /dev/null +++ b/models/templates/microsoft-Phi-4-mini-instruct.jinja @@ -0,0 +1 @@ +{% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %} \ No newline at end of file diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index a1034b1a41b12..e47580274ea9b 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -820,6 +820,36 @@ static void test_template_output_parsers() { test_templates(tmpls.get(), end_tokens, message_assist_call, tools, "{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}"); } + { + auto tmpls = read_templates("models/templates/microsoft-Phi-4-mini-instruct.jinja"); + std::vector end_tokens{ "<|end|>" }; + + assert_equals(COMMON_CHAT_FORMAT_PHI_4, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + + // Test normal message without tools + test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); + + // Test with content before tool call + assert_msg_equals( + common_chat_msg{"assistant", "I'll help with that.", {}, tool_calls, "", "", ""}, + common_chat_parse( + "I'll help with that.<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}", + COMMON_CHAT_FORMAT_PHI_4)); + + // Test with content after tool call + assert_msg_equals( + common_chat_msg{"assistant", "I'll help with that.", {}, tool_calls, "", "", ""}, + common_chat_parse( + "<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}I'll help with that.", + COMMON_CHAT_FORMAT_PHI_4)); + + // Test with newlines. 
+ assert_msg_equals(message_assist_call, common_chat_parse( + "<|tool_call|>\n" + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + COMMON_CHAT_FORMAT_PHI_4)); + } { auto tmpls = read_templates("models/templates/meetkai-functionary-medium-v3.1.jinja"); std::vector end_tokens{ "<|eom_id|>", "<|eot_id|>" }; From eae5d97dafbaa5ce963e0b9f91220125e4b6e969 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Fri, 14 Mar 2025 15:10:19 -0700 Subject: [PATCH 02/15] Merge pull request #1 from ochafik/Telosnex_phi4_tools_template Fixes for phi-4 support --- common/chat.cpp | 99 ++----------------- docs/function-calling.md | 14 ++- examples/server/server.cpp | 46 +++------ ...ma-cpp-microsoft-Phi-4-mini-instruct.jinja | 35 +++++++ 4 files changed, 68 insertions(+), 126 deletions(-) create mode 100644 models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja diff --git a/common/chat.cpp b/common/chat.cpp index 3ddf94545f8ff..fdc6aded3151b 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -584,10 +584,7 @@ static common_chat_msg parse_json_tool_calls( } if (!result.tool_calls.empty()) { - if (!string_strip(result.content).empty()) { - LOG_WRN("Content found with tool calls: %s\n", result.content.c_str()); - } - result.content = ""; + result.content = string_strip(result.content); } return result; } @@ -1371,14 +1368,15 @@ static common_chat_params common_chat_params_init_phi_4(const common_chat_templa std::string name = function.at("name"); auto parameters = function.at("parameters"); builder.resolve_refs(parameters); - tool_rules.push_back(builder.add_schema(name + "-call", { + auto call_rule = builder.add_schema(name + "-call", { {"type", "object"}, {"properties", { {"name", {{"const", name}}}, {"arguments", parameters}, }}, {"required", json::array({"name", "arguments"})}, - })); + }); + tool_rules.push_back(builder.add_rule(name + "-call", "\"<|tool_call|>\" " + call_rule + " \"<|/tool_call|>\"")); }); auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space"); std::vector alt_tags { @@ -1391,6 +1389,9 @@ static common_chat_params common_chat_params_init_phi_4(const common_chat_templa data.preserved_tokens = { "<|tool_call|>", "", + "<|tool_response|>", + "<|tool|>", + "", }; }); @@ -1449,89 +1450,9 @@ static common_chat_params common_chat_params_init_phi_4(const common_chat_templa } static common_chat_msg common_chat_parse_phi_4(const std::string & input) { - common_chat_msg result; - result.role = "assistant"; - - std::string final_content = ""; - - const std::string opening_tag = "<|tool_call|>"; - const std::string closing_tag = ""; - - size_t start_pos = 0; - while (true) { - // Find next tool call - size_t tool_start = input.find(opening_tag, start_pos); - if (tool_start == std::string::npos) { - // No more tool calls. - - // Is start_pos within string bounds? - if (start_pos < input.length()) { - // Add the rest of the string to final_content - final_content += input.substr(start_pos); - } - break; - } - - // Add content before the tool call to final_content - final_content += input.substr(start_pos, tool_start - start_pos); - - // Find closing tag - size_t content_start = tool_start + opening_tag.length(); - size_t tool_end = input.find(closing_tag, content_start); - - if (tool_end == std::string::npos) { - // No closing tag found, so just include the rest of the string as tool. 
- tool_end = input.length(); - } - - // Extract tool call content - std::string tool_content = input.substr( - content_start, - tool_end - content_start - ); - - // Try to parse the tool call - try { - auto tool_call = json::parse(tool_content); - - // Verify the required fields exist - if (!tool_call.contains("name")) { - throw std::runtime_error("Missing 'name' field in tool call"); - } - - if (!tool_call.contains("arguments")) { - throw std::runtime_error("Missing 'arguments' field in tool call"); - } - - std::string name = tool_call["name"].get(); - - std::string arguments; - try { - arguments = tool_call["arguments"].dump(); - } catch (const std::exception & e) { - LOG_ERR("Failed to serialize arguments: %s\n", e.what()); - arguments = "{}"; - } - - result.tool_calls.push_back({ - name, - arguments, - /* id= */ "", - }); - } catch (const std::exception & e) { - // If parsing fails, include the entire tool call in the content - final_content += input.substr( - tool_start, - tool_end + closing_tag.length() - tool_start - ); - } - - // Move past this tool call for next iteration - start_pos = tool_end + closing_tag.length(); - } - - result.content = final_content; - return result; + static std::regex function_regex("<\\|tool_call\\|>\\s*\\{\\s*\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"arguments\"\\s*:"); + static std::regex close_regex(R"(\}\s*()?)"); + return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex); } diff --git a/docs/function-calling.md b/docs/function-calling.md index c3873c3fa63d1..62261077dd508 100644 --- a/docs/function-calling.md +++ b/docs/function-calling.md @@ -12,11 +12,12 @@ Function calling is supported for all models (see https://github.com/ggml-org/ll - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2 - Functionary v3.1 / v3.2 - Hermes 2/3, Qwen 2.5 - - Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034) + - Qwen 2.5 Coder (#12034) - Mistral Nemo - Firefunction v2 - - Command R7B - - DeepSeek R1 (WIP / seems reluctant to call any tools?) + - Command R7B (#11585) + - DeepSeek R1 (#11607) + - Phi 4 (#12288) - Generic tool call is supported when the template isn't recognized by native format handlers (you'll see `Chat format: Generic` in the logs). 
- Use `--chat-template-file` to override the template when appropriate (see examples below) @@ -297,9 +298,14 @@ llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L \ llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M \ --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja +# Native support for Phi 4 also needs a template override (official template is buggy) + +llama-server --jinja -fa -hf bartowski/microsoft_Phi-4-mini-instruct-GGUF \ + --chat-template-file models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja + # Native support requires the right template for these GGUFs: -llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M +llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M \ --chat-template-file models/templates/meetkai-functionary-medium-v3.2.jinja llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \ diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 71e053b202cd2..585bc2065c1eb 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -386,7 +386,7 @@ struct server_task { trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN; trigger.value = word; trigger.token = token; - params.sampling.grammar_triggers.push_back(std::move(trigger)); + params.sampling.grammar_triggers.push_back(trigger); } else { SRV_DBG("Grammar trigger word: `%s`\n", word.c_str()); params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word}); @@ -751,10 +751,7 @@ struct server_task_result_cmpl_final : server_task_result { {"name", tc.name}, {"arguments", tc.arguments}, }}, - // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo). - // We only generate a random id for the ones that don't generate one by themselves - // (they also won't get to see it as their template likely doesn't use it, so it's all for the client) - {"id", tc.id.empty() ? 
gen_tool_call_id() : tc.id}, + {"id", tc.id}, }); } message["tool_calls"] = tool_calls; @@ -2040,18 +2037,6 @@ struct server_context { return ret; } - bool can_be_detokenized(const struct llama_context * ctx, const std::vector & tokens) { - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - const int32_t n_vocab = llama_vocab_n_tokens(vocab); - for (const auto & token : tokens) { - if (token < 0 || token >= n_vocab) { - return false; - } - } - return true; - } - bool launch_slot_with_task(server_slot & slot, const server_task & task) { slot.reset(); slot.id_task = task.id; @@ -2066,11 +2051,6 @@ struct server_context { slot.lora = task.params.lora; } - bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens); - if (!can_detokenize) { - send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST); - return false; - } SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str()); if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { @@ -2113,7 +2093,7 @@ struct server_context { SRV_DBG("%s", "clearing KV cache\n"); // clear the entire KV cache - llama_kv_self_clear(ctx); + llama_kv_cache_clear(ctx); clean_kv_cache = false; } @@ -2655,8 +2635,8 @@ struct server_context { res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); res->t_start = metrics.t_start; - res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx); - res->kv_cache_used_cells = llama_kv_self_used_cells(ctx); + res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); + res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx); res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; res->t_prompt_processing_total = metrics.t_prompt_processing_total; @@ -2772,7 +2752,7 @@ struct server_context { // Erase token cache const size_t n_erased = slot->cache_tokens.size(); - llama_kv_self_seq_rm(ctx, slot->id, -1, -1); + llama_kv_cache_seq_rm(ctx, slot->id, -1, -1); slot->cache_tokens.clear(); auto res = std::make_unique(); @@ -2840,8 +2820,8 @@ struct server_context { SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); - llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); + llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); if (slot.params.cache_prompt) { for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -3032,8 +3012,8 @@ struct server_context { const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c); - llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift); + llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c); + llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift); for (size_t i = 0; i < n_match; i++) { slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; @@ -3071,9 +3051,9 @@ struct server_context { } // keep only the common part - if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) { + if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) { // could not partially delete (likely using a non-Transformer model) - llama_kv_self_seq_rm(ctx, slot.id, -1, -1); + llama_kv_cache_seq_rm(ctx, slot.id, -1, -1); // there is no common part left slot.n_past = 0; @@ -3313,7 +3293,7 @@ 
struct server_context { slot.cache_tokens.push_back(id); slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); - llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1); + llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1); for (size_t i = 0; i < ids.size(); ++i) { completion_token_output result; diff --git a/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja b/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja new file mode 100644 index 0000000000000..9f19e57f94802 --- /dev/null +++ b/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja @@ -0,0 +1,35 @@ +{%- if messages[0]["role"] == "system" %} + {%- set system_message = messages[0]["content"] %} +{% elif tools is defined -%} + {%- set system_message = "You are a helpful assistant with access to tools." -%} +{% else %} + {%- set system_message = "" -%} +{%- endif %} +{%- if tools is defined -%} + {%- set system_message = system_message + '<|tool|>' + (tools | tojson) + '<|/tool|>' -%} + {%- if '<|tool_call|>' not in system_message -%} + {%- set system_message = system_message + "\nTo use a tool, respond in this format: <|tool_call|>{\"name\": \"foo\", \"arguments\": {\"a\": 1}}<|/tool_call|>" %} + {%- endif %} +{%- endif %} +{%- if system_message is defined -%} + {{- '<|system|>' + system_message + '<|end|>' -}} +{%- endif -%} +{%- for message in messages -%} + {%- if message['role'] == 'tool' -%} + {{- '<|tool_response|>' + (message['content'] | tojson) + '<|/tool_response|>' -}} + {%- elif message['role'] != 'system' -%} + {{- '<|' + message['role'] + '|>' -}} + {%- if message.content -%} + {{- message['content'] -}} + {%- endif -%} + {%- for tool_call in message.tool_calls -%} + {{- '<|tool_call|>' + (tool_call | tojson) + '<|/tool_call|>' -}} + {%- endfor -%} + {{- '<|end|>' -}} + {%- endif -%} +{%- endfor -%} +{%- if add_generation_prompt -%} + {{- '<|assistant|>' -}} +{%- else -%} + {{- eos_token -}} +{%- endif -%} \ No newline at end of file From 32d32effb1f0b8c5a45d3fd52d5011ed612d2cec Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Fri, 14 Mar 2025 23:31:35 -0400 Subject: [PATCH 03/15] Revert some bits I made a mess while merging in Olivier's work, so it ended up merged into one commit in this branch. In this commit, I undo changes that wouldn't have been intended in this commit (ex. server.cpp --- examples/server/server.cpp | 46 +++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 585bc2065c1eb..71e053b202cd2 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -386,7 +386,7 @@ struct server_task { trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN; trigger.value = word; trigger.token = token; - params.sampling.grammar_triggers.push_back(trigger); + params.sampling.grammar_triggers.push_back(std::move(trigger)); } else { SRV_DBG("Grammar trigger word: `%s`\n", word.c_str()); params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word}); @@ -751,7 +751,10 @@ struct server_task_result_cmpl_final : server_task_result { {"name", tc.name}, {"arguments", tc.arguments}, }}, - {"id", tc.id}, + // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo). 
+ // We only generate a random id for the ones that don't generate one by themselves + // (they also won't get to see it as their template likely doesn't use it, so it's all for the client) + {"id", tc.id.empty() ? gen_tool_call_id() : tc.id}, }); } message["tool_calls"] = tool_calls; @@ -2037,6 +2040,18 @@ struct server_context { return ret; } + bool can_be_detokenized(const struct llama_context * ctx, const std::vector & tokens) { + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); + for (const auto & token : tokens) { + if (token < 0 || token >= n_vocab) { + return false; + } + } + return true; + } + bool launch_slot_with_task(server_slot & slot, const server_task & task) { slot.reset(); slot.id_task = task.id; @@ -2051,6 +2066,11 @@ struct server_context { slot.lora = task.params.lora; } + bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens); + if (!can_detokenize) { + send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST); + return false; + } SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str()); if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { @@ -2093,7 +2113,7 @@ struct server_context { SRV_DBG("%s", "clearing KV cache\n"); // clear the entire KV cache - llama_kv_cache_clear(ctx); + llama_kv_self_clear(ctx); clean_kv_cache = false; } @@ -2635,8 +2655,8 @@ struct server_context { res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); res->t_start = metrics.t_start; - res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); - res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx); + res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx); + res->kv_cache_used_cells = llama_kv_self_used_cells(ctx); res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; res->t_prompt_processing_total = metrics.t_prompt_processing_total; @@ -2752,7 +2772,7 @@ struct server_context { // Erase token cache const size_t n_erased = slot->cache_tokens.size(); - llama_kv_cache_seq_rm(ctx, slot->id, -1, -1); + llama_kv_self_seq_rm(ctx, slot->id, -1, -1); slot->cache_tokens.clear(); auto res = std::make_unique(); @@ -2820,8 +2840,8 @@ struct server_context { SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); + llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); + llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); if (slot.params.cache_prompt) { for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -3012,8 +3032,8 @@ struct server_context { const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c); - llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift); + llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c); + llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift); for (size_t i = 0; i < n_match; i++) { slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; @@ -3051,9 +3071,9 @@ struct server_context { } // keep only the common part - if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) { + if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) { // could not partially delete (likely 
using a non-Transformer model) - llama_kv_cache_seq_rm(ctx, slot.id, -1, -1); + llama_kv_self_seq_rm(ctx, slot.id, -1, -1); // there is no common part left slot.n_past = 0; @@ -3293,7 +3313,7 @@ struct server_context { slot.cache_tokens.push_back(id); slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); - llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1); + llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1); for (size_t i = 0; i < ids.size(); ++i) { completion_token_output result; From 32ab329f4efa7d5a57b0fc1b73b96782380e49f3 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Sat, 15 Mar 2025 00:02:33 -0400 Subject: [PATCH 04/15] Fix tokens via https://huggingface.co/microsoft/Phi-4-mini-instruct/blob/main/added_tokens.json { "<|/tool_call|>": 200026, "<|/tool|>": 200024, "<|assistant|>": 200019, "<|end|>": 200020, "<|system|>": 200022, "<|tag|>": 200028, "<|tool_call|>": 200025, "<|tool_response|>": 200027, "<|tool|>": 200023, "<|user|>": 200021 } FWIW tool_response seems to be a role, via https://github.com/kinfey/Phi-3CookBook/blob/main/md/02.Application/07.FunctionCalling/Phi4/Multiagents/Phi_4_mini_multiagent.ipynb --- common/chat.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index fdc6aded3151b..377a18a7b7138 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1388,10 +1388,10 @@ static common_chat_params common_chat_params_init_phi_4(const common_chat_templa data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool_call|>"}); data.preserved_tokens = { "<|tool_call|>", - "", + "<|/tool_call|>", "<|tool_response|>", "<|tool|>", - "", + "<|/tool|>", }; }); From 094f6079963c8fc06bd5e92d65a40e17859f69f5 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Sat, 15 Mar 2025 00:12:20 -0400 Subject: [PATCH 05/15] Tweak tool response Phi-4 tool calling's docs include the HuggingFace pages for Phi-4[^1] and Jupyter notebooks at [^2]. [^1] only has , and the notebook at [^3] is the only resource I've found that demonstrates tool responses. It looks like it's used as a sort of role -- it doesn't show it 100% directly, but the message format used defines tool_response as a role in a structure with text content, identical to the user/assistant messages in the same notebook. Given that, and also because it explains another small mystery to me (why isn't <|/tool_response|> in the reserved tokens?), I'll apply that here.
[^1] https://huggingface.co/microsoft/Phi-4-mini-instruct [^2] https://github.com/microsoft/PhiCookBook/tree/main/md/02.Application/07.FunctionCalling/Phi4 [^3] https://github.com/microsoft/PhiCookBook/blob/main/md/02.Application/07.FunctionCalling/Phi4/Multiagents/Phi_4_mini_multiagent.ipynb --- models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja b/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja index 9f19e57f94802..84e4668536c1b 100644 --- a/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja +++ b/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja @@ -16,7 +16,7 @@ {%- endif -%} {%- for message in messages -%} {%- if message['role'] == 'tool' -%} - {{- '<|tool_response|>' + (message['content'] | tojson) + '<|/tool_response|>' -}} + {{- '<|tool_response|>' + (message['content'] | tojson) + '<|end|>' -}} {%- elif message['role'] != 'system' -%} {{- '<|' + message['role'] + '|>' -}} {%- if message.content -%} From 274ef561409ddcca2d2d20a2f9ff5a79c9ef0dd1 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Sun, 16 Mar 2025 15:30:09 -0400 Subject: [PATCH 06/15] Phi-4 tool calls x template won't fixup messages; add comment re: passing template --- common/chat.cpp | 55 +++++++------------------------------------------ 1 file changed, 7 insertions(+), 48 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 377a18a7b7138..851a52d11354f 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1397,54 +1397,13 @@ static common_chat_params common_chat_params_init_phi_4(const common_chat_templa // For Phi-4, we need to inject tools into the system message // because the template expects tools in the system message with <|tool|> tags - if (inputs.tools.empty()) { - // No tools, use normal approach - data.prompt = apply(tmpl, inputs.messages, json::array(), inputs.add_generation_prompt); - } else { - // Make a copy of messages that we can modify - json adjusted_messages = inputs.messages; - - // Extract just the function part of the OpenAI-formatted tools - json phi4_tools = json::array(); - foreach_function(inputs.tools, [&](const json & tool) { - phi4_tools.push_back(tool.at("function")); - }); - - // Phi-4 template expects tools in the system message with <|tool|> tags. 
- // Find the system message, or add one if it doesn't exist - bool found_system_msg = false; - for (auto & message : adjusted_messages) { - if (message.contains("role") && message["role"] == "system") { - // Add tools to the existing system message and update content to mention tools - message["tools"] = phi4_tools; - - // If the system message doesn't mention tools, append that information - std::string content = message["content"]; - if (content.find("tool") == std::string::npos && - content.find("function") == std::string::npos) { - message["content"] = content + " You have access to some tools."; - } - - found_system_msg = true; - break; - } - } - - // If no system message, add one with tools - if (!found_system_msg && !adjusted_messages.empty()) { - json system_msg = { - {"role", "system"}, - {"content", "You are a helpful assistant with access to tools.\nTo use a tool, respond in this format: <|tool_call|>{\"name\": \"foo\", \"arguments\": {\"a\": 1}}<|/tool_call|>"}, - {"tools", phi4_tools} - }; - // Insert system message at the beginning - adjusted_messages.insert(adjusted_messages.begin(), system_msg); - } - - // Apply template with tools embedded in system message, passing empty tools separately - data.prompt = apply(tmpl, adjusted_messages, json(), inputs.add_generation_prompt); - } - + // The Phi-4 template has issues with tool calls. + // It is advisable to use --chat-template-file models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja + // - It expects tools from the system message (instead of as a global variable as most templates). + // - It does not print tool calls (this is worked around by the Minja + the generic mode, but without the <|tool_call|> syntax) + // - With defaults, it prints tool call results (messages such as {"role": "tool", "name": "foo", "content": "42"}) as <|tool|>42<|end|> which conflicts with the tool description wrapping mechanism. + // - Tool call results are expected to be injected as a message from the <|tool_response|> role. i.e. <|tool_response|>(json.dump())<|end|> data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); data.format = COMMON_CHAT_FORMAT_PHI_4; return data; } From b15b8096dce6074139a7bbf9ee165ce8e4965898 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Sun, 16 Mar 2025 15:35:19 -0400 Subject: [PATCH 07/15] Phi-4 template changes from testing latest version 1. Add tag wrapping individually for each tool 2. Avoid adding empty system message when there's no tools and no system message in input messages. 3. Move instructions to bottom (this worked a lot better when multiple tools added) Re: 1 There's no concrete justification for this other than it *seems* better. Unfortunately, the only material for multiple tool calls x Phi-4 is based on ollama, which is downstream of llama.cpp, and either way obfuscates how it's translated into multiple tools, and is unreliable per user reports.
https://colab.research.google.com/github/kinfey/Phi-3CookBook/blob/main/md/02.Application/07.FunctionCalling/Phi4/Multiagents/Phi_4_mini_multiagent.ipynb#scrollTo=NRgfmCBShDot%20via%20https://github.com/kinfey/Phi-3CookBook%20via%20https://github.com/kinfey/Phi-3CookBook/blob/main/md/02.Application/07.FunctionCalling/Phi4/Multiagents/Phi_4_mini_multiagent.ipynb --- ...ma-cpp-microsoft-Phi-4-mini-instruct.jinja | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja b/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja index 84e4668536c1b..c2a393d75599b 100644 --- a/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja +++ b/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja @@ -1,18 +1,20 @@ {%- if messages[0]["role"] == "system" %} {%- set system_message = messages[0]["content"] %} {% elif tools is defined -%} - {%- set system_message = "You are a helpful assistant with access to tools." -%} -{% else %} {%- set system_message = "" -%} {%- endif %} -{%- if tools is defined -%} - {%- set system_message = system_message + '<|tool|>' + (tools | tojson) + '<|/tool|>' -%} - {%- if '<|tool_call|>' not in system_message -%} - {%- set system_message = system_message + "\nTo use a tool, respond in this format: <|tool_call|>{\"name\": \"foo\", \"arguments\": {\"a\": 1}}<|/tool_call|>" %} - {%- endif %} -{%- endif %} + {%- if system_message is defined -%} - {{- '<|system|>' + system_message + '<|end|>' -}} + {{- '<|system|>' + system_message -}} + {%- if tools is defined -%} + {% for tool in tools %} + {{- '<|tool|>' + (tool['function'] | tojson) + '<|/tool|>' -}} + {% endfor %} + {%- if '<|tool_call|>' not in system_message -%} + {{- 'You are a helpful assistant with some tools.\nTo use a tool, respond in this format: <|tool_call|>{"name": "foo", "arguments": {"a": 1}}<|/tool_call|>' -}} + {%- endif -%} + {%- endif -%} + {{- '<|end|>' -}} {%- endif -%} {%- for message in messages -%} {%- if message['role'] == 'tool' -%} From 3ca03c7cd8ca3cb6b06f9980be04551fef7a4e03 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Sun, 16 Mar 2025 15:50:51 -0400 Subject: [PATCH 08/15] remove unnecessary nl --- common/chat.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/common/chat.cpp b/common/chat.cpp index 851a52d11354f..c4492e09d7a8b 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1414,7 +1414,6 @@ static common_chat_msg common_chat_parse_phi_4(const std::string & input) { return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex); } - static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; // (content)?({"name": "foo", "arguments": {"a": 1}})* From 8ccefe50ac6eab6730c60fac969ac2bc51a0bcde Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Sun, 16 Mar 2025 15:51:03 -0400 Subject: [PATCH 09/15] fix tests (they had incorrect tool call end tag) --- tests/test-chat.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index e47580274ea9b..3acac89aa4943 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -821,7 +821,7 @@ static void test_template_output_parsers() { "{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}"); } { - auto tmpls = 
read_templates("models/templates/microsoft-Phi-4-mini-instruct.jinja"); + auto tmpls = read_templates("models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja"); std::vector end_tokens{ "<|end|>" }; assert_equals(COMMON_CHAT_FORMAT_PHI_4, common_chat_templates_apply(tmpls.get(), inputs_tools).format); @@ -833,21 +833,21 @@ static void test_template_output_parsers() { assert_msg_equals( common_chat_msg{"assistant", "I'll help with that.", {}, tool_calls, "", "", ""}, common_chat_parse( - "I'll help with that.<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}", + "I'll help with that.<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}<|/tool_call|>", COMMON_CHAT_FORMAT_PHI_4)); // Test with content after tool call assert_msg_equals( common_chat_msg{"assistant", "I'll help with that.", {}, tool_calls, "", "", ""}, common_chat_parse( - "<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}I'll help with that.", + "<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}<|/tool_call|>I'll help with that.", COMMON_CHAT_FORMAT_PHI_4)); // Test with newlines. assert_msg_equals(message_assist_call, common_chat_parse( "<|tool_call|>\n" "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" - "", + "<|/tool_call|>", COMMON_CHAT_FORMAT_PHI_4)); } { From c2343b2aa56ba2ba3396dbb17a9702e1144cddd7 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Tue, 18 Mar 2025 18:21:31 -0400 Subject: [PATCH 10/15] Fix trailing whitespace via editorconfig action https://github.com/ggml-org/llama.cpp/actions/runs/13886820859/job/38855413335?pr=12288 --- common/chat.cpp | 2 +- common/chat.h | 2 +- tests/test-chat.cpp | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index c4492e09d7a8b..ada4535e189d4 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1399,7 +1399,7 @@ static common_chat_params common_chat_params_init_phi_4(const common_chat_templa // because the template expects tools in the system message with <|tool|> tags // The Phi-4 template has issues with tool calls. // It is advisable to use --chat-template-file models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja - // - It expects tools from the system message (instead of as a global variable as most templates). + // - It expects tools from the system message (instead of as a global variable as most templates). // - It does not print tool calls (this is worked around by the Minja + the generic mode, but without the <|tool_call|> syntax) // - With defaults, it prints tool call results (messages such as {"role": "tool", "name": "foo", "content": "42"}) as <|tool|>42<|end|> which conflicts with the tool description wrapping mechanism. // - Tool call results are expected to be injected as a message from the <|tool_response|> role. i.e. 
<|tool_response|>(json.dump())<|end|> diff --git a/common/chat.h b/common/chat.h index 5e804b6d0ef35..72215c49eb31d 100644 --- a/common/chat.h +++ b/common/chat.h @@ -57,7 +57,7 @@ enum common_chat_format { COMMON_CHAT_FORMAT_COMMAND_R7B, COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING, COMMON_CHAT_FORMAT_PHI_4, - + COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats }; diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 3acac89aa4943..e13befdb7c179 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -823,12 +823,12 @@ static void test_template_output_parsers() { { auto tmpls = read_templates("models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja"); std::vector end_tokens{ "<|end|>" }; - + assert_equals(COMMON_CHAT_FORMAT_PHI_4, common_chat_templates_apply(tmpls.get(), inputs_tools).format); - + // Test normal message without tools test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); - + // Test with content before tool call assert_msg_equals( common_chat_msg{"assistant", "I'll help with that.", {}, tool_calls, "", "", ""}, From a5d014b6176bfd53fc889e37bb1df2866a4a7e5a Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Wed, 19 Mar 2025 12:25:34 -0400 Subject: [PATCH 11/15] Test coverage in test_tool_call.py --- examples/server/tests/unit/test_tool_call.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/server/tests/unit/test_tool_call.py b/examples/server/tests/unit/test_tool_call.py index 569c2a1f8ea31..320bf68b5804d 100755 --- a/examples/server/tests/unit/test_tool_call.py +++ b/examples/server/tests/unit/test_tool_call.py @@ -138,6 +138,8 @@ def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", PYTHON_TOOL, "code"), ("fireworks-ai-llama-3-firefunction-v2", TEST_TOOL, "success"), # ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"), + ("microsoft-Phi-4-mini-instruct", TEST_TOOL, "success"), + ("microsoft-Phi-4-mini-instruct", PYTHON_TOOL, "code"), ]) def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None): global server @@ -164,6 +166,9 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + (TEST_TOOL, "success", "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), + (PYTHON_TOOL, "code", "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), + (TEST_TOOL, "success", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", "chatml"), @@ -306,6 +311,8 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), + ("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None), ("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", "chatml"), @@ -385,6 +392,7 @@ def do_test_weather(server: ServerProcess, **kwargs): 
@pytest.mark.slow @pytest.mark.parametrize("result_override,n_predict,hf_repo,template_override", [ (None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + (None, 128, "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), (None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None), (None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"), (None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), @@ -394,6 +402,7 @@ def do_test_weather(server: ServerProcess, **kwargs): (None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), (None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), (None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + (None, 128, "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", None), ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)), # TODO: fix these (wrong results, either didn't respect decimal instruction or got wrong value) @@ -535,6 +544,8 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), + ("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)), ("bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"), From 09b795def52d3e95622f16a27b46b1963005e71a Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Wed, 19 Mar 2025 14:16:44 -0400 Subject: [PATCH 12/15] Fix template expansion --- models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja b/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja index c2a393d75599b..dd3a199fe78e9 100644 --- a/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja +++ b/models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja @@ -25,7 +25,7 @@ {{- message['content'] -}} {%- endif -%} {%- for tool_call in message.tool_calls -%} - {{- '<|tool_call|>' + (tool_call | tojson) + '<|/tool_call|>' -}} + {{- '<|tool_call|>' + (tool_call['function'] | tojson) + '<|/tool_call|>' -}} {%- endfor -%} {{- '<|end|>' -}} {%- endif -%} From 61ff59ea669a0dd9bacf418df4b9c21ef4c9b6fe Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Wed, 19 Mar 2025 14:31:55 -0400 Subject: [PATCH 13/15] Ensure both <|tool|> and <|tool_response|> are in template before deciding its phi-4 --- common/chat.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/common/chat.cpp b/common/chat.cpp index ada4535e189d4..ee5b5bcd24090 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1702,7 +1702,11 @@ static common_chat_params common_chat_templates_apply_jinja( // Phi-4 mini. if (src.find("<|tool|>") != std::string::npos) { - return common_chat_params_init_phi_4(tmpl, params); + if (src.find("<|tool_response|>") != std::string::npos) { + return common_chat_params_init_phi_4(tmpl, params); + } else { + LOG_WRN("[%s] Invalid legacy Phi 4 template detected: switching to Generic tool call format. 
To enable native support, please restart with `--chat-template-file models/template/microsoft-Phi-4-mini-instruct.jinja`", __func__); + } } // Plain handler (no tools) From 65c254169fbb456d352ad6a37e181bf97366ec97 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Wed, 19 Mar 2025 14:32:45 -0400 Subject: [PATCH 14/15] test_tool_call.py tests: chatml; every test block w/Phi-3.5 has a Phi-4 equivalent --- examples/server/tests/unit/test_tool_call.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/server/tests/unit/test_tool_call.py b/examples/server/tests/unit/test_tool_call.py index 320bf68b5804d..a8675161fe2a9 100755 --- a/examples/server/tests/unit/test_tool_call.py +++ b/examples/server/tests/unit/test_tool_call.py @@ -168,6 +168,7 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, (TEST_TOOL, "success", "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), (PYTHON_TOOL, "code", "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), + (PYTHON_TOOL, "code", "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", "chatml"), (TEST_TOOL, "success", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None), @@ -312,6 +313,7 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), ("bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), + ("bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", "chatml"), ("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None), ("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", "chatml"), @@ -392,7 +394,9 @@ def do_test_weather(server: ServerProcess, **kwargs): @pytest.mark.slow @pytest.mark.parametrize("result_override,n_predict,hf_repo,template_override", [ (None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), - (None, 128, "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), + # Answers using text, not tools, complaining it wants to measure from the positive Z-axis not x-axis. 
+ # (None, 128, "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), + (None, 128, "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", "chatml"), (None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None), (None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"), (None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), @@ -545,6 +549,7 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), ("bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), + ("bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", "chatml"), ("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)), ("bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"), From ff78c90cc18cc9adbd9e509875ac40857558c5f4 Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Wed, 19 Mar 2025 17:44:04 -0400 Subject: [PATCH 15/15] fix: trailing whitespace --- common/chat.cpp | 2 +- examples/server/tests/unit/test_tool_call.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index ee5b5bcd24090..df28190d5eb26 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1706,7 +1706,7 @@ static common_chat_params common_chat_templates_apply_jinja( return common_chat_params_init_phi_4(tmpl, params); } else { LOG_WRN("[%s] Invalid legacy Phi 4 template detected: switching to Generic tool call format. To enable native support, please restart with `--chat-template-file models/template/microsoft-Phi-4-mini-instruct.jinja`", __func__); - } + } } // Plain handler (no tools) diff --git a/examples/server/tests/unit/test_tool_call.py b/examples/server/tests/unit/test_tool_call.py index a8675161fe2a9..206d087ff1e3e 100755 --- a/examples/server/tests/unit/test_tool_call.py +++ b/examples/server/tests/unit/test_tool_call.py @@ -394,7 +394,7 @@ def do_test_weather(server: ServerProcess, **kwargs): @pytest.mark.slow @pytest.mark.parametrize("result_override,n_predict,hf_repo,template_override", [ (None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), - # Answers using text, not tools, complaining it wants to measure from the positive Z-axis not x-axis. + # Answers using text, not tools, complaining it wants to measure from the positive Z-axis not x-axis. # (None, 128, "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)), (None, 128, "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", "chatml"), (None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None),