tool-call: Phi-4 support #12288

Status: Open. Wants to merge 21 commits into master.

Commits (21)
c3aac4e `tool-call`: Phi-4 support (jpohhhh, Mar 9, 2025)
eae5d97 Merge pull request #1 from ochafik/Telosnex_phi4_tools_template (jpohhhh, Mar 14, 2025)
32d32ef Revert some bits (jpohhhh, Mar 15, 2025)
32ab329 Fix tokens (jpohhhh, Mar 15, 2025)
094f607 Tweak tool response (jpohhhh, Mar 15, 2025)
258b912 Merge branch 'master' of https://github.com/ggml-org/llama.cpp into p… (jpohhhh, Mar 16, 2025)
274ef56 Phi-4 tool calls x template won't fixup messages; add comment re: pas… (jpohhhh, Mar 16, 2025)
b15b809 Phi-4 template changes from testing latest version (jpohhhh, Mar 16, 2025)
f74aee0 Merge branch 'master' of https://github.com/ggml-org/llama.cpp into p… (jpohhhh, Mar 16, 2025)
3ca03c7 remove unnecessary nl (jpohhhh, Mar 16, 2025)
8ccefe5 fix tests (they had incorrect tool call end tag) (jpohhhh, Mar 16, 2025)
6d53c24 Merge branch 'master' of https://github.com/ggml-org/llama.cpp into p… (jpohhhh, Mar 18, 2025)
c2343b2 Fix trailing whitespace via editorconfig action (jpohhhh, Mar 18, 2025)
a5d014b Test coverage in test_tool_call.py (jpohhhh, Mar 19, 2025)
5cd800b Merge branch 'master' of https://github.com/ggml-org/llama.cpp into p… (jpohhhh, Mar 19, 2025)
09b795d Fix template expansion (jpohhhh, Mar 19, 2025)
61ff59e Ensure both <|tool|> and <|tool_response|> are in template before dec… (jpohhhh, Mar 19, 2025)
65c2541 test_tool_call.py tests: chatml; every test block w/Phi-3.5 has a Phi… (jpohhhh, Mar 19, 2025)
e450590 Merge branch 'master' of https://github.com/ggml-org/llama.cpp into p… (jpohhhh, Mar 19, 2025)
ff78c90 fix: trailing whitespace (jpohhhh, Mar 19, 2025)
42858f6 Merge branch 'master' of https://github.com/ggml-org/llama.cpp into p… (jpohhhh, Mar 19, 2025)

77 changes: 73 additions & 4 deletions common/chat.cpp
@@ -448,6 +448,7 @@ std::string common_chat_format_name(common_chat_format format) {
case COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING: return "Hermes 2 Pro (extract reasoning)";
case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING: return "Command R7B (extract reasoning)";
case COMMON_CHAT_FORMAT_PHI_4: return "Phi-4";
default:
throw std::runtime_error("Unknown chat format");
}
@@ -583,10 +584,7 @@ static common_chat_msg parse_json_tool_calls(
}

if (!result.tool_calls.empty()) {
-        if (!string_strip(result.content).empty()) {
-            LOG_WRN("Content found with tool calls: %s\n", result.content.c_str());
-        }
-        result.content = "";
+        result.content = string_strip(result.content);
}
return result;
}
@@ -1356,6 +1354,66 @@ static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::s
return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
}

static common_chat_params common_chat_params_init_phi_4(const common_chat_template & tmpl, const struct templates_params & inputs) {
// Phi-4 has a unique format that expects tools in the system message with <|tool|> tags
// and returns function calls as a JSON object after <|tool_call|> tag
common_chat_params data;

data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
std::vector<std::string> tool_rules;
std::vector<std::string> tool_call_alts;
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
std::string name = function.at("name");
auto parameters = function.at("parameters");
builder.resolve_refs(parameters);
auto call_rule = builder.add_schema(name + "-call", {
{"type", "object"},
{"properties", {
{"name", {{"const", name}}},
{"arguments", parameters},
}},
{"required", json::array({"name", "arguments"})},
});
tool_rules.push_back(builder.add_rule(name + "-call", "\"<|tool_call|>\" " + call_rule + " \"<|/tool_call|>\""));
});
auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space");
std::vector<std::string> alt_tags {
any_tool_call,
};
tool_call_alts.push_back(any_tool_call);
auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | "));
builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool_call|>"});
data.preserved_tokens = {
"<|tool_call|>",
"<|/tool_call|>",
"<|tool_response|>",
"<|tool|>",
"<|/tool|>",
};
});

// For Phi-4, we need to inject tools into the system message
// because the template expects tools in the system message with <|tool|> tags
// The Phi-4 template has issues with tool calls.
// It is advisable to use --chat-template-file models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja
// - It expects tools from the system message (instead of as a global variable as most templates).
// - It does not print tool calls (this is worked around by the Minja + the generic mode, but without the <|tool_call|> syntax)
// - With defaults, it prints tool call results (messages such as {"role": "tool", "name": "foo", "content": "42"}) as <|tool|>42<|end|> which conflicts with the tool description wrapping mechanism.
// - Tool call results are expected to be injected as a message from the <|tool_response|> role. i.e. <|tool_response|>(json.dump())<|end|>
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_PHI_4;
return data;
}

static common_chat_msg common_chat_parse_phi_4(const std::string & input) {
static std::regex function_regex("<\\|tool_call\\|>\\s*\\{\\s*\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"arguments\"\\s*:");
static std::regex close_regex(R"(\}\s*(<\|/tool_call\|>)?)");
return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex);
}

static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
// (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*
@@ -1642,6 +1700,15 @@ static common_chat_params common_chat_templates_apply_jinja(
return common_chat_params_init_firefunction_v2(tmpl, params);
}

// Phi-4 mini.
if (src.find("<|tool|>") != std::string::npos) {
if (src.find("<|tool_response|>") != std::string::npos) {
return common_chat_params_init_phi_4(tmpl, params);
} else {
LOG_WRN("[%s] Invalid legacy Phi 4 template detected: switching to Generic tool call format. To enable native support, please restart with `--chat-template-file models/template/microsoft-Phi-4-mini-instruct.jinja`", __func__);
}
}

// Plain handler (no tools)
if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
return common_chat_params_init_without_tools(tmpl, params);
@@ -1773,6 +1840,8 @@ common_chat_msg common_chat_parse(const std::string & input, common_chat_format
return common_chat_parse_command_r7b(input, /* extract_reasoning= */ false);
case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING:
return common_chat_parse_command_r7b(input, /* extract_reasoning= */ true);
case COMMON_CHAT_FORMAT_PHI_4:
return common_chat_parse_phi_4(input);
default:
throw std::runtime_error("Unsupported format: " + common_chat_format_name(format));
}
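For reference, here is a minimal Python sketch of what the new `common_chat_parse_phi_4` accepts: the same regex prefix as above, plus JSON decoding of the arguments object. The helper name and the use of `json.JSONDecoder.raw_decode` are illustrative assumptions, not part of this PR:

```python
import json
import re

# Python equivalent of the std::regex prefix in common_chat_parse_phi_4 (illustrative only).
FUNCTION_RE = re.compile(r'<\|tool_call\|>\s*\{\s*"name"\s*:\s*"([^"]+)"\s*,\s*"arguments"\s*:')

def parse_phi4_tool_calls(output: str) -> list[tuple[str, dict]]:
    """Sketch: extract (name, arguments) pairs from Phi-4 style output."""
    calls = []
    for m in FUNCTION_RE.finditer(output):
        # The arguments object begins right after the matched prefix;
        # raw_decode reads exactly one JSON value and stops there.
        args, _ = json.JSONDecoder().raw_decode(output[m.end():].lstrip())
        calls.append((m.group(1), args))
    return calls

print(parse_phi4_tool_calls(
    'I\'ll help.<|tool_call|>{"name": "special_function", "arguments": {"arg1": 1}}<|/tool_call|>'))
# -> [('special_function', {'arg1': 1})]
```
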
1 change: 1 addition & 0 deletions common/chat.h
@@ -56,6 +56,7 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
COMMON_CHAT_FORMAT_COMMAND_R7B,
COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
COMMON_CHAT_FORMAT_PHI_4,

COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
14 changes: 10 additions & 4 deletions docs/function-calling.md
@@ -12,11 +12,12 @@ Function calling is supported for all models (see https://github.com/ggml-org/ll
- Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
- Functionary v3.1 / v3.2
- Hermes 2/3, Qwen 2.5
- - Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034)
+ - Qwen 2.5 Coder (#12034)
- Mistral Nemo
- Firefunction v2
- - Command R7B
- - DeepSeek R1 (WIP / seems reluctant to call any tools?)
+ - Command R7B (#11585)
+ - DeepSeek R1 (#11607)
+ - Phi 4 (#12288)

- Generic tool call is supported when the template isn't recognized by native format handlers (you'll see `Chat format: Generic` in the logs).
- Use `--chat-template-file` to override the template when appropriate (see examples below)
@@ -297,9 +298,14 @@ llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L \
llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M \
--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja

+ # Native support for Phi 4 also needs a template override (official template is buggy)

+ llama-server --jinja -fa -hf bartowski/microsoft_Phi-4-mini-instruct-GGUF \
+     --chat-template-file models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja

# Native support requires the right template for these GGUFs:

- llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M
+ llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M \
+     --chat-template-file models/templates/meetkai-functionary-medium-v3.2.jinja

llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \
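Once the server is up with the override above, the tool-call path can be smoke-tested through the OpenAI-compatible `/v1/chat/completions` endpoint. A rough sketch; the `get_weather` tool is a made-up example, and the default port 8080 is assumed:

```python
import requests

resp = requests.post("http://localhost:8080/v1/chat/completions", json={
    "messages": [{"role": "user", "content": "What is the weather in Istanbul?"}],
    "tools": [{
        "type": "function",
        "function": {  # hypothetical tool, for illustration only
            "name": "get_weather",
            "description": "Get the current weather for a location.",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
                "required": ["location"],
            },
        },
    }],
    "tool_choice": "required",
})
# With native Phi-4 support the reply should carry a parsed tool_calls array
# rather than raw <|tool_call|> text in the content.
print(resp.json()["choices"][0]["message"].get("tool_calls"))
```
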
16 changes: 16 additions & 0 deletions examples/server/tests/unit/test_tool_call.py
@@ -138,6 +138,8 @@ def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict,
("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", PYTHON_TOOL, "code"),
("fireworks-ai-llama-3-firefunction-v2", TEST_TOOL, "success"),
# ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"),
("microsoft-Phi-4-mini-instruct", TEST_TOOL, "success"),
("microsoft-Phi-4-mini-instruct", PYTHON_TOOL, "code"),
])
def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None):
global server
@@ -164,6 +166,10 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict,
(PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
(PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),

(TEST_TOOL, "success", "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)),
(PYTHON_TOOL, "code", "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)),
(PYTHON_TOOL, "code", "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", "chatml"),

(TEST_TOOL, "success", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None),
(PYTHON_TOOL, "code", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None),
(PYTHON_TOOL, "code", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", "chatml"),
@@ -306,6 +312,9 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t
("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),

[Review comment (Collaborator)] Could you throw in a default (generic) test case for hello_world, weather and calc_result? Just in case their default template changes and something goes boom.

("bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M",      None),

("bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)),
("bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", "chatml"),

("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None),
("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", "chatml"),

@@ -385,6 +394,9 @@ def do_test_weather(server: ServerProcess, **kwargs):
@pytest.mark.slow
@pytest.mark.parametrize("result_override,n_predict,hf_repo,template_override", [
(None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
# Answers using text, not tools, complaining it wants to measure from the positive Z-axis not x-axis.
[Review comment (Collaborator)] Note that this (crucial) test doesn't test tool call emission, but leveraging of tool call results. Failure indicates that the model doesn't understand the syntax used to give it the result.

# (None, 128, "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)),
(None, 128, "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", "chatml"),
(None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None),
(None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"),
(None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
@@ -394,6 +406,7 @@ def do_test_weather(server: ServerProcess, **kwargs):
(None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
(None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
(None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
(None, 128, "bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", None),
("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),

# TODO: fix these (wrong results, either didn't respect decimal instruction or got wrong value)
@@ -535,6 +548,9 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none']
("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),

("bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", ("llama-cpp-microsoft-Phi-4-mini-instruct", None)),
("bartowski/microsoft_Phi-4-mini-instruct-GGUF:Q4_K_M", "chatml"),

("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)),
("bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),

1 change: 1 addition & 0 deletions models/templates/README.md
@@ -19,4 +19,5 @@ These templates can be updated with the following commands:
./scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use > models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja
./scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use > models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja
./scripts/get_chat_template.py Qwen/Qwen2.5-7B-Instruct > models/templates/Qwen-Qwen2.5-7B-Instruct.jinja
./scripts/get_chat_template.py microsoft/Phi-4-mini-instruct > models/templates/microsoft-Phi-4-mini-instruct.jinja
```
37 changes: 37 additions & 0 deletions models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja
@@ -0,0 +1,37 @@
{%- if messages[0]["role"] == "system" %}
{%- set system_message = messages[0]["content"] %}
{% elif tools is defined -%}
{%- set system_message = "" -%}
{%- endif %}

{%- if system_message is defined -%}
{{- '<|system|>' + system_message -}}
{%- if tools is defined -%}
{% for tool in tools %}
{{- '<|tool|>' + (tool['function'] | tojson) + '<|/tool|>' -}}
{% endfor %}
{%- if '<|tool_call|>' not in system_message -%}
{{- 'You are a helpful assistant with some tools.\nTo use a tool, respond in this format: <|tool_call|>{"name": "foo", "arguments": {"a": 1}}<|/tool_call|>' -}}
{%- endif -%}
{%- endif -%}
{{- '<|end|>' -}}
{%- endif -%}
{%- for message in messages -%}
{%- if message['role'] == 'tool' -%}
{{- '<|tool_response|>' + (message['content'] | tojson) + '<|end|>' -}}
[Review comment (Collaborator)] This seems to be the cause of the test_calc_result failure. Good news is, the following (wild hunch / inspired by other tool call styles) makes it work:

{{- '<|tool_response|>' + message['name'] + '<|tag|>' + (message['content'] | tojson) + '<|end|>' -}}

{%- elif message['role'] != 'system' -%}
{{- '<|' + message['role'] + '|>' -}}
{%- if message.content -%}
{{- message['content'] -}}
{%- endif -%}
{%- for tool_call in message.tool_calls -%}
{{- '<|tool_call|>' + (tool_call['function'] | tojson) + '<|/tool_call|>' -}}
{%- endfor -%}
{{- '<|end|>' -}}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{- '<|assistant|>' -}}
{%- else -%}
{{- eos_token -}}
{%- endif -%}
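To see the wire format this template produces, it can be rendered with Python's Jinja2 against a sample conversation. This is a sketch assuming a llama.cpp checkout and `pip install jinja2`; llama.cpp itself uses minja, which may tolerate missing fields differently, so the sample messages carry explicit `tool_calls` lists, and the `add` tool is a made-up example:

```python
import jinja2

env = jinja2.Environment(loader=jinja2.FileSystemLoader("models/templates"))
tmpl = env.get_template("llama-cpp-microsoft-Phi-4-mini-instruct.jinja")

print(tmpl.render(
    messages=[
        {"role": "system", "content": "You are helpful.", "tool_calls": []},
        {"role": "user", "content": "Add 1 and 2.", "tool_calls": []},
        {"role": "assistant", "content": None,
         "tool_calls": [{"function": {"name": "add", "arguments": {"a": 1, "b": 2}}}]},
        {"role": "tool", "name": "add", "content": "3"},
    ],
    tools=[{"type": "function",
            "function": {"name": "add", "parameters": {"type": "object"}}}],
    add_generation_prompt=True,
    eos_token="<|end|>",
))
# Expected shape (roughly, shown wrapped):
#   <|system|>You are helpful.<|tool|>{...add schema...}<|/tool|>You are a helpful
#   assistant with some tools. ...<|end|>
#   <|user|>Add 1 and 2.<|end|>
#   <|assistant|><|tool_call|>{...name/arguments JSON...}<|/tool_call|><|end|>
#   <|tool_response|>"3"<|end|>
#   <|assistant|>
```
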
1 change: 1 addition & 0 deletions models/templates/microsoft-Phi-4-mini-instruct.jinja
@@ -0,0 +1 @@
{% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}
30 changes: 30 additions & 0 deletions tests/test-chat.cpp
@@ -820,6 +820,36 @@ static void test_template_output_parsers() {
test_templates(tmpls.get(), end_tokens, message_assist_call, tools,
"{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}");
}
{
auto tmpls = read_templates("models/templates/llama-cpp-microsoft-Phi-4-mini-instruct.jinja");
std::vector<std::string> end_tokens{ "<|end|>" };

assert_equals(COMMON_CHAT_FORMAT_PHI_4, common_chat_templates_apply(tmpls.get(), inputs_tools).format);

// Test normal message without tools
test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);

// Test with content before tool call
assert_msg_equals(
common_chat_msg{"assistant", "I'll help with that.", {}, tool_calls, "", "", ""},
common_chat_parse(
"I'll help with that.<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}<|/tool_call|>",
COMMON_CHAT_FORMAT_PHI_4));

// Test with content after tool call
assert_msg_equals(
common_chat_msg{"assistant", "I'll help with that.", {}, tool_calls, "", "", ""},
common_chat_parse(
"<|tool_call|>{\"name\":\"special_function\",\"arguments\":{\"arg1\":1}}<|/tool_call|>I'll help with that.",
COMMON_CHAT_FORMAT_PHI_4));

// Test with newlines.
assert_msg_equals(message_assist_call, common_chat_parse(
"<|tool_call|>\n"
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
"<|/tool_call|>",
COMMON_CHAT_FORMAT_PHI_4));
}
{
auto tmpls = read_templates("models/templates/meetkai-functionary-medium-v3.1.jinja");
std::vector<std::string> end_tokens{ "<|eom_id|>", "<|eot_id|>" };