
Commit d1a7ced

ENH: support chat_template_kwargs for llama.cpp (#3988)
1 parent 9827cd2 commit d1a7ced
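
For context, this change lets callers forward chat-template arguments to models served on the llama.cpp backend. A hedged usage sketch, assuming the standard xinference client API; the model uid and the `enable_thinking` kwarg are illustrative, not taken from the commit:

```python
# Hypothetical client-side usage after this change; the model uid and the
# specific template kwarg are illustrative examples.
from xinference.client import Client

client = Client("http://localhost:9997")
model = client.get_model("my-llama-cpp-model")
completion = model.chat(
    messages=[{"role": "user", "content": "Hello"}],
    # chat_template_kwargs rides along inside generate_config, which is
    # where the new code in core.py extracts it from.
    generate_config={"chat_template_kwargs": {"enable_thinking": False}},
)
```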

File tree

3 files changed: +18 -4 lines changed


xinference/model/llm/llama_cpp/core.py

Lines changed: 13 additions & 1 deletion
```diff
@@ -23,7 +23,7 @@

 from ....constants import XINFERENCE_MAX_TOKENS
 from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk
-from ..core import LLM
+from ..core import LLM, chat_context_var
 from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import ChatModelMixin

@@ -297,6 +297,15 @@ def chat(
         if not generate_config.get("max_tokens") and XINFERENCE_MAX_TOKENS:
             generate_config["max_tokens"] = XINFERENCE_MAX_TOKENS
         stream = generate_config.get("stream", False)
+
+        chat_template_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
+        )
+        chat_context_var.set(chat_template_kwargs)
+
         tools = generate_config.pop("tools", []) if generate_config else None
         q: queue.Queue = queue.Queue()

@@ -314,6 +323,9 @@ def _handle_chat_completion():
                     "model": self.model_uid,
                 }
             )
+            if chat_template_kwargs:
+                data["chat_template_kwargs"] = chat_template_kwargs
+
             try:

                 def _callback(res):
```
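
The interesting part of this file is the ContextVar handoff: the kwargs extracted from `generate_config` are published through `chat_context_var` so that code rendering the chat template can read them without threading a new parameter through every call. A minimal, self-contained sketch of that pattern; only `chat_context_var`'s role comes from the diff above, and the helper functions are invented for illustration:

```python
# Sketch of the ContextVar pattern; chat() and render_prompt() are
# hypothetical stand-ins, not the actual xinference functions.
import contextvars

# Roughly how ..core's chat_context_var behaves: a per-context variable
# defaulting to an empty dict (a shared default is fine for a sketch).
chat_context_var: contextvars.ContextVar[dict] = contextvars.ContextVar(
    "chat_context", default={}
)

def chat(generate_config: dict) -> None:
    # Pull template kwargs out of the request and publish them for any
    # code running later in this same context.
    kwargs = generate_config.pop("chat_template_kwargs", {}) or {}
    chat_context_var.set(kwargs)
    render_prompt()

def render_prompt() -> None:
    # Downstream code reads the variable without it appearing in its
    # own signature.
    kwargs = chat_context_var.get()
    print(f"rendering with chat_template_kwargs={kwargs}")

chat({"chat_template_kwargs": {"enable_thinking": False}})
```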

xinference/model/llm/llm_family.json

Lines changed: 3 additions & 3 deletions
```diff
@@ -7687,7 +7687,7 @@
       "packages": [
         "transformers>=4.51.3",
         "mlx-lm>=0.23.1 ; sys_platform=='darwin'",
-        "numpy==1.26.4"
+        "#system_numpy#"
       ]
     }
   },
@@ -15521,7 +15521,7 @@
     "virtualenv": {
       "packages": [
         "git+https://github.com/huggingface/[email protected]",
-        "numpy==1.26.4",
+        "#system_numpy#",
         "qwen_omni_utils",
         "soundfile"
       ]
@@ -17302,7 +17302,7 @@
       "packages": [
         "transformers>=4.51.0",
         "mlx-lm>=0.24.0 ; sys_platform=='darwin'",
-        "numpy==1.26.4"
+        "#system_numpy#"
      ]
    }
  },
```
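
The JSON change swaps a hard `numpy==1.26.4` pin for a `#system_numpy#` placeholder in three virtualenv package lists, presumably so the pin can follow the numpy already installed on the host. A hypothetical sketch of such a substitution; the commit does not show xinference's actual placeholder handling, so everything below is an assumption:

```python
# Hypothetical expansion of the "#system_numpy#" placeholder; this is an
# assumed mechanism, not the actual xinference resolver.
from importlib.metadata import version

def expand_packages(packages: list[str]) -> list[str]:
    resolved = []
    for pkg in packages:
        if pkg == "#system_numpy#":
            # Pin numpy to whatever version the host environment already
            # has, instead of the hard-coded numpy==1.26.4 it replaces.
            resolved.append(f"numpy=={version('numpy')}")
        else:
            resolved.append(pkg)
    return resolved

print(expand_packages(["transformers>=4.51.3", "#system_numpy#"]))
```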

xinference/ui/gradio/chat_interface.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -135,6 +135,8 @@ def generate_wrapper(
             generate_config=generate_config,  # type: ignore
         ):
             assert isinstance(chunk, dict)
+            if not chunk["choices"]:
+                continue
             delta = chunk["choices"][0]["delta"]

             if (
```
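
The Gradio fix guards against stream chunks whose `choices` list is empty, which would otherwise crash the `chunk["choices"][0]` access. A small illustration of the failure mode; the usage-only final chunk is an assumption about when empty `choices` occur, not something the diff states:

```python
# Illustrative chunks as a streaming chat might yield them; the trailing
# usage-style chunk with empty "choices" is an assumed example.
chunks = [
    {"choices": [{"delta": {"content": "Hel"}}]},
    {"choices": [{"delta": {"content": "lo"}}]},
    {"choices": [], "usage": {"total_tokens": 5}},  # [0] would raise IndexError
]

for chunk in chunks:
    assert isinstance(chunk, dict)
    if not chunk["choices"]:
        continue  # skip chunks that carry no delta, as the patch does
    delta = chunk["choices"][0]["delta"]
    print(delta["content"], end="")
```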
