
Commit d1a7ced

ENH: support chat_template_kwargs for llama.cpp (#3988)
1 parent 9827cd2 commit d1a7ced
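
For context, this change lets callers forward chat-template arguments to models served on the llama.cpp backend. A hedged usage sketch, assuming the standard xinference client API; the model uid and the `enable_thinking` kwarg are illustrative, not taken from the commit:

```python
# Hypothetical client-side usage after this change; the model uid and the
# specific template kwarg are illustrative examples.
from xinference.client import Client

client = Client("http://localhost:9997")
model = client.get_model("my-llama-cpp-model")
completion = model.chat(
    messages=[{"role": "user", "content": "Hello"}],
    # chat_template_kwargs rides along inside generate_config, which is
    # where the new code in core.py extracts it from.
    generate_config={"chat_template_kwargs": {"enable_thinking": False}},
)
```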

File tree

3 files changed: +18 -4 lines changed


xinference/model/llm/llama_cpp/core.py

Lines changed: 13 additions & 1 deletion
```diff
@@ -23,7 +23,7 @@

 from ....constants import XINFERENCE_MAX_TOKENS
 from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk
-from ..core import LLM
+from ..core import LLM, chat_context_var
 from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import ChatModelMixin

@@ -297,6 +297,15 @@ def chat(
         if not generate_config.get("max_tokens") and XINFERENCE_MAX_TOKENS:
             generate_config["max_tokens"] = XINFERENCE_MAX_TOKENS
         stream = generate_config.get("stream", False)
+
+        chat_template_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
+        )
+        chat_context_var.set(chat_template_kwargs)
+
         tools = generate_config.pop("tools", []) if generate_config else None
         q: queue.Queue = queue.Queue()

@@ -314,6 +323,9 @@ def _handle_chat_completion():
                     "model": self.model_uid,
                 }
             )
+            if chat_template_kwargs:
+                data["chat_template_kwargs"] = chat_template_kwargs
+
             try:

                 def _callback(res):
```
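
The interesting part of this file is the ContextVar handoff: the kwargs extracted from `generate_config` are published through `chat_context_var` so that code rendering the chat template can read them without threading a new parameter through every call. A minimal, self-contained sketch of that pattern; only `chat_context_var`'s role comes from the diff above, and the helper functions are invented for illustration:

```python
# Sketch of the ContextVar pattern; chat() and render_prompt() are
# hypothetical stand-ins, not the actual xinference functions.
import contextvars

# Roughly how ..core's chat_context_var behaves: a per-context variable
# defaulting to an empty dict (a shared default is fine for a sketch).
chat_context_var: contextvars.ContextVar[dict] = contextvars.ContextVar(
    "chat_context", default={}
)

def chat(generate_config: dict) -> None:
    # Pull template kwargs out of the request and publish them for any
    # code running later in this same context.
    kwargs = generate_config.pop("chat_template_kwargs", {}) or {}
    chat_context_var.set(kwargs)
    render_prompt()

def render_prompt() -> None:
    # Downstream code reads the variable without it appearing in its
    # own signature.
    kwargs = chat_context_var.get()
    print(f"rendering with chat_template_kwargs={kwargs}")

chat({"chat_template_kwargs": {"enable_thinking": False}})
```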

xinference/model/llm/llm_family.json

Lines changed: 3 additions & 3 deletions
```diff
@@ -7687,7 +7687,7 @@
       "packages": [
         "transformers>=4.51.3",
         "mlx-lm>=0.23.1 ; sys_platform=='darwin'",
-        "numpy==1.26.4"
+        "#system_numpy#"
       ]
     }
   },
@@ -15521,7 +15521,7 @@
     "virtualenv": {
       "packages": [
         "git+https://github.com/huggingface/[email protected]",
-        "numpy==1.26.4",
+        "#system_numpy#",
         "qwen_omni_utils",
         "soundfile"
       ]
@@ -17302,7 +17302,7 @@
       "packages": [
         "transformers>=4.51.0",
         "mlx-lm>=0.24.0 ; sys_platform=='darwin'",
-        "numpy==1.26.4"
+        "#system_numpy#"
      ]
    }
  },
```
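
The JSON change swaps a hard `numpy==1.26.4` pin for a `#system_numpy#` placeholder in three virtualenv package lists, presumably so the pin can follow the numpy already installed on the host. A hypothetical sketch of such a substitution; the commit does not show xinference's actual placeholder handling, so everything below is an assumption:

```python
# Hypothetical expansion of the "#system_numpy#" placeholder; this is an
# assumed mechanism, not the actual xinference resolver.
from importlib.metadata import version

def expand_packages(packages: list[str]) -> list[str]:
    resolved = []
    for pkg in packages:
        if pkg == "#system_numpy#":
            # Pin numpy to whatever version the host environment already
            # has, instead of the hard-coded numpy==1.26.4 it replaces.
            resolved.append(f"numpy=={version('numpy')}")
        else:
            resolved.append(pkg)
    return resolved

print(expand_packages(["transformers>=4.51.3", "#system_numpy#"]))
```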

xinference/ui/gradio/chat_interface.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -135,6 +135,8 @@ def generate_wrapper(
             generate_config=generate_config,  # type: ignore
         ):
             assert isinstance(chunk, dict)
+            if not chunk["choices"]:
+                continue
             delta = chunk["choices"][0]["delta"]

             if (
```
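
The Gradio fix guards against stream chunks whose `choices` list is empty, which would otherwise crash the `chunk["choices"][0]` access. A small illustration of the failure mode; the usage-only final chunk is an assumption about when empty `choices` occur, not something the diff states:

```python
# Illustrative chunks as a streaming chat might yield them; the trailing
# usage-style chunk with empty "choices" is an assumed example.
chunks = [
    {"choices": [{"delta": {"content": "Hel"}}]},
    {"choices": [{"delta": {"content": "lo"}}]},
    {"choices": [], "usage": {"total_tokens": 5}},  # [0] would raise IndexError
]

for chunk in chunks:
    assert isinstance(chunk, dict)
    if not chunk["choices"]:
        continue  # skip chunks that carry no delta, as the patch does
    delta = chunk["choices"][0]["delta"]
    print(delta["content"], end="")
```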
