triton-inference-server · pskiran1 · Dec 4, 2025 · Nov 24, 2025 · Nov 24, 2025 · Nov 24, 2025
diff --git a/python/openai/openai_frontend/engine/triton_engine.py b/python/openai/openai_frontend/engine/triton_engine.py
@@ -57,6 +57,8 @@
     _create_trtllm_generate_request,
     _create_vllm_embedding_request,
     _create_vllm_generate_request,
+    _get_openai_chat_format_logprobs_from_vllm_response,
+    _get_openai_completion_format_logprobs_from_vllm_response,
     _get_output,
     _get_usage_from_response,
     _get_vllm_lora_names,
@@ -66,6 +68,7 @@
 from schemas.openai import (
     ChatCompletionChoice,
     ChatCompletionFinishReason,
+    ChatCompletionLogprobs,
     ChatCompletionMessageToolCall,
     ChatCompletionMessageToolCallChunk,
     ChatCompletionNamedToolChoice,
@@ -255,13 +258,22 @@ async def chat(
             response, metadata.backend, RequestKind.GENERATION
         )
 
+        # Parse logprobs if requested
+        logprobs_data = None
+        if request.logprobs:
+            openai_logprobs = _get_openai_chat_format_logprobs_from_vllm_response(
+                response
+            )
+            if openai_logprobs:
+                logprobs_data = ChatCompletionLogprobs(content=openai_logprobs)
+
         return CreateChatCompletionResponse(
             id=request_id,
             choices=[
                 ChatCompletionChoice(
                     index=0,
                     message=response_message,
-                    logprobs=None,
+                    logprobs=logprobs_data,
                     finish_reason=finish_reason,
                 )
             ],
@@ -360,10 +372,17 @@ async def completion(
             response, metadata.backend, RequestKind.GENERATION
         )
 
+        # Parse logprobs if requested
+        logprobs_data = None
+        if request.logprobs is not None and request.logprobs > 0:
+            logprobs_data = _get_openai_completion_format_logprobs_from_vllm_response(
+                response
+            )
+
         choice = Choice(
             finish_reason=FinishReason.stop,
             index=0,
-            logprobs=None,
+            logprobs=logprobs_data,
             text=text,
         )
         return CreateCompletionResponse(
@@ -605,6 +624,15 @@ async def _streaming_chat_iterator(
             )
             previous_text = current_text
 
+            # Parse logprobs for this chunk if requested
+            chunk_logprobs = None
+            if request.logprobs:
+                openai_logprobs = _get_openai_chat_format_logprobs_from_vllm_response(
+                    response
+                )
+                if openai_logprobs:
+                    chunk_logprobs = ChatCompletionLogprobs(content=openai_logprobs)
+
             # if the response delta is None (e.g. because it was a
             # "control token" for tool calls or the parser otherwise
             # wasn't ready to send a token, then
@@ -618,7 +646,7 @@ async def _streaming_chat_iterator(
             choice = ChatCompletionStreamingResponseChoice(
                 index=0,
                 delta=response_delta,
-                logprobs=None,
+                logprobs=chunk_logprobs,
                 finish_reason=finish_reason,
             )
 
@@ -791,8 +819,19 @@ def _validate_chat_request(
                 f"Received n={request.n}, but only single choice (n=1) is currently supported"
             )
 
-        if request.logit_bias is not None or request.logprobs:
-            raise ClientError("logit bias and log probs not currently supported")
+        if request.logit_bias is not None:
+            raise ClientError("logit bias is not currently supported")
+
+        # Logprobs are only supported for vLLM backend currently
+        if metadata.backend != "vllm" and (
+            request.logprobs is not None or request.top_logprobs is not None
+        ):
+            raise ClientError(
+                "logprobs are currently available only for the vLLM backend"
+            )
+
+        if request.top_logprobs is not None and not request.logprobs:
+            raise ClientError("`top_logprobs` can only be used when `logprobs` is True")
 
         self._verify_chat_tool_call_settings(request=request)
 
@@ -847,16 +886,32 @@ async def _streaming_completion_iterator(
         model = request.model
         include_usage = request.stream_options and request.stream_options.include_usage
         usage_accumulator = _StreamingUsageAccumulator(backend)
+        current_offset = 0
 
         async for response in responses:
             if include_usage:
                 usage_accumulator.update(response)
 
             text = _get_output(response)
+
+            # Parse logprobs for this chunk if requested
+            chunk_logprobs = None
+            if request.logprobs is not None and request.logprobs > 0:
+                chunk_logprobs = (
+                    _get_openai_completion_format_logprobs_from_vllm_response(response)
+                )
+                # Adjust text offsets based on accumulated output
+                if chunk_logprobs and chunk_logprobs.text_offset:
+                    chunk_logprobs.text_offset = [
+                        offset + current_offset for offset in chunk_logprobs.text_offset
+                    ]
+
+            current_offset += len(text)
+
             choice = Choice(
                 finish_reason=FinishReason.stop if response.final else None,
                 index=0,
-                logprobs=None,
+                logprobs=chunk_logprobs,
                 text=text,
             )
             chunk = CreateCompletionResponse(
@@ -942,8 +997,18 @@ def _validate_completion_request(
                 f"Received best_of={request.best_of}, but only single choice (best_of=1) is currently supported"
             )
 
-        if request.logit_bias is not None or request.logprobs is not None:
-            raise ClientError("logit bias and log probs not supported")
+        if request.logit_bias is not None:
+            raise ClientError("logit bias is not supported")
+
+        # Logprobs are only supported for vLLM backend currently
+        if (
+            request.logprobs is not None
+            and request.logprobs > 0
+            and metadata.backend != "vllm"
+        ):
+            raise ClientError(
+                "logprobs are currently available only for the vLLM backend"
+            )
 
         if request.stream_options and not request.stream:
             raise ClientError("`stream_options` can only be used when `stream` is True")