|
25 | 25 | ChatCompletionRequest, ChatCompletionResponse, |
26 | 26 | ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, |
27 | 27 | ChatCompletionStreamResponse, ChatMessage, CompletionRequest, |
28 | | - CompletionResponse, CompletionResponseChoice, DeltaMessage, ErrorResponse, |
29 | | - ModelCard, ModelList, ModelPermission, UsageInfo) |
| 28 | + CompletionResponse, CompletionResponseChoice, |
| 29 | + CompletionResponseStreamChoice, CompletionStreamResponse, DeltaMessage, |
| 30 | + ErrorResponse, ModelCard, ModelList, ModelPermission, UsageInfo) |
30 | 31 | from vllm.entrypoints.openai.serving_chat import (ChatMessageParseResult, |
31 | 32 | ConversationMessage) |
32 | 33 | from vllm.logger import init_logger |
@@ -174,18 +175,18 @@ async def completion_generator(model, result_queue, choices, created_time, |
174 | 175 | request_id, token, stats = await result_queue.get() |
175 | 176 |
|
176 | 177 | choice_idx = choices[request_id] |
177 | | - res = CompletionResponse(id=request_id, |
178 | | - created=created_time, |
179 | | - model=model, |
180 | | - choices=[ |
181 | | - CompletionResponseChoice( |
182 | | - index=choice_idx, |
183 | | - text=token, |
184 | | - logprobs=None, |
185 | | - finish_reason=None, |
186 | | - stop_reason=None) |
187 | | - ], |
188 | | - usage=None) |
| 178 | + res = CompletionStreamResponse(id=request_id, |
| 179 | + created=created_time, |
| 180 | + model=model, |
| 181 | + choices=[ |
| 182 | + CompletionResponseStreamChoice( |
| 183 | + index=choice_idx, |
| 184 | + text=token, |
| 185 | + logprobs=None, |
| 186 | + finish_reason=None, |
| 187 | + stop_reason=None) |
| 188 | + ], |
| 189 | + usage=None) |
189 | 190 | if stats is not None: |
190 | 191 | res.usage = UsageInfo() |
191 | 192 | res.usage.completion_tokens = stats.get("tokens", 0) |
|
0 commit comments