@@ -103,7 +103,9 @@ async def generate(
         raise RuntimeError("Model not loaded. Call load_model() first.")

         max_tokens = max_tokens or self.config.max_tokens
-        temperature = temperature if temperature is not None else self.config.temperature
+        temperature = (
+            temperature if temperature is not None else self.config.temperature
+        )

         # Convert messages to prompt
         prompt = self._messages_to_prompt(messages)
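This hunk only rewraps the line to fit a formatter's line-length limit, but the expression it preserves is worth noting: a plain `or` fallback would discard an explicit `temperature=0.0`. A minimal sketch of the distinction, with a hypothetical `resolve_temperature` helper standing in for the inline expression and `0.7` standing in for `self.config.temperature`:

```python
# Hypothetical helper mirroring the `is not None` guard from the diff.
# A plain `temperature or default` would treat 0.0 (greedy decoding)
# as falsy and silently replace it with the configured default.
def resolve_temperature(temperature, default=0.7):
    return temperature if temperature is not None else default

assert resolve_temperature(0.0) == 0.0   # explicit 0.0 survives
assert resolve_temperature(None) == 0.7  # only None falls back to the config
```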
@@ -168,7 +170,9 @@ async def generate(
168170 "choices" : [
169171 {
170172 "index" : 0 ,
171- "delta" : {"content" : word + " " if i < len (words ) - 1 else word },
173+ "delta" : {
174+ "content" : word + " " if i < len (words ) - 1 else word
175+ },
172176 "logprobs" : None ,
173177 "finish_reason" : None ,
174178 }
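For context, the loop around this hunk emits one OpenAI-style `chat.completion.chunk` per word. A runnable sketch of that chunking scheme, assuming the same trailing-space rule (every word but the last carries a space, so the concatenated deltas reproduce the text); the id, timestamp, and model name are illustrative:

```python
import time

def word_chunks(text, model="llm-katan"):
    # Emit one chat.completion.chunk-shaped dict per word of `text`.
    words = text.split()
    for i, word in enumerate(words):
        yield {
            "id": "chatcmpl-0",  # placeholder id
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "delta": {"content": word + " " if i < len(words) - 1 else word},
                    "logprobs": None,
                    "finish_reason": None,
                }
            ],
        }

# Concatenated deltas reconstruct the original text exactly.
parts = [c["choices"][0]["delta"]["content"] for c in word_chunks("hello world")]
assert "".join(parts) == "hello world"
```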
@@ -184,7 +188,9 @@ async def generate(
184188 "created" : response_data ["created" ],
185189 "model" : self .config .served_model_name ,
186190 "system_fingerprint" : "llm-katan-transformers" ,
187- "choices" : [{"index" : 0 , "delta" : {}, "logprobs" : None , "finish_reason" : "stop" }],
191+ "choices" : [
192+ {"index" : 0 , "delta" : {}, "logprobs" : None , "finish_reason" : "stop" }
193+ ],
188194 "usage" : {
189195 "prompt_tokens" : prompt_tokens ,
190196 "completion_tokens" : completion_tokens ,
@@ -281,7 +287,9 @@ async def generate(
         from vllm.sampling_params import SamplingParams

         max_tokens = max_tokens or self.config.max_tokens
-        temperature = temperature if temperature is not None else self.config.temperature
+        temperature = (
+            temperature if temperature is not None else self.config.temperature
+        )

         # Convert messages to prompt
         prompt = self._messages_to_prompt(messages)
@@ -293,7 +301,9 @@ async def generate(

         # Generate
         loop = asyncio.get_event_loop()
-        outputs = await loop.run_in_executor(None, self.engine.generate, [prompt], sampling_params)
+        outputs = await loop.run_in_executor(
+            None, self.engine.generate, [prompt], sampling_params
+        )

         output = outputs[0]
         generated_text = output.outputs[0].text.strip()
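The rewrapped call is the standard pattern for keeping a blocking call off the event loop: vLLM's synchronous `generate` runs in the default thread-pool executor while the coroutine awaits the result. A self-contained sketch, with a stub standing in for the engine:

```python
import asyncio

def blocking_generate(prompts, sampling_params):
    # Stub standing in for vLLM's synchronous LLM.generate; a real call
    # would block this worker thread, not the event loop.
    return [f"generated for: {p}" for p in prompts]

async def main():
    loop = asyncio.get_event_loop()
    # run_in_executor(None, ...) uses the default ThreadPoolExecutor and
    # forwards the remaining positional arguments to the callable.
    outputs = await loop.run_in_executor(
        None, blocking_generate, ["Hello"], None
    )
    print(outputs[0])

asyncio.run(main())
```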
@@ -316,7 +326,8 @@ async def generate(
316326 "usage" : {
317327 "prompt_tokens" : len (output .prompt_token_ids ),
318328 "completion_tokens" : len (output .outputs [0 ].token_ids ),
319- "total_tokens" : len (output .prompt_token_ids ) + len (output .outputs [0 ].token_ids ),
329+ "total_tokens" : len (output .prompt_token_ids )
330+ + len (output .outputs [0 ].token_ids ),
320331 "prompt_tokens_details" : {"cached_tokens" : 0 },
321332 "completion_tokens_details" : {"reasoning_tokens" : 0 },
322333 },
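The split `total_tokens` line is plain arithmetic over vLLM's `RequestOutput` fields: the prompt and completion token-id lists are counted and summed. A tiny sketch with illustrative ids:

```python
prompt_token_ids = [101, 2023, 2003, 1037]  # illustrative token ids
completion_token_ids = [6254, 1012, 102]

usage = {
    "prompt_tokens": len(prompt_token_ids),
    "completion_tokens": len(completion_token_ids),
    "total_tokens": len(prompt_token_ids) + len(completion_token_ids),
}
assert usage["total_tokens"] == 7
```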
@@ -338,7 +349,9 @@ async def generate(
338349 "choices" : [
339350 {
340351 "index" : 0 ,
341- "delta" : {"content" : word + " " if i < len (words ) - 1 else word },
352+ "delta" : {
353+ "content" : word + " " if i < len (words ) - 1 else word
354+ },
342355 "logprobs" : None ,
343356 "finish_reason" : None ,
344357 }
@@ -354,11 +367,14 @@ async def generate(
354367 "created" : response_data ["created" ],
355368 "model" : self .config .served_model_name ,
356369 "system_fingerprint" : "llm-katan-vllm" ,
357- "choices" : [{"index" : 0 , "delta" : {}, "logprobs" : None , "finish_reason" : "stop" }],
370+ "choices" : [
371+ {"index" : 0 , "delta" : {}, "logprobs" : None , "finish_reason" : "stop" }
372+ ],
358373 "usage" : {
359374 "prompt_tokens" : len (output .prompt_token_ids ),
360375 "completion_tokens" : len (output .outputs [0 ].token_ids ),
361- "total_tokens" : len (output .prompt_token_ids ) + len (output .outputs [0 ].token_ids ),
376+ "total_tokens" : len (output .prompt_token_ids )
377+ + len (output .outputs [0 ].token_ids ),
362378 "prompt_tokens_details" : {"cached_tokens" : 0 },
363379 "completion_tokens_details" : {"reasoning_tokens" : 0 },
364380 },