launch_scripts/launch_vllm (12 changes: 6 additions & 6 deletions)
@@ -15,7 +15,7 @@ from types import SimpleNamespace
 import subprocess
 import threading
 from dataclasses import dataclass
-from vllm import ServerArgs, LLMServer, SamplingParams
+from vllm import EngineArgs, LLMEngine, SamplingParams
 import os
 os.environ["TRANSFORMERS_CACHE"] = '/data/cache'

@@ -88,7 +88,7 @@ class ModelThread:

         needs_call_progress = False
         for vllm_output in vllm_outputs:
-            if not vllm_output.finished():
+            if not vllm_output.finished:
                 continue

             needs_call_progress = True
@@ -111,8 +111,8 @@ class ModelThread:
     @staticmethod
     def init_model(vllm_args):
         print('Init model')
-        server_args = ServerArgs.from_cli_args(vllm_args)
-        server = LLMServer.from_server_args(server_args)
+        server_args = EngineArgs.from_cli_args(vllm_args)
+        server = LLMEngine.from_engine_args(server_args)
         print('Model ready')
         return server

@@ -225,10 +225,10 @@ async def is_ready(request: Request):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument('--port', type=int, required=True)
-    ServerArgs.add_cli_args(parser)
+    EngineArgs.add_cli_args(parser)
     args = parser.parse_args()

-    vllm_args = ServerArgs.from_cli_args(args)
+    vllm_args = EngineArgs.from_cli_args(args)

     loop = asyncio.new_event_loop()
     server = FastAPIServer(loop, vllm_args)
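Taken together, the launch_vllm changes track vLLM's rename from the server-style entry points (ServerArgs, LLMServer) to the engine-style ones (EngineArgs, LLMEngine), plus the switch from a finished() call to a finished attribute on request outputs. Below is a minimal standalone sketch of the same pattern, assuming a vLLM version that exposes EngineArgs, LLMEngine, and a boolean RequestOutput.finished; the model name, prompt, and request id are illustrative and not taken from this PR.

import argparse

from vllm import EngineArgs, LLMEngine, SamplingParams

parser = argparse.ArgumentParser()
EngineArgs.add_cli_args(parser)                       # was ServerArgs.add_cli_args
args = parser.parse_args(["--model", "facebook/opt-125m"])   # illustrative model

engine_args = EngineArgs.from_cli_args(args)          # was ServerArgs.from_cli_args
engine = LLMEngine.from_engine_args(engine_args)      # was LLMServer.from_server_args

engine.add_request("req-0", "Hello, my name is", SamplingParams(max_tokens=16))

while engine.has_unfinished_requests():
    for output in engine.step():                      # step() returns RequestOutput objects
        if output.finished:                           # attribute now, not a method call
            print(output.outputs[0].text)

The PR's launch script wraps these same calls in a FastAPIServer and a ModelThread; the sketch only isolates the renamed entry points.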
launch_scripts/launch_vllm_ray_serve (10 changes: 6 additions & 4 deletions)
@@ -51,7 +51,7 @@ class VLLMPredictDeployment:
                 yield (json.dumps(ret) + "\0").encode("utf-8")

         async def abort_request() -> None:
-            await engine.abort(request_id)
+            await self.engine.abort(request_id)

         if stream:
             background_tasks = BackgroundTasks()
@@ -64,7 +64,7 @@ class VLLMPredictDeployment:
         async for request_output in results_generator:
             if await request.is_disconnected():
                 # Abort the request if the client disconnects.
-                await engine.abort(request_id)
+                await self.engine.abort(request_id)
                 return Response(status_code=499)
             final_output = request_output

@@ -93,8 +93,10 @@ if __name__ == "__main__":
     parser.add_argument('--port', type=int, required=True)
     args = parser.parse_args()

-    model = 'facebook/opt-13b'
+    model = 'facebook/opt-125m' #'facebook/opt-13b'
     deployment = VLLMPredictDeployment.bind(
-        model=model, max_num_batched_tokens=8100, use_np_weights=True)
+        model=model, max_num_batched_tokens=8100)
     serve.run(deployment, port=args.port)
     send_request()
+    while True:
+        pass
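In launch_vllm_ray_serve the functional fix is scoping: inside the deployment's request handler there is no bare engine name, so aborts have to go through the engine held on the instance as self.engine. The remaining changes swap in the small facebook/opt-125m model, drop the use_np_weights flag from the bind call, and keep the process alive after serve.run. A rough sketch of the deployment shape this implies, assuming an AsyncLLMEngine stored on self in __init__ and the bind kwargs forwarded to AsyncEngineArgs; everything beyond the identifiers visible in the diff is illustrative.

import uuid

from fastapi import Request
from fastapi.responses import Response
from ray import serve
from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine


@serve.deployment
class VLLMPredictDeployment:
    def __init__(self, **kwargs):
        # kwargs such as model=... and max_num_batched_tokens=... come from
        # VLLMPredictDeployment.bind(...) and are forwarded to the engine args.
        self.engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**kwargs))

    async def __call__(self, request: Request) -> Response:
        body = await request.json()
        request_id = str(uuid.uuid4())
        results_generator = self.engine.generate(
            body["prompt"], SamplingParams(), request_id)

        final_output = None
        async for request_output in results_generator:
            if await request.is_disconnected():
                # The fix from the diff: abort via the engine on the instance,
                # not a free-standing `engine` variable.
                await self.engine.abort(request_id)
                return Response(status_code=499)
            final_output = request_output
        return Response(content=final_output.outputs[0].text)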