|
68 | 68 | from vllm.logger import init_logger
|
69 | 69 | from vllm.usage.usage_lib import UsageContext
|
70 | 70 | from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path,
|
71 |
| - is_valid_ipv6_address, kill_process_tree, set_ulimit) |
| 71 | + is_valid_ipv6_address, set_ulimit) |
72 | 72 | from vllm.version import __version__ as VLLM_VERSION
|
73 | 73 |
|
74 | 74 | TIMEOUT_KEEP_ALIVE = 5 # seconds
|
@@ -133,32 +133,21 @@ async def build_async_engine_client_from_engine_args(
|
133 | 133 | Returns the Client or None if the creation failed.
|
134 | 134 | """
|
135 | 135 |
|
136 |
| - # Fall back |
137 |
| - # TODO: fill out feature matrix. |
| 136 | + # AsyncLLMEngine. |
138 | 137 | if (MQLLMEngineClient.is_unsupported_config(engine_args)
|
139 | 138 | or envs.VLLM_USE_V1 or disable_frontend_multiprocessing):
|
140 |
| - engine_config = engine_args.create_engine_config( |
141 |
| - UsageContext.OPENAI_API_SERVER) |
142 |
| - uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config), |
143 |
| - "uses_ray", False) |
144 |
| - |
145 |
| - build_engine = partial(AsyncLLMEngine.from_engine_args, |
146 |
| - engine_args=engine_args, |
147 |
| - engine_config=engine_config, |
148 |
| - usage_context=UsageContext.OPENAI_API_SERVER) |
149 |
| - if uses_ray: |
150 |
| - # Must run in main thread with ray for its signal handlers to work |
151 |
| - engine_client = build_engine() |
152 |
| - else: |
153 |
| - engine_client = await asyncio.get_running_loop().run_in_executor( |
154 |
| - None, build_engine) |
155 | 139 |
|
156 |
| - yield engine_client |
157 |
| - if hasattr(engine_client, "shutdown"): |
158 |
| - engine_client.shutdown() |
159 |
| - return |
| 140 | + engine_client: Optional[EngineClient] = None |
| 141 | + try: |
| 142 | + engine_client = AsyncLLMEngine.from_engine_args( |
| 143 | + engine_args=engine_args, |
| 144 | + usage_context=UsageContext.OPENAI_API_SERVER) |
| 145 | + yield engine_client |
| 146 | + finally: |
| 147 | + if engine_client and hasattr(engine_client, "shutdown"): |
| 148 | + engine_client.shutdown() |
160 | 149 |
|
161 |
| - # Otherwise, use the multiprocessing AsyncLLMEngine. |
| 150 | + # MQLLMEngine. |
162 | 151 | else:
|
163 | 152 | if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
|
164 | 153 | # Make TemporaryDirectory for prometheus multiprocessing
|
@@ -737,15 +726,6 @@ def signal_handler(*_) -> None:
|
737 | 726 |
|
738 | 727 | signal.signal(signal.SIGTERM, signal_handler)
|
739 | 728 |
|
740 |
| - # The child processes will send SIGQUIT to this process when |
741 |
| - # any error happens. This process then clean up the whole tree. |
742 |
| - # TODO(rob): move this into AsyncLLM.__init__ once we remove |
743 |
| - # the context manager below. |
744 |
| - def sigquit_handler(signum, frame): |
745 |
| - kill_process_tree(os.getpid()) |
746 |
| - |
747 |
| - signal.signal(signal.SIGQUIT, sigquit_handler) |
748 |
| - |
749 | 729 | async with build_async_engine_client(args) as engine_client:
|
750 | 730 | app = build_app(args)
|
751 | 731 |
|
|
0 commit comments