-
-
Notifications
You must be signed in to change notification settings - Fork 10.3k
Closed
Description
Hello,
I've installed vLLM from source with CUDA 12.1 and it is failing to instantiate the Mixtral model (I've tested both official models released by Mistral: mistralai/Mixtral-8x7B-Instruct-v0.1
and mistralai/Mixtral-8x7B-v0.1).
The hardware used to run it:
- 2× A100 80GB GPUs
The code I'm using to run it:
from vllm import LLM
llm = LLM(model="mistralai/Mixtral-8x7B-v0.1", tensor_parallel_size=2)
Error stack trace:
RayTaskError(KeyError) Traceback (most recent call last)
Cell In[2], line 1
----> 1 llm = LLM(model="mistralai/Mixtral-8x7B-v0.1", tensor_parallel_size=2)
File /opt/conda/lib/python3.11/site-packages/vllm/entrypoints/llm.py:93, in LLM.__init__(self, model, tokenizer, tokenizer_mode, trust_remote_code, tensor_parallel_size, dtype, quantization, revision, tokenizer_revision, seed, gpu_memory_utilization, swap_space, **kwargs)
77 kwargs["disable_log_stats"] = True
78 engine_args = EngineArgs(
79 model=model,
80 tokenizer=tokenizer,
(...)
91 **kwargs,
92 )
---> 93 self.llm_engine = LLMEngine.from_engine_args(engine_args)
94 self.request_counter = Counter()
File /opt/conda/lib/python3.11/site-packages/vllm/engine/llm_engine.py:246, in LLMEngine.from_engine_args(cls, engine_args)
243 distributed_init_method, placement_group = initialize_cluster(
244 parallel_config)
245 # Create the LLM engine.
--> 246 engine = cls(*engine_configs,
247 distributed_init_method,
248 placement_group,
249 log_stats=not engine_args.disable_log_stats)
250 return engine
File /opt/conda/lib/python3.11/site-packages/vllm/engine/llm_engine.py:107, in LLMEngine.__init__(self, model_config, cache_config, parallel_config, scheduler_config, distributed_init_method, placement_group, log_stats)
105 # Create the parallel GPU workers.
106 if self.parallel_config.worker_use_ray:
--> 107 self._init_workers_ray(placement_group)
108 else:
109 self._init_workers(distributed_init_method)
File /opt/conda/lib/python3.11/site-packages/vllm/engine/llm_engine.py:194, in LLMEngine._init_workers_ray(self, placement_group, **ray_remote_kwargs)
181 self._run_workers("init_worker",
182 get_all_outputs=True,
183 worker_init_fn=lambda: Worker(
(...)
188 None,
189 ))
190 self._run_workers(
191 "init_model",
192 get_all_outputs=True,
193 )
--> 194 self._run_workers(
195 "load_model",
196 get_all_outputs=True,
197 max_concurrent_workers=self.parallel_config.
198 max_parallel_loading_workers,
199 )
File /opt/conda/lib/python3.11/site-packages/vllm/engine/llm_engine.py:750, in LLMEngine._run_workers(self, method, get_all_outputs, max_concurrent_workers, *args, **kwargs)
746 work_groups = [self.workers]
748 for workers in work_groups:
749 all_outputs.extend(
--> 750 self._run_workers_in_batch(workers, method, *args, **kwargs))
752 if get_all_outputs:
753 return all_outputs
File /opt/conda/lib/python3.11/site-packages/vllm/engine/llm_engine.py:727, in LLMEngine._run_workers_in_batch(self, workers, method, *args, **kwargs)
725 all_outputs.append(output)
726 if self.parallel_config.worker_use_ray:
--> 727 all_outputs = ray.get(all_outputs)
728 return all_outputs
File /opt/conda/lib/python3.11/site-packages/ray/_private/auto_init_hook.py:24, in wrap_auto_init.<locals>.auto_init_wrapper(*args, **kwargs)
21 @wraps(fn)
22 def auto_init_wrapper(*args, **kwargs):
23 auto_init_ray()
---> 24 return fn(*args, **kwargs)
File /opt/conda/lib/python3.11/site-packages/ray/_private/client_mode_hook.py:103, in client_mode_hook.<locals>.wrapper(*args, **kwargs)
101 if func.__name__ != "init" or is_client_mode_enabled_by_default:
102 return getattr(ray, func.__name__)(*args, **kwargs)
--> 103 return func(*args, **kwargs)
File /opt/conda/lib/python3.11/site-packages/ray/_private/worker.py:2563, in get(object_refs, timeout)
2561 worker.core_worker.dump_object_store_memory_usage()
2562 if isinstance(value, RayTaskError):
-> 2563 raise value.as_instanceof_cause()
2564 else:
2565 raise value
RayTaskError(KeyError): ray::RayWorkerVllm.execute_method() (pid=15602, ip=10.244.5.3, actor_id=9c42bf8131d1e3c5560cb67b01000000, repr=<vllm.engine.ray_utils.RayWorkerVllm object at 0x7f9d12281490>)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/vllm/engine/ray_utils.py", line 32, in execute_method
return executor(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/vllm/worker/worker.py", line 72, in load_model
self.model_runner.load_model()
File "/opt/conda/lib/python3.11/site-packages/vllm/worker/model_runner.py", line 36, in load_model
self.model = get_model(self.model_config)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/vllm/model_executor/model_loader.py", line 124, in get_model
model.load_weights(model_config.model, model_config.download_dir,
File "/opt/conda/lib/python3.11/site-packages/vllm/model_executor/models/mixtral.py", line 531, in load_weights
param = params_dict[name]
~~~~~~~~~~~^^^^^^
KeyError: 'model.layers.10.block_sparse_moe.experts.0.w1.weight'
2023-12-11 13:46:58,239 ERROR worker.py:406 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::RayWorkerVllm.execute_method() (pid=15601, ip=10.244.5.3, actor_id=04b9e56fb479563b9fee588101000000, repr=<vllm.engine.ray_utils.RayWorkerVllm object at 0x7fcdf1147b10>)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/vllm/engine/ray_utils.py", line 32, in execute_method
return executor(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/vllm/worker/worker.py", line 72, in load_model
self.model_runner.load_model()
File "/opt/conda/lib/python3.11/site-packages/vllm/worker/model_runner.py", line 36, in load_model
self.model = get_model(self.model_config)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/vllm/model_executor/model_loader.py", line 124, in get_model
model.load_weights(model_config.model, model_config.download_dir,
File "/opt/conda/lib/python3.11/site-packages/vllm/model_executor/models/mixtral.py", line 531, in load_weights
param = params_dict[name]
~~~~~~~~~~~^^^^^^
KeyError: 'model.layers.10.block_sparse_moe.experts.0.w1.weight'
Thanks
Metadata
Assignees
Labels
No labels