Mixtral - KeyError: 'model.layers.10.block_sparse_moe.experts.0.w1.weight' #2020

@MarcosRiveraMartinez

Description

Hello,

I've installed vLLM from source with CUDA 12.1, and it fails to instantiate the Mixtral model (I've tested both official models released by Mistral: mistralai/Mixtral-8x7B-Instruct-v0.1 and mistralai/Mixtral-8x7B-v0.1).
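
To rule out an older wheel shadowing the source build, the import itself can be checked first (vllm exposes a standard __version__ attribute):

import vllm

print(vllm.__version__)  # version of the build actually being imported
print(vllm.__file__)     # install path, to confirm it is the source build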

The hardware I'm running it on:

  • 2× A100 80 GB GPUs

The code I'm using to run it:

from vllm import LLM

llm = LLM(model="mistralai/Mixtral-8x7B-v0.1", tensor_parallel_size=2)
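
As a sanity check on the checkpoint side, the tensor names in the published shards can be listed directly. The sketch below is not from the original report; it assumes huggingface_hub and safetensors are installed and that the repo ships safetensors shards:

from huggingface_hub import hf_hub_download, list_repo_files
from safetensors import safe_open

repo = "mistralai/Mixtral-8x7B-v0.1"
# Pick the first safetensors shard and list its per-expert MoE tensors.
shard = next(f for f in list_repo_files(repo) if f.endswith(".safetensors"))
with safe_open(hf_hub_download(repo, shard), framework="pt") as f:
    for name in f.keys():
        if "block_sparse_moe.experts.0" in name:
            print(name)  # e.g. model.layers.0.block_sparse_moe.experts.0.w1.weight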

Error stack trace:

RayTaskError(KeyError)                    Traceback (most recent call last)
Cell In[2], line 1
----> 1 llm = LLM(model="mistralai/Mixtral-8x7B-v0.1", tensor_parallel_size=2)

File /opt/conda/lib/python3.11/site-packages/vllm/entrypoints/llm.py:93, in LLM.__init__(self, model, tokenizer, tokenizer_mode, trust_remote_code, tensor_parallel_size, dtype, quantization, revision, tokenizer_revision, seed, gpu_memory_utilization, swap_space, **kwargs)
     77     kwargs["disable_log_stats"] = True
     78 engine_args = EngineArgs(
     79     model=model,
     80     tokenizer=tokenizer,
   (...)
     91     **kwargs,
     92 )
---> 93 self.llm_engine = LLMEngine.from_engine_args(engine_args)
     94 self.request_counter = Counter()

File /opt/conda/lib/python3.11/site-packages/vllm/engine/llm_engine.py:246, in LLMEngine.from_engine_args(cls, engine_args)
    243 distributed_init_method, placement_group = initialize_cluster(
    244     parallel_config)
    245 # Create the LLM engine.
--> 246 engine = cls(*engine_configs,
    247              distributed_init_method,
    248              placement_group,
    249              log_stats=not engine_args.disable_log_stats)
    250 return engine

File /opt/conda/lib/python3.11/site-packages/vllm/engine/llm_engine.py:107, in LLMEngine.__init__(self, model_config, cache_config, parallel_config, scheduler_config, distributed_init_method, placement_group, log_stats)
    105 # Create the parallel GPU workers.
    106 if self.parallel_config.worker_use_ray:
--> 107     self._init_workers_ray(placement_group)
    108 else:
    109     self._init_workers(distributed_init_method)

File /opt/conda/lib/python3.11/site-packages/vllm/engine/llm_engine.py:194, in LLMEngine._init_workers_ray(self, placement_group, **ray_remote_kwargs)
    181 self._run_workers("init_worker",
    182                   get_all_outputs=True,
    183                   worker_init_fn=lambda: Worker(
   (...)
    188                       None,
    189                   ))
    190 self._run_workers(
    191     "init_model",
    192     get_all_outputs=True,
    193 )
--> 194 self._run_workers(
    195     "load_model",
    196     get_all_outputs=True,
    197     max_concurrent_workers=self.parallel_config.
    198     max_parallel_loading_workers,
    199 )

File /opt/conda/lib/python3.11/site-packages/vllm/engine/llm_engine.py:750, in LLMEngine._run_workers(self, method, get_all_outputs, max_concurrent_workers, *args, **kwargs)
    746     work_groups = [self.workers]
    748 for workers in work_groups:
    749     all_outputs.extend(
--> 750         self._run_workers_in_batch(workers, method, *args, **kwargs))
    752 if get_all_outputs:
    753     return all_outputs

File /opt/conda/lib/python3.11/site-packages/vllm/engine/llm_engine.py:727, in LLMEngine._run_workers_in_batch(self, workers, method, *args, **kwargs)
    725     all_outputs.append(output)
    726 if self.parallel_config.worker_use_ray:
--> 727     all_outputs = ray.get(all_outputs)
    728 return all_outputs

File /opt/conda/lib/python3.11/site-packages/ray/_private/auto_init_hook.py:24, in wrap_auto_init.<locals>.auto_init_wrapper(*args, **kwargs)
     21 @wraps(fn)
     22 def auto_init_wrapper(*args, **kwargs):
     23     auto_init_ray()
---> 24     return fn(*args, **kwargs)

File /opt/conda/lib/python3.11/site-packages/ray/_private/client_mode_hook.py:103, in client_mode_hook.<locals>.wrapper(*args, **kwargs)
    101     if func.__name__ != "init" or is_client_mode_enabled_by_default:
    102         return getattr(ray, func.__name__)(*args, **kwargs)
--> 103 return func(*args, **kwargs)

File /opt/conda/lib/python3.11/site-packages/ray/_private/worker.py:2563, in get(object_refs, timeout)
   2561     worker.core_worker.dump_object_store_memory_usage()
   2562 if isinstance(value, RayTaskError):
-> 2563     raise value.as_instanceof_cause()
   2564 else:
   2565     raise value

RayTaskError(KeyError): ray::RayWorkerVllm.execute_method() (pid=15602, ip=10.244.5.3, actor_id=9c42bf8131d1e3c5560cb67b01000000, repr=<vllm.engine.ray_utils.RayWorkerVllm object at 0x7f9d12281490>)
  File "/opt/conda/lib/python3.11/site-packages/vllm/engine/ray_utils.py", line 32, in execute_method
    return executor(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/vllm/worker/worker.py", line 72, in load_model
    self.model_runner.load_model()
  File "/opt/conda/lib/python3.11/site-packages/vllm/worker/model_runner.py", line 36, in load_model
    self.model = get_model(self.model_config)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/vllm/model_executor/model_loader.py", line 124, in get_model
    model.load_weights(model_config.model, model_config.download_dir,
  File "/opt/conda/lib/python3.11/site-packages/vllm/model_executor/models/mixtral.py", line 531, in load_weights
    param = params_dict[name]
            ~~~~~~~~~~~^^^^^^
KeyError: 'model.layers.10.block_sparse_moe.experts.0.w1.weight'
2023-12-11 13:46:58,239	ERROR worker.py:406 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::RayWorkerVllm.execute_method() (pid=15601, ip=10.244.5.3, actor_id=04b9e56fb479563b9fee588101000000, repr=<vllm.engine.ray_utils.RayWorkerVllm object at 0x7fcdf1147b10>)
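
For context, the KeyError comes from the fallback lookup in load_weights: each checkpoint tensor name must either match a remapping rule or exist verbatim in the model's params_dict, and here the per-expert name does not. A minimal sketch of that loading pattern (hypothetical code, not vLLM's actual mixtral.py):

import torch

def load_weights(model: torch.nn.Module, checkpoint: dict[str, torch.Tensor]):
    # Sketch of a name-based checkpoint loader (hypothetical, not vLLM's
    # actual implementation).
    params_dict = dict(model.named_parameters())
    for name, tensor in checkpoint.items():
        # If the model fuses per-expert weights into a single parameter,
        # per-expert checkpoint names such as
        # 'model.layers.10.block_sparse_moe.experts.0.w1.weight'
        # need a remapping rule; without one, the verbatim lookup below
        # raises exactly the KeyError reported above.
        param = params_dict[name]
        param.data.copy_(tensor)

The trace therefore suggests the installed vLLM build and the published checkpoint disagree on the expert-weight naming/layout.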

Thanks
