From 215d598f37d4d13d8181205b29146f7ae43014a5 Mon Sep 17 00:00:00 2001
From: SzymonOzog
Date: Mon, 10 Feb 2025 08:22:25 +0000
Subject: [PATCH 1/2] Fix initializing GGUF weights when using tensor parallel

Signed-off-by: SzymonOzog
---
 vllm/model_executor/layers/linear.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index da8db08fe715..a648eed99231 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -335,6 +335,12 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         tp_rank = get_tensor_model_parallel_rank()
         output_dim = getattr(param, "output_dim", None)
 
+        is_sharded_weight = getattr(param, "is_sharded_weight", False)
+        use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+        # bitsandbytes loads the weights of the specific portion
+        # no need to narrow
+        is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit
+
         # Special case for GGUF
         is_gguf_weight = getattr(param, "is_gguf_weight", False)
         is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
@@ -343,13 +349,12 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
 
         # Materialize GGUF UninitializedParameter
         if is_gguf_weight and isinstance(param, UninitializedParameter):
-            param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype)
-
-        use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
-        is_sharded_weight = getattr(param, "is_sharded_weight", False)
-        # bitsandbytes loads the weights of the specific portion
-        # no need to narrow
-        is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit
+            final_shape = list(loaded_weight.shape)
+            if output_dim is not None and not is_sharded_weight:
+                tp_size = get_tensor_model_parallel_world_size()
+                assert final_shape[output_dim] % tp_size == 0
+                final_shape[output_dim] = final_shape[output_dim] // tp_size
+            param.materialize(final_shape, dtype=loaded_weight.dtype)
 
         param_data = param.data
         if output_dim is not None and not is_sharded_weight:

From a61e3ae4c3d535e51104b1142f091db16cf3807c Mon Sep 17 00:00:00 2001
From: SzymonOzog
Date: Mon, 10 Feb 2025 15:44:49 +0000
Subject: [PATCH 2/2] remove bnb specific param

Signed-off-by: SzymonOzog
---
 vllm/model_executor/layers/linear.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index a648eed99231..dad16112082c 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -350,7 +350,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         # Materialize GGUF UninitializedParameter
         if is_gguf_weight and isinstance(param, UninitializedParameter):
             final_shape = list(loaded_weight.shape)
-            if output_dim is not None and not is_sharded_weight:
+            if output_dim is not None:
                 tp_size = get_tensor_model_parallel_world_size()
                 assert final_shape[output_dim] % tp_size == 0
                 final_shape[output_dim] = final_shape[output_dim] // tp_size
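
Note (illustrative, not part of the patches): the fix materializes the GGUF
UninitializedParameter with the per-rank shard shape along output_dim instead
of the full checkpoint shape, so the later narrow of loaded_weight fits the
materialized parameter. A minimal sketch of the shape arithmetic, using
hypothetical values (full_shape, output_dim, tp_size are assumed examples,
not taken from the patch):

    # Sketch of the shard-shape computation the patch performs.
    full_shape = [4096, 11008]   # full checkpoint weight shape (assumed example)
    output_dim = 0               # dimension split across tensor-parallel ranks
    tp_size = 2                  # tensor-parallel world size (assumed example)

    final_shape = list(full_shape)
    # The full dimension must divide evenly across ranks, as the patch asserts.
    assert final_shape[output_dim] % tp_size == 0
    final_shape[output_dim] //= tp_size
    print(final_shape)           # [2048, 11008]: the per-rank shape that gets materialized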