
Commit aa1e77a

[Hardware][CPU] Support MOE models on x86 CPU (#11831)
Signed-off-by: jiang1.li <[email protected]>
1 parent: 5959564

3 files changed: +43, -4 lines

docs/source/getting_started/installation/cpu-x86.md

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:
 
 - Tensor Parallel
-- Model Quantization (`INT8 W8A8, AWQ`)
+- Model Quantization (`INT8 W8A8, AWQ, GPTQ`)
 - Chunked-prefill
 - Prefix-caching
 - FP8-E5M2 KV-Caching (TODO)
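
With this change, an MoE model runs through the usual vLLM entry points on an x86 CPU build. Below is a minimal offline-inference sketch, assuming a CPU build of vLLM and enough host memory for the model; the model name is the one used in the new test below, and `VLLM_CPU_KVCACHE_SPACE` is the CPU backend's KV-cache size setting in GiB:

```python
# Minimal sketch: offline inference of an MoE model on the x86 CPU backend.
# Assumes vLLM was built for CPU and the model fits in host memory.
import os
os.environ.setdefault("VLLM_CPU_KVCACHE_SPACE", "8")  # KV cache budget in GiB

from vllm import LLM, SamplingParams

llm = LLM(model="ehristoforu/Falcon3-MoE-2x7B-Insruct", dtype="bfloat16")
outputs = llm.generate(["The capital of France is"],
                       SamplingParams(max_tokens=32, temperature=0.0))
print(outputs[0].outputs[0].text)
```

The `bfloat16` dtype matches the BF16 support listed for the CPU backend above.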

tests/models/decoder_only/language/test_models.py

Lines changed: 4 additions & 0 deletions
@@ -48,6 +48,10 @@
     ),
     pytest.param("stabilityai/stablelm-3b-4e1t"),  # stablelm
     pytest.param("bigcode/starcoder2-3b"),  # starcoder2
+    pytest.param(
+        "ehristoforu/Falcon3-MoE-2x7B-Insruct",  # mixtral
+        marks=[pytest.mark.cpu_model],
+    )
 ])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 38 additions & 3 deletions
@@ -13,6 +13,7 @@
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
+from vllm.platforms.interface import CpuArchEnum
 
 if current_platform.is_cuda_alike():
     from .fused_moe import fused_experts
@@ -83,6 +84,20 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
         layer.register_parameter("w2_weight", w2_weight)
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        super().process_weights_after_loading(layer)
+
+        if current_platform.is_cpu():
+            if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
+                import intel_extension_for_pytorch as ipex
+                layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
+                    layer.w13_weight,
+                    layer.w2_weight,
+                    use_prepack=True,
+                )
+            else:
+                raise NotImplementedError("CPU MOE only supports x86 arch.")
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -142,9 +157,29 @@ def forward_cuda(
                             topk_ids=topk_ids,
                             inplace=True)
 
-    def forward_cpu(self, *args, **kwargs):
-        raise NotImplementedError(
-            "The CPU backend currently does not support MoE.")
+    def forward_cpu(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        use_grouped_topk: bool,
+        top_k: int,
+        router_logits: torch.Tensor,
+        renormalize: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        **kwargs,
+    ):
+        assert custom_routing_function is None
+        return layer.ipex_fusion(
+            x,
+            use_grouped_topk,
+            top_k,
+            router_logits,
+            renormalize,
+            topk_group,
+            num_expert_group,
+        )
 
     def forward_tpu(
         self,
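
For reference, here is a rough, unfused sketch of what the IPEX `GatedMLPMOE` call is assumed to compute: softmax routing, top-k expert selection with optional renormalization, and a SiLU-gated MLP per selected expert. The weight layouts in the comments follow vLLM's fused-MoE convention and are assumptions; grouped top-k is omitted, and this is not the kernel IPEX actually runs:

```python
import torch
import torch.nn.functional as F

def naive_gated_mlp_moe(x: torch.Tensor,            # [num_tokens, hidden]
                        w13: torch.Tensor,           # [num_experts, 2 * intermediate, hidden]
                        w2: torch.Tensor,            # [num_experts, hidden, intermediate]
                        router_logits: torch.Tensor, # [num_tokens, num_experts]
                        top_k: int,
                        renormalize: bool = True) -> torch.Tensor:
    # Softmax routing + top-k expert selection (grouped top-k omitted for brevity).
    scores = F.softmax(router_logits, dim=-1)
    topk_weights, topk_ids = torch.topk(scores, top_k, dim=-1)
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

    out = torch.zeros_like(x)
    for expert in range(w13.shape[0]):
        token_idx, slot = (topk_ids == expert).nonzero(as_tuple=True)
        if token_idx.numel() == 0:
            continue
        # Gated MLP: silu(gate) * up, then down-projection through w2.
        gate, up = (x[token_idx] @ w13[expert].t()).chunk(2, dim=-1)
        expert_out = (F.silu(gate) * up) @ w2[expert].t()
        out.index_add_(0, token_idx,
                       expert_out * topk_weights[token_idx, slot].unsqueeze(-1))
    return out
```

The commit avoids this per-expert Python loop by handing the weights to `ipex.llm.modules.GatedMLPMOE` once in `process_weights_after_loading` (passing `use_prepack=True`) and calling the fused op from `forward_cpu`.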
