@@ -11,14 +11,12 @@
     UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
-from vllm.model_executor.layers.quantization.fp8 import cutlass_fp8_supported
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     is_layer_skipped)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    apply_fp8_linear, maybe_create_device_identity,
-    normalize_e4m3fn_to_e4m3fnuz)
+    Fp8LinearOp, maybe_create_device_identity, normalize_e4m3fn_to_e4m3fnuz)
 from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
                                            ModelWeightParameter)
 from vllm.platforms import current_platform
@@ -37,6 +35,7 @@ def __init__(self, ignore_list: List[str], input_scale_ub: float):
         # For GPUs that lack FP8 hardware support, we can leverage the Marlin
         # kernel for fast weight-only FP8 quantization
         self.use_marlin = not current_platform.has_device_capability(89)
+        self.fp8_linear = Fp8LinearOp()
 
     @classmethod
     def get_name(cls) -> str:
@@ -73,7 +72,7 @@ class FBGEMMFp8LinearMethod(LinearMethodBase):
 
     def __init__(self, quant_config: FBGEMMFp8Config):
         self.quant_config = quant_config
-        self.cutlass_fp8_supported = cutlass_fp8_supported()
+        self.fp8_linear = Fp8LinearOp(use_per_token_if_dynamic=True)
 
     def create_weights(
         self,
@@ -159,12 +158,9 @@ def apply(self,
                 size_k=layer.input_size_per_partition,
                 bias=bias)
 
-        return apply_fp8_linear(
-            input=x,
-            weight=layer.weight,
-            weight_scale=layer.weight_scale,
-            input_scale=None,
-            input_scale_ub=layer.input_scale_ub,
-            bias=bias,
-            cutlass_fp8_supported=self.cutlass_fp8_supported,
-            use_per_token_if_dynamic=True)
+        return self.fp8_linear.apply(input=x,
+                                     weight=layer.weight,
+                                     weight_scale=layer.weight_scale,
+                                     input_scale=None,
+                                     input_scale_ub=layer.input_scale_ub,
+                                     bias=bias)
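
Usage note (not part of the diff): the refactor swaps the free function apply_fp8_linear, which took cutlass_fp8_supported and use_per_token_if_dynamic as per-call arguments, for an Fp8LinearOp instance that binds that configuration once at construction. A minimal sketch of the new call pattern, using only the constructor and apply() keywords visible in the hunks above; the layer object and its weight/weight_scale/input_scale_ub attributes stand in for a real FBGEMM-FP8 linear layer:

from typing import Optional

import torch

from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    Fp8LinearOp)

# Construct once, e.g. in FBGEMMFp8LinearMethod.__init__; dynamic per-token
# activation scaling is chosen here instead of being passed on every call.
fp8_linear = Fp8LinearOp(use_per_token_if_dynamic=True)

def forward(layer: torch.nn.Module, x: torch.Tensor,
            bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    # Same keyword arguments the diff passes in FBGEMMFp8LinearMethod.apply.
    # The explicit cutlass_fp8_supported argument disappears in this diff,
    # presumably because Fp8LinearOp detects CUTLASS support internally.
    return fp8_linear.apply(input=x,
                            weight=layer.weight,
                            weight_scale=layer.weight_scale,
                            input_scale=None,
                            input_scale_ub=layer.input_scale_ub,
                            bias=bias)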