"""

import os
+from typing import List, NamedTuple, Type

import pytest
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer

from tests.quantization.utils import is_quant_method_supported

+from ....conftest import VllmRunner
from ...utils import check_logprobs_close

os.environ["TOKENIZERS_PARALLELISM"] = "true"

MAX_MODEL_LEN = 1024


+class GGUFTestConfig(NamedTuple):
+    original_model: str
+    gguf_repo: str
+    gguf_filename: str
+
+    @property
+    def gguf_model(self):
+        return hf_hub_download(self.gguf_repo, filename=self.gguf_filename)
+
+
+LLAMA_CONFIG = GGUFTestConfig(
+    original_model="meta-llama/Llama-3.2-1B-Instruct",
+    gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
+    gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf",
+)
+
+QWEN2_CONFIG = GGUFTestConfig(
+    original_model="Qwen/Qwen2.5-1.5B-Instruct",
+    gguf_repo="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
+    gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf",
+)
+
+PHI3_CONFIG = GGUFTestConfig(
+    original_model="microsoft/Phi-3.5-mini-instruct",
+    gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF",
+    gguf_filename="Phi-3.5-mini-instruct-IQ4_XS.gguf",
+)
+
+GPT2_CONFIG = GGUFTestConfig(
+    original_model="openai-community/gpt2-large",
+    gguf_repo="QuantFactory/gpt2-large-GGUF",
+    gguf_filename="gpt2-large.Q4_K_M.gguf",
+)
+
+STABLELM_CONFIG = GGUFTestConfig(
+    original_model="stabilityai/stablelm-3b-4e1t",
+    gguf_repo="afrideva/stablelm-3b-4e1t-GGUF",
+    gguf_filename="stablelm-3b-4e1t.q4_k_m.gguf",
+)
+
+STARCODER_CONFIG = GGUFTestConfig(
+    original_model="bigcode/starcoder2-3b",
+    gguf_repo="QuantFactory/starcoder2-3b-GGUF",
+    gguf_filename="starcoder2-3b.Q6_K.gguf",
+)
+
+MODELS = [
+    LLAMA_CONFIG,
+    QWEN2_CONFIG,
+    PHI3_CONFIG,
+    GPT2_CONFIG,
+    STABLELM_CONFIG,
+    # STARCODER_CONFIG, # broken
+]
+
+
@pytest.mark.skipif(not is_quant_method_supported("gguf"),
                    reason="gguf is not supported on this GPU type.")
-@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [
-    ("meta-llama/Llama-3.2-1B-Instruct",
-     "bartowski/Llama-3.2-1B-Instruct-GGUF",
-     "Llama-3.2-1B-Instruct-Q4_K_M.gguf"),
-    ("meta-llama/Llama-3.2-1B-Instruct",
-     "bartowski/Llama-3.2-1B-Instruct-GGUF",
-     "Llama-3.2-1B-Instruct-IQ4_XS.gguf"),
-    ("Qwen/Qwen2-1.5B-Instruct", "Qwen/Qwen2-1.5B-Instruct-GGUF",
-     "qwen2-1_5b-instruct-q4_k_m.gguf"),
-    ("Qwen/Qwen2-1.5B-Instruct", "legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
-     "Qwen2-1.5B-Instruct.IQ4_XS.gguf"),
-])
+@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tp_size", [1, 2])
def test_models(
-    num_gpus_available,
-    vllm_runner,
-    example_prompts,
-    original_model,
-    gguf_id,
-    gguf_path,
+    num_gpus_available: int,
+    vllm_runner: Type[VllmRunner],
+    example_prompts: List[str],
+    model: GGUFTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
@@ -51,28 +96,26 @@ def test_models(
    if num_gpus_available < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

-    gguf_model = hf_hub_download(gguf_id, filename=gguf_path)
-
-    tokenizer = AutoTokenizer.from_pretrained(original_model)
-    messages = [[{
-        'role': 'user',
-        'content': prompt
-    }] for prompt in example_prompts]
-    example_prompts = tokenizer.apply_chat_template(messages,
-                                                    tokenize=False,
-                                                    add_generation_prompt=True)
+    tokenizer = AutoTokenizer.from_pretrained(model.original_model)
+    if tokenizer.chat_template is not None:
+        messages = [[{
+            'role': 'user',
+            'content': prompt
+        }] for prompt in example_prompts]
+        example_prompts = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True)

    # Run unquantized model.
-    with vllm_runner(model_name=original_model,
+    with vllm_runner(model_name=model.original_model,
                     dtype=dtype,
                     max_model_len=MAX_MODEL_LEN,
                     tensor_parallel_size=tp_size) as original_model:
-
        original_outputs = original_model.generate_greedy_logprobs(
            example_prompts[:-1], max_tokens, num_logprobs)

    # Run gguf model.
-    with vllm_runner(model_name=gguf_model,
+    with vllm_runner(model_name=model.gguf_model,
+                     tokenizer_name=model.original_model,
                     dtype=dtype,
                     max_model_len=MAX_MODEL_LEN,
                     tensor_parallel_size=tp_size) as gguf_model:
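
Outside the test harness, the same pattern the GGUF runner exercises can be reproduced directly with vLLM's `LLM` entry point. The sketch below is not part of this diff: it mirrors `LLAMA_CONFIG` above (GGUF weights from the bartowski repo, tokenizer from the original meta-llama repo, `max_model_len=1024` matching `MAX_MODEL_LEN`), while the prompt and sampling settings are illustrative assumptions.

```python
from huggingface_hub import hf_hub_download
from vllm import LLM, SamplingParams

# Mirrors LLAMA_CONFIG above: download (or reuse the cached) GGUF file and
# get its local path.
gguf_path = hf_hub_download(
    "bartowski/Llama-3.2-1B-Instruct-GGUF",
    filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf",
)

# Load the GGUF checkpoint, borrowing the tokenizer from the original
# (unquantized) model repo, as the test does via tokenizer_name.
llm = LLM(
    model=gguf_path,
    tokenizer="meta-llama/Llama-3.2-1B-Instruct",
    max_model_len=1024,
)

outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```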