
Commit 1b34b74

[llm_bench] Add support for running the tool with models in GGUF format
1 parent a1e5e18 commit 1b34b74

6 files changed: 81 additions, 55 deletions

tests/python_tests/samples/test_tools_llm_benchmark.py

Lines changed: 20 additions & 1 deletion
@@ -5,8 +5,10 @@
 import pytest
 import sys
 
-from conftest import SAMPLES_PY_DIR, convert_model, download_test_content
 from test_utils import run_sample
+from data.models import get_gguf_model_list
+from utils.hugging_face import download_gguf_model
+from conftest import SAMPLES_PY_DIR, convert_model, download_test_content
 from utils.hugging_face import download_and_convert_embeddings_models, download_and_convert_model
 
 convert_draft_model = convert_model
@@ -286,3 +288,20 @@ def test_python_tool_llm_benchmark_text_reranking_qwen3(self, model_id, sample_args):
             "-m", models_path,
         ] + sample_args
         run_sample(benchmark_py_command)
+
+
+    @pytest.mark.samples
+    @pytest.mark.parametrize("sample_args", [
+        ["-d", "cpu", "-n", "1"],
+        ["-d", "cpu", "-n", "1", "-f", "pt"],
+    ])
+    def test_python_tool_llm_benchmark_gguf_format(self, sample_args):
+        benchmark_script = os.path.join(SAMPLES_PY_DIR, 'llm_bench/benchmark.py')
+        gguf_model = get_gguf_model_list()[0]
+        gguf_full_path = download_gguf_model(gguf_model["gguf_model_id"], gguf_model["gguf_filename"])
+        benchmark_py_command = [
+            sys.executable,
+            benchmark_script,
+            "-m", gguf_full_path,
+        ] + sample_args
+        run_sample(benchmark_py_command)
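
The new test depends on two repository helpers, get_gguf_model_list (data/models.py) and download_gguf_model (utils/hugging_face.py), whose bodies are not part of this diff. A minimal sketch of what the download step could look like, assuming it simply wraps huggingface_hub.hf_hub_download (the helper name and arguments are taken from the test above; the implementation below is an assumption):

    from huggingface_hub import hf_hub_download

    def download_gguf_model(gguf_model_id: str, gguf_filename: str) -> str:
        # Assumed implementation: fetch a single .gguf file from the Hugging Face Hub
        # and return the local path that can be passed to benchmark.py via -m.
        return hf_hub_download(repo_id=gguf_model_id, filename=gguf_filename)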

tools/llm_bench/benchmark.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ def num_infer_count_type(x):
 
 def get_argprser():
     parser = argparse.ArgumentParser('LLM benchmarking tool', add_help=True, formatter_class=argparse.RawTextHelpFormatter)
-    parser.add_argument('-m', '--model', help='model folder including IR files or Pytorch files', required=TabError)
+    parser.add_argument('-m', '--model', help='model folder including IR files or Pytorch files or path to GGUF model', required=TabError)
     parser.add_argument('-d', '--device', default='cpu', help='inference device')
     parser.add_argument('-r', '--report', help='report csv')
    parser.add_argument('-rj', '--report_json', help='report json')
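
With the extended help text, -m now accepts a path to a single GGUF file in addition to a model folder. An example invocation mirroring the new test (the model path is a placeholder):

    # default OpenVINO GenAI path
    python benchmark.py -m /path/to/model.gguf -d cpu -n 1
    # PyTorch path: -f pt loads the GGUF file through transformers
    python benchmark.py -m /path/to/model.gguf -d cpu -n 1 -f pt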

tools/llm_bench/llm_bench_utils/model_utils.py

Lines changed: 6 additions & 0 deletions
@@ -250,6 +250,12 @@ def get_use_case(model_name_or_path: str | Path, task: str | None = None):
             log.info(f'==SUCCESS FOUND==: use_case: {case}, model_type: {model_name}')
             return case, model_name
         model_id = config.get("model_type").lower().replace('_', '-')
+    elif Path(model_name_or_path).suffix in '.gguf':
+        import gguf_parser
+        parser = gguf_parser.GGUFParser(model_name_or_path)
+        parser.parse()
+        if parser.metadata and parser.metadata.get('general.architecture'):
+            model_id = parser.metadata.get('general.architecture').lower()
 
     if model_id is not None:
        case, model_id = get_use_case_by_model_id(model_id, task)
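
The new branch resolves the use case for a GGUF file from its 'general.architecture' metadata key rather than from a config.json. A standalone sketch of that lookup, using the same gguf_parser calls as the code above (the file path is a placeholder):

    import gguf_parser

    # Read the GGUF metadata, as the tool does above.
    parser = gguf_parser.GGUFParser("/path/to/model.gguf")  # placeholder path
    parser.parse()
    architecture = (parser.metadata or {}).get("general.architecture")
    print(architecture)  # e.g. "llama" or "qwen2", then mapped by get_use_case_by_model_id()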

tools/llm_bench/llm_bench_utils/ov_utils.py

Lines changed: 3 additions & 5 deletions
@@ -208,15 +208,13 @@ def cb_pipeline_required(args):
 
 def create_genai_text_gen_model(model_path, device, ov_config, memory_data_collector, **kwargs):
     import openvino_genai
-    from transformers import AutoTokenizer
     from packaging.version import parse
 
-    if not (model_path / "openvino_tokenizer.xml").exists() or not (model_path / "openvino_detokenizer.xml").exists():
+    if Path(model_path).suffix != '.gguf'\
+            and (not (model_path / "openvino_tokenizer.xml").exists() or not (model_path / "openvino_detokenizer.xml").exists()):
         raise ValueError("OpenVINO Tokenizer model is not found in model directory. Please convert tokenizer using following command:\n"
                          "convert_tokenizer --with-detokenizer MODEL_DIR --output MODEL_DIR ")
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-
     config = {}
     draft_model_path = kwargs.get("draft_model", '')
     cb_config = kwargs.get("cb_config")
@@ -288,7 +286,7 @@ def get_time_list(self):
             return self.token_generation_time
     streamer = TokenStreamer(llm_pipe.get_tokenizer()) if use_streamer_metrics else None
 
-    return llm_pipe, tokenizer, end - start, streamer, True
+    return llm_pipe, None, end - start, streamer, True
 
 
 def convert_ov_tokenizer(tokenizer_path):
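
With the relaxed check, the GenAI branch no longer builds a Hugging Face tokenizer; the pipeline's own tokenizer (llm_pipe.get_tokenizer()) drives the streamer, and the second element of the returned tuple is now None. A minimal sketch of loading a GGUF file directly with openvino_genai, assuming a GenAI build with GGUF support (the path is a placeholder):

    import openvino_genai

    # A .gguf file can be passed where a converted model directory is normally expected;
    # no openvino_tokenizer.xml / openvino_detokenizer.xml is required in that case.
    pipe = openvino_genai.LLMPipeline("/path/to/model.gguf", "CPU")  # placeholder path
    print(pipe.generate("What is OpenVINO?", max_new_tokens=32))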

tools/llm_bench/llm_bench_utils/pt_utils.py

Lines changed: 49 additions & 48 deletions
@@ -60,61 +60,62 @@ def run_torch_compile(model, backend='openvino', dynamic=None, options=None, chi
 
 def create_text_gen_model(model_path, device, memory_data_collector, **kwargs):
     model_path = Path(model_path)
-    from_pretrain_time = 0
-    if model_path.exists():
-        if model_path.is_dir() and len(os.listdir(model_path)) != 0:
-            log.info(f'Load text model from model path:{model_path}')
-            model_class = kwargs['use_case'].pt_cls
-            token_class = kwargs['use_case'].tokenizer_cls
-            if kwargs.get("mem_consumption"):
-                memory_data_collector.start()
-            start = time.perf_counter()
-            trust_remote_code = False
-            try:
-                model = model_class.from_pretrained(model_path, trust_remote_code=trust_remote_code)
-            except Exception:
-                start = time.perf_counter()
-                trust_remote_code = True
-                model = model_class.from_pretrained(model_path, trust_remote_code=trust_remote_code)
-            tokenizer = token_class.from_pretrained(model_path, trust_remote_code=trust_remote_code)
-            end = time.perf_counter()
-            from_pretrain_time = end - start
-            if kwargs.get("mem_consumption"):
-                memory_data_collector.stop_and_collect_data('from_pretrained_phase')
-                memory_data_collector.log_data(compilation_phase=True)
-        else:
-            raise RuntimeError(f'==Failure ==: model path:{model_path} is not directory or directory is empty')
-    else:
+    is_gguf_model = model_path.suffix == '.gguf'
+    if not model_path.exists():
         raise RuntimeError(f'==Failure ==: model path:{model_path} is not exist')
+    if not is_gguf_model and not (model_path.is_dir() and len(os.listdir(model_path)) != 0):
+        raise RuntimeError(f'==Failure ==: model path:{model_path} is not directory or directory is empty')
+    if not device:
+        raise RuntimeError('==Failure ==: no device to load')
+
+    log.info(f'Load text model from model path:{model_path}')
+    model_class = kwargs['use_case'].pt_cls
+    token_class = kwargs['use_case'].tokenizer_cls
+    if kwargs.get("mem_consumption"):
+        memory_data_collector.start()
+    start = time.perf_counter()
+    load_model_kwargs = {'trust_remote_code': False}
+    if is_gguf_model:
+        load_model_kwargs |= {'gguf_file': str(model_path)}
+        model_path = model_path.parent
+    try:
+        model = model_class.from_pretrained(model_path, **load_model_kwargs)
+    except Exception:
+        start = time.perf_counter()
+        load_model_kwargs['trust_remote_code'] = True
+        model = model_class.from_pretrained(model_path, **load_model_kwargs)
+    tokenizer = token_class.from_pretrained(model_path, **load_model_kwargs)
+    end = time.perf_counter()
+    from_pretrain_time = end - start
+    if kwargs.get("mem_consumption"):
+        memory_data_collector.stop_and_collect_data('from_pretrained_phase')
+        memory_data_collector.log_data(compilation_phase=True)
 
     log.info(f'model path:{model_path}, from pretrained time: {from_pretrain_time:.2f}s')
 
-    if device is not None:
-        gptjfclm = 'transformers.models.gptj.modeling_gptj.GPTJForCausalLM'
-        lfclm = 'transformers.models.llama.modeling_llama.LlamaForCausalLM'
-        bfclm = 'transformers.models.bloom.modeling_bloom.BloomForCausalLM'
-        gpt2lmhm = 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'
-        gptneoxclm = 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForCausalLM'
-        chatglmfcg = 'transformers_modules.pytorch_original.modeling_chatglm.ChatGLMForConditionalGeneration'
-        real_base_model_name = str(type(model)).lower()
-        log.info(f'Real base model={real_base_model_name}')
-        # bfclm will trigger generate crash.
+    gptjfclm = 'transformers.models.gptj.modeling_gptj.GPTJForCausalLM'
+    lfclm = 'transformers.models.llama.modeling_llama.LlamaForCausalLM'
+    bfclm = 'transformers.models.bloom.modeling_bloom.BloomForCausalLM'
+    gpt2lmhm = 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'
+    gptneoxclm = 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForCausalLM'
+    chatglmfcg = 'transformers_modules.pytorch_original.modeling_chatglm.ChatGLMForConditionalGeneration'
+    real_base_model_name = str(type(model)).lower()
+    log.info(f'Real base model={real_base_model_name}')
+    # bfclm will trigger generate crash.
 
-        # If the device is set to GPU there's a need to substitute it with 'cuda' so it will be accepted by PyTorch
-        if device.upper() == 'GPU':
-            device = torch.device('cuda') if torch.cuda.is_available() else log.info('CUDA device is unavailable')
-        else:
-            device = torch.device(device.lower())
-        log.info(f'Torch device was set to: {device}')
+    # If the device is set to GPU there's a need to substitute it with 'cuda' so it will be accepted by PyTorch
+    if device.upper() == 'GPU':
+        device = torch.device('cuda') if torch.cuda.is_available() else log.info('CUDA device is unavailable')
+    else:
+        device = torch.device(device.lower())
+    log.info(f'Torch device was set to: {device}')
 
-        if any(x in real_base_model_name for x in [gptjfclm, lfclm, bfclm, gpt2lmhm, gptneoxclm, chatglmfcg]):
-            model = set_bf16(model, device, **kwargs)
-        else:
-            if len(kwargs['config']) > 0 and kwargs['config'].get('PREC_BF16') and kwargs['config']['PREC_BF16'] is True:
-                log.info('Param [bf16/prec_bf16] will not work.')
-            model.to(device)
+    if any(x in real_base_model_name for x in [gptjfclm, lfclm, bfclm, gpt2lmhm, gptneoxclm, chatglmfcg]):
+        model = set_bf16(model, device, **kwargs)
     else:
-        raise RuntimeError('==Failure ==: no device to load')
+        if len(kwargs['config']) > 0 and kwargs['config'].get('PREC_BF16') and kwargs['config']['PREC_BF16'] is True:
+            log.info('Param [bf16/prec_bf16] will not work.')
+        model.to(device)
 
     bench_hook = hook_common.get_bench_hook(kwargs['num_beams'], model)
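
On the PyTorch side, the rewritten loader passes the GGUF file to transformers through the gguf_file keyword and points from_pretrained at the parent directory. A standalone sketch of that loading pattern (names and paths are placeholders; it assumes a transformers version with GGUF support and the gguf package installed):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    gguf_dir = "/path/to/model_dir"   # placeholder: directory that contains the .gguf file
    gguf_name = "model-q4_0.gguf"     # placeholder file name

    # transformers dequantizes the GGUF weights on load and returns a regular PyTorch model.
    model = AutoModelForCausalLM.from_pretrained(gguf_dir, gguf_file=gguf_name)
    tokenizer = AutoTokenizer.from_pretrained(gguf_dir, gguf_file=gguf_name)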

tools/llm_bench/requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -18,3 +18,5 @@ librosa # For Whisper
 matplotlib
 jinja2>=3.1.0
 scipy
+gguf_parser
+gguf>=0.10
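
The two new dependencies serve the two code paths: gguf_parser reads GGUF metadata for use-case detection in model_utils.py, while gguf>=0.10 is what transformers uses to read and dequantize GGUF checkpoints on the PyTorch path. Both are installed with the rest of the tool's requirements:

    pip install -r tools/llm_bench/requirements.txt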
