@@ -17,21 +17,15 @@
 import transformers
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 import habana_frameworks.torch.core as htcore
-import numpy as np
-import lm_eval
-import lm_eval.tasks
-import lm_eval.evaluator
 from accelerate import init_empty_weights
-from utils import itrex_bootstrap_stderr, show_msg, save_to_excel
+from utils import show_msg, eval_func


 torch.set_grad_enabled(False)
 htcore.hpu_set_env()
 torch.device('hpu')


-# to avoid out-of-memory caused by Popen for large language models.
-lm_eval.metrics.bootstrap_stderr = itrex_bootstrap_stderr


 parser = argparse.ArgumentParser()
@@ -52,6 +46,7 @@
 parser.add_argument("--precision", type=str, default='fp8_e4m3',
                     help="Select from ['fp8_e4m3', 'fp8_e5m2', 'bf16', 'fp16'], \
                          ['bf16', 'fp16'] only work with cast approach")
+parser.add_argument("--autotune", action="store_true")
 parser.add_argument("--accuracy", action="store_true")
 parser.add_argument("--performance", action="store_true")
 parser.add_argument("--generate", action="store_true")
@@ -182,8 +177,9 @@
 ### dynamic & static quantization ###
 if args.approach in ["dynamic", "static"] and not args.load:
     print("device:", next(user_model.parameters()).device)
-    from neural_compressor.torch.quantization.config import FP8Config, get_default_fp8_config
-    from neural_compressor.torch.quantization import quantize
+    from neural_compressor.torch.quantization import (
+        quantize, autotune, FP8Config, get_default_fp8_config, TuningConfig, get_default_fp8_config_set
+    )
     dtype = args.precision
     if args.approach == "dynamic":
         from neural_compressor.torch.algorithms.habana_fp8 import quantize_dynamic
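The new `--autotune` flag pairs with the `autotune`, `TuningConfig`, and `get_default_fp8_config_set` imports added above. A minimal sketch of how that path is typically wired in the `neural_compressor.torch` 3.x API — the `eval_fn` body, the `user_model` placeholder, and the exact call site are assumptions for illustration, not taken from this patch:

```python
from neural_compressor.torch.quantization import (
    autotune, TuningConfig, get_default_fp8_config_set
)

def eval_fn(model) -> float:
    # Accuracy callback the tuner maximizes; in this script it would
    # presumably wrap utils.eval_func / lm_eval. Placeholder value here.
    return 1.0

# Sweep the default FP8 configuration set and keep the best-scoring model.
# `user_model` stands for the model the script has already loaded.
tune_config = TuningConfig(config_set=get_default_fp8_config_set())
best_model = autotune(model=user_model, tune_config=tune_config, eval_fn=eval_fn)
```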
@@ -300,106 +296,7 @@ def replace_torch_mm_bmm():


 if args.accuracy:
-
-    class HabanaModelAdapter(lm_eval.base.BaseLM):
-        def __init__(self, tokenizer, model, args, options):
-            super().__init__()
-            self.tokenizer = tokenizer
-            self.model = model.eval()
-            self._batch_size = args.batch_size
-            self.buckets = list(sorted(args.buckets))
-            self.options = options
-            self._device = "hpu"
-            torch.set_grad_enabled(False)
-
-        @property
-        def eot_token_id(self):
-            return self.model.config.eos_token_id
-
-        @property
-        def max_length(self):
-            return self.buckets[-1]
-
-        @property
-        def max_gen_toks(self):
-            raise NotImplementedError()
-
-        @property
-        def batch_size(self):
-            return self._batch_size
-
-        @property
-        def device(self):
-            # We need to do padding ourselves, otherwise we'll end up with recompilations
-            # Returning 'cpu' to keep tensors on CPU in lm_eval code
-            return 'cpu'  # 'hpu'
-
-        def tok_encode(self, string):
-            if re.search("chatglm3", args.model.lower()) or re.search("llama", args.model.lower()):
-                string = string.lstrip()
-            return self.tokenizer.encode(string, add_special_tokens=False)
-
-        def tok_decode(self, tokens):
-            return self.tokenizer.decode(tokens, skip_special_tokens=True)
-
-        def _model_generate(self, context, max_length, eos_token_id):
-            raise NotImplementedError()
-
-        def find_bucket(self, length):
-            return [b for b in self.buckets if b >= length][0]
-
-        def _model_call(self, inps):
-            seq_length = inps.shape[-1]
-            padding_length = 0
-            bucket_length = self.find_bucket(seq_length)
-            padding_length = bucket_length - seq_length
-            inps = F.pad(inps, (0, padding_length), value=self.model.config.pad_token_id)
-            logits = self.model(inps.to(self._device))["logits"].cpu()
-
-            if padding_length > 0:
-                logits = logits[:, :-padding_length, :]
-            logits = logits.to(torch.float32)
-            return logits
-
-    lm_tasks = lm_eval.tasks.get_task_dict(args.tasks)
-    options = None
-    lm = HabanaModelAdapter(tokenizer, user_model, args, options)
-
-    eval_start = time.perf_counter()
-    if args.approach == "cast":
-        from neural_compressor.torch.amp import autocast
-        if args.precision == "fp8_e4m3":
-            dtype = torch.float8_e4m3fn
-        elif args.precision == "fp8_e5m2":
-            dtype = torch.float8_e5m2
-        elif args.precision == "fp16":
-            dtype = torch.float16
-        elif args.precision == "bf16":
-            dtype = torch.bfloat16
-        with autocast('hpu', dtype=dtype):
-            results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit)
-    else:
-        results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit)
-    print(lm_eval.evaluator.make_table(results))
-    eval_end = time.perf_counter()
-    print("Duration:", eval_end - eval_start)
-    results['args'] = vars(args)
-    results['duration'] = eval_end - eval_start
-
-
-    dumped = json.dumps(results, indent=2)
-    accu_dict = {}
-    case_name = args.approach + "-" + args.precision
-    for task_name in args.tasks:
-        if task_name == "wikitext":
-            print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity"]), flush=True)
-            accu_dict[task_name] = [args.model, case_name, results["results"][task_name]["word_perplexity"]]
-        else:
-            print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc"]), flush=True)
-            accu_dict[task_name] = [args.model, case_name, results["results"][task_name]["acc"]]
-    if args.dump_to_excel and local_rank in [-1, 0]:
-        save_to_excel(accu_dict)
-
+    eval_func(user_model, tokenizer=tokenizer, args=args)

 # dump final message of HPU
 show_msg()
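All of the lm_eval plumbing removed here (the `HabanaModelAdapter` and the surrounding cast/timing/reporting logic) now sits behind `utils.eval_func`. A rough sketch of the helper's assumed interface, with the adapter passed in rather than reimplemented — names and structure are illustrative, not the actual `utils` code:

```python
import time

import lm_eval
import lm_eval.evaluator
import lm_eval.tasks


def eval_func(user_model, tokenizer=None, args=None, lm_adapter_cls=None):
    """Evaluate `user_model` on `args.tasks` and print an lm_eval results table.

    `lm_adapter_cls` stands in for the HabanaModelAdapter that this patch
    moves out of the main script.
    """
    lm_tasks = lm_eval.tasks.get_task_dict(args.tasks)
    lm = lm_adapter_cls(tokenizer, user_model, args, None)
    start = time.perf_counter()
    results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit)
    print(lm_eval.evaluator.make_table(results))
    print("Duration:", time.perf_counter() - start)
    return results
```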