
Commit 0a3d4bd

improve HPU usage (#1643)
Refine the example, add HPU to the auto accelerator, and fix a bug.

Signed-off-by: xin3he <[email protected]>
1 parent d4bcdd4 commit 0a3d4bd

5 files changed, +179 −116 lines
@@ -1,5 +1,7 @@
 transformers
 datasets
+accelerate
 SentencePiece
-intel_extension_for_transformers
-lm_eval
+lm_eval==0.3.0
+openpyxl
+einops

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/run_llm.py

+6 −109

@@ -17,21 +17,15 @@
 import transformers
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 import habana_frameworks.torch.core as htcore
-import numpy as np
-import lm_eval
-import lm_eval.tasks
-import lm_eval.evaluator
 from accelerate import init_empty_weights
-from utils import itrex_bootstrap_stderr, show_msg, save_to_excel
+from utils import show_msg, eval_func


 torch.set_grad_enabled(False)
 htcore.hpu_set_env()
 torch.device('hpu')


-# to avoid out-of-memory caused by Popen for large language models.
-lm_eval.metrics.bootstrap_stderr = itrex_bootstrap_stderr


 parser = argparse.ArgumentParser()

@@ -52,6 +46,7 @@
 parser.add_argument("--precision", type=str, default='fp8_e4m3',
                     help="Select from ['fp8_e4m3', 'fp8_e5m2', 'bf16', 'fp16'], \
                          ['bf16', 'fp16'] only work with cast approach")
+parser.add_argument("--autotune", action="store_true")
 parser.add_argument("--accuracy", action="store_true")
 parser.add_argument("--performance", action="store_true")
 parser.add_argument("--generate", action="store_true")

@@ -182,8 +177,9 @@
 ### dynamic & static quantization ###
 if args.approach in ["dynamic", "static"] and not args.load:
     print("device:", next(user_model.parameters()).device)
-    from neural_compressor.torch.quantization.config import FP8Config, get_default_fp8_config
-    from neural_compressor.torch.quantization import quantize
+    from neural_compressor.torch.quantization import (
+        quantize, autotune, FP8Config, get_default_fp8_config, TuningConfig, get_default_fp8_config_set
+    )
     dtype = args.precision
     if args.approach == "dynamic":
         from neural_compressor.torch.algorithms.habana_fp8 import quantize_dynamic

@@ -300,106 +296,7 @@ def replace_torch_mm_bmm():


 if args.accuracy:
-
-    class HabanaModelAdapter(lm_eval.base.BaseLM):
-        def __init__(self, tokenizer, model, args, options):
-            super().__init__()
-            self.tokenizer = tokenizer
-            self.model = model.eval()
-            self._batch_size = args.batch_size
-            self.buckets = list(sorted(args.buckets))
-            self.options = options
-            self._device = "hpu"
-            torch.set_grad_enabled(False)
-
-        @property
-        def eot_token_id(self):
-            return self.model.config.eos_token_id
-
-        @property
-        def max_length(self):
-            return self.buckets[-1]
-
-        @property
-        def max_gen_toks(self):
-            raise NotImplementedError()
-
-        @property
-        def batch_size(self):
-            return self._batch_size
-
-        @property
-        def device(self):
-            # We need to do padding ourselves, otherwise we'll end up with recompilations
-            # Returning 'cpu' to keep tensors on CPU in lm_eval code
-            return 'cpu'  # 'hpu'
-
-        def tok_encode(self, string):
-            if re.search("chatglm3", args.model.lower()) or re.search("llama", args.model.lower()):
-                string = string.lstrip()
-            return self.tokenizer.encode(string, add_special_tokens=False)
-
-        def tok_decode(self, tokens):
-            return self.tokenizer.decode(tokens, skip_special_tokens=True)
-
-        def _model_generate(self, context, max_length, eos_token_id):
-            raise NotImplementedError()
-
-        def find_bucket(self, length):
-            return [b for b in self.buckets if b >= length][0]
-
-        def _model_call(self, inps):
-            seq_length = inps.shape[-1]
-            padding_length = 0
-            bucket_length = self.find_bucket(seq_length)
-            padding_length = bucket_length - seq_length
-            inps = F.pad(inps, (0, padding_length), value=self.model.config.pad_token_id)
-            logits = self.model(inps.to(self._device))["logits"].cpu()
-
-            if padding_length > 0:
-                logits = logits[:, :-padding_length, :]
-            logits = logits.to(torch.float32)
-            return logits
-
-    lm_tasks = lm_eval.tasks.get_task_dict(args.tasks)
-    options = None
-    lm = HabanaModelAdapter(tokenizer, user_model, args, options)
-
-    eval_start = time.perf_counter()
-    if args.approach == "cast":
-        from neural_compressor.torch.amp import autocast
-        if args.precision == "fp8_e4m3":
-            dtype = torch.float8_e4m3fn
-        elif args.precision == "fp8_e5m2":
-            dtype = torch.float8_e5m2
-        elif args.precision == "fp16":
-            dtype = torch.float16
-        elif args.precision == "bf16":
-            dtype = torch.bfloat16
-        with autocast('hpu', dtype=dtype):
-            results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit)
-    else:
-        results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit)
-    print(lm_eval.evaluator.make_table(results))
-    eval_end = time.perf_counter()
-    print("Duration:", eval_end - eval_start)
-    results['args'] = vars(args)
-    results['duration'] = eval_end - eval_start
-
-
-    dumped = json.dumps(results, indent=2)
-    accu_dict = {}
-    case_name = args.approach + "-" + args.precision
-    for task_name in args.tasks:
-        if task_name == "wikitext":
-            print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity"]), flush=True)
-            accu_dict[task_name] = [args.model, case_name, results["results"][task_name]["word_perplexity"]]
-        else:
-            print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc"]), flush=True)
-            accu_dict[task_name] = [args.model, case_name, results["results"][task_name]["acc"]]
-    if args.dump_to_excel and local_rank in [-1, 0]:
-        save_to_excel(accu_dict)
-
+    eval_func(user_model, tokenizer=tokenizer, args=args)

 # dump final message of HPU
 show_msg()
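
The hunk above imports autotune, TuningConfig, and get_default_fp8_config_set and adds an --autotune flag, but the call site sits outside the shown context. Below is a minimal sketch of how that flag could be wired, assuming the tuner accepts an eval callback that returns the single accuracy number produced by eval_func; the keyword names are assumptions, not code from this commit.

if args.autotune:
    # Assumed wiring: build a tuning config from the default FP8 config set and let the
    # tuner score each candidate model with the refactored eval_func helper.
    tune_config = TuningConfig(config_set=get_default_fp8_config_set())
    user_model = autotune(
        model=user_model,
        tune_config=tune_config,
        eval_fn=lambda model: eval_func(model, tokenizer=tokenizer, args=args),
    )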

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/utils.py

+122

@@ -33,3 +33,125 @@ def save_to_excel(dict):
     df_existing = pd.DataFrame()
     df_combined = pd.concat([df_existing, df_new], axis=0, ignore_index=True)
     df_combined.to_excel('output.xlsx', index=False, engine='openpyxl', header=True)
+
+
+def eval_func(user_model, tokenizer, args):
+    import os
+    import re
+    import time
+    import json
+    import torch
+    import habana_frameworks.torch.hpex
+    import torch.nn.functional as F
+    import lm_eval
+    import lm_eval.tasks
+    import lm_eval.evaluator
+
+    # to avoid out-of-memory caused by Popen for large language models.
+    lm_eval.metrics.bootstrap_stderr = itrex_bootstrap_stderr
+
+    class HabanaModelAdapter(lm_eval.base.BaseLM):
+        def __init__(self, tokenizer, model, args, options):
+            super().__init__()
+            self.tokenizer = tokenizer
+            self.model = model.eval()
+            self._batch_size = args.batch_size
+            self.buckets = list(sorted(args.buckets))
+            self.options = options
+            self._device = "hpu"
+            torch.set_grad_enabled(False)
+
+        @property
+        def eot_token_id(self):
+            return self.model.config.eos_token_id
+
+        @property
+        def max_length(self):
+            return self.buckets[-1]
+
+        @property
+        def max_gen_toks(self):
+            raise NotImplementedError()
+
+        @property
+        def batch_size(self):
+            return self._batch_size
+
+        @property
+        def device(self):
+            # We need to do padding ourselves, otherwise we'll end up with recompilations
+            # Returning 'cpu' to keep tensors on CPU in lm_eval code
+            return 'cpu'  # 'hpu'
+
+        def tok_encode(self, string):
+            if (
+                re.search("chatglm3", args.model.lower()) or
+                re.search("llama", args.model.lower()) or
+                re.search("mistral", args.model.lower())
+            ):
+                string = string.lstrip()
+            return self.tokenizer.encode(string, add_special_tokens=False)
+
+        def tok_decode(self, tokens):
+            return self.tokenizer.decode(tokens, skip_special_tokens=True)
+
+        def _model_generate(self, context, max_length, eos_token_id):
+            raise NotImplementedError()
+
+        def find_bucket(self, length):
+            return [b for b in self.buckets if b >= length][0]
+
+        def _model_call(self, inputs):
+            seq_length = inputs.shape[-1]
+            padding_length = 0
+            bucket_length = self.find_bucket(seq_length)
+            padding_length = bucket_length - seq_length
+            inputs = F.pad(inputs, (0, padding_length), value=self.model.config.pad_token_id)
+            logits = self.model(inputs.to(self._device))["logits"].cpu()
+
+            if padding_length > 0:
+                logits = logits[:, :-padding_length, :]
+            logits = logits.to(torch.float32)
+            return logits
+
+    lm_tasks = lm_eval.tasks.get_task_dict(args.tasks)
+    options = None
+    lm = HabanaModelAdapter(tokenizer, user_model, args, options)
+
+    eval_start = time.perf_counter()
+    if args.approach == "cast":
+        from neural_compressor.torch.amp import autocast
+        if args.precision == "fp8_e4m3":
+            dtype = torch.float8_e4m3fn
+        elif args.precision == "fp8_e5m2":
+            dtype = torch.float8_e5m2
+        elif args.precision == "fp16":
+            dtype = torch.float16
+        elif args.precision == "bf16":
+            dtype = torch.bfloat16
+        with autocast('hpu', dtype=dtype):
+            results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit)
+    else:
+        results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit)
+    print(lm_eval.evaluator.make_table(results))
+    eval_end = time.perf_counter()
+    print("Duration:", eval_end - eval_start)
+    results['args'] = vars(args)
+    results['duration'] = eval_end - eval_start
+
+    # make sure that result is dumped only once during multi-cards evaluation
+    local_rank = int(os.getenv('LOCAL_RANK', '-1'))
+    if local_rank in [-1, 0]:
+        dumped = json.dumps(results, indent=2)
+        accu_dict = {}
+        case_name = args.approach + "-" + args.precision
+        for task_name in args.tasks:
+            if task_name == "wikitext":
+                print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity"]), flush=True)
+                accu_dict[task_name] = [args.model, case_name, results["results"][task_name]["word_perplexity"]]
+            else:
+                print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc"]), flush=True)
+                accu_dict[task_name] = [args.model, case_name, results["results"][task_name]["acc"]]
+        if args.dump_to_excel:
+            save_to_excel(accu_dict)
+    return results["results"][task_name]["acc"]
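
For reference, a minimal caller-side sketch of the new helper. The Namespace fields mirror the CLI flags eval_func reads (model, approach, precision, batch_size, buckets, tasks, limit, dump_to_excel); the model id and bucket sizes are illustrative only, and the return value is the "acc" metric of the last task iterated, so single-task runs are the clearest use.

from types import SimpleNamespace
from transformers import AutoModelForCausalLM, AutoTokenizer
from utils import eval_func

model_id = "meta-llama/Llama-2-7b-hf"              # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
user_model = AutoModelForCausalLM.from_pretrained(model_id).eval().to("hpu")

eval_args = SimpleNamespace(
    model=model_id,                                # used for logging and the tok_encode model-name checks
    approach="static",
    precision="fp8_e4m3",
    batch_size=4,
    buckets=[32, 64, 128, 256],                    # padding buckets to avoid HPU recompilations
    tasks=["lambada_openai"],
    limit=None,
    dump_to_excel=False,
)
acc = eval_func(user_model, tokenizer=tokenizer, args=eval_args)
print("accuracy:", acc)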

neural_compressor/torch/algorithms/weight_only/gptq.py

+1 −2

@@ -257,8 +257,7 @@ def __init__(

         # device
         self.device = get_device(kwargs.pop("device", "auto"))
-        if str(self.model.device).startswith("cuda"):
-            self.device = self.model.device
+        self.model.to(self.device)
         self.is_ready = False

         self.export_compressed_model = export_compressed_model
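
The GPTQ quantizer no longer adopts the model's device only when it already sits on CUDA; it now resolves a device through get_device (which consults the accelerator registry extended below) and moves the model there unconditionally. A short sketch of the intended effect; the exact device string returned is an assumption.

# Sketch of the new behavior inside the GPTQ constructor.
device = get_device("auto")   # expected to resolve to e.g. "hpu:0" on Gaudi, else "cuda:0" or "cpu"
model.to(device)              # the model now always follows the detected device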

neural_compressor/torch/utils/auto_accelerator.py

+46 −3

@@ -31,7 +31,8 @@

 from neural_compressor.torch.utils import logger

-PRIORITY_CUDA = 100
+PRIORITY_HPU = 100
+PRIORITY_CUDA = 95
 PRIORITY_CPU = 90


@@ -53,8 +54,9 @@ class CPU_Accelerator:
         """

         def decorator(accelerator_cls):
-            cls.registered_accelerators.setdefault(name, {})
-            cls.registered_accelerators[name] = (accelerator_cls, priority)
+            if accelerator_cls.is_available():
+                cls.registered_accelerators.setdefault(name, {})
+                cls.registered_accelerators[name] = (accelerator_cls, priority)
             return accelerator_cls

         return decorator

@@ -202,6 +204,47 @@ def empty_cache(self):
         return torch.cuda.empty_cache()


+@register_accelerator(name="hpu", priority=PRIORITY_HPU)
+class HPU_Accelerator(Auto_Accelerator):
+    def __init__(self) -> None:
+        self._name = "hpu"
+
+    def name(self) -> str:
+        return self._name
+
+    @classmethod
+    def is_available(cls) -> bool:
+        from .environ import is_hpex_available
+
+        if is_hpex_available():
+            return torch.hpu.is_available()
+        else:
+            return False
+
+    def device_name(self, device_indx) -> str:
+        if device_indx is None:
+            return "hpu"
+        return f"hpu:{device_indx}"
+
+    def synchronize(self):
+        return torch.hpu.synchronize()
+
+    def set_device(self, device_index):
+        return torch.hpu.set_device(device_index)
+
+    def current_device(self):
+        return torch.hpu.current_device()
+
+    def current_device_name(self):
+        return "hpu:{}".format(torch.hpu.current_device())
+
+    def device(self, device_index=None):
+        return torch.hpu.device(device_index)
+
+    def empty_cache(self):
+        return torch.hpu.empty_cache()
+
+
 def auto_detect_accelerator(device_name="auto") -> Auto_Accelerator:
     # Force use the cpu on node has both cpu and gpu: `FORCE_DEVICE=cpu` python main.py ...
     # The `FORCE_DEVICE` is case insensitive.
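
With HPU registered at the highest priority and registration now gated on is_available(), auto-detection should prefer Gaudi whenever habana_frameworks (HPEX) is importable and fall back to CUDA or CPU otherwise; the FORCE_DEVICE environment variable noted in the context lines can still override the choice. A usage sketch under those assumptions:

from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator

# On a node without HPU drivers the "hpu" entry is never registered,
# so the same call silently falls back to CUDA or CPU.
accelerator = auto_detect_accelerator()       # "auto" picks the highest-priority available backend
print(accelerator.name())                     # -> "hpu" on a Gaudi node
print(accelerator.current_device_name())      # -> e.g. "hpu:0"
accelerator.synchronize()                     # dispatches to torch.hpu.synchronize()
accelerator.empty_cache()                     # dispatches to torch.hpu.empty_cache()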
