
Commit 2445811

enable auto_round format export (#2002)
Signed-off-by: Zhang, Weiwei1 <[email protected]>
1 parent 906333a

9 files changed: +65 -9 lines changed

.azure-pipelines/scripts/ut/3x/run_3x_pt.sh
Lines changed: 4 additions & 1 deletion

@@ -21,7 +21,10 @@ rm -rf torch/quantization/fp8_quant
 LOG_DIR=/neural-compressor/log_dir
 mkdir -p ${LOG_DIR}
 ut_log_name=${LOG_DIR}/ut_3x_pt.log
-pytest --cov="${inc_path}" -vs --disable-warnings --html=report.html --self-contained-html . 2>&1 | tee -a ${ut_log_name}
+
+find . -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${inc_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run.sh
+cat run.sh
+bash run.sh 2>&1 | tee ${ut_log_name}

 cp report.html ${LOG_DIR}/

.azure-pipelines/scripts/ut/env_setup.sh
Lines changed: 1 addition & 1 deletion

@@ -92,7 +92,7 @@ elif [[ $(echo "${test_case}" | grep -c "tf pruning") != 0 ]]; then
 fi

 if [[ $(echo "${test_case}" | grep -c "api") != 0 ]] || [[ $(echo "${test_case}" | grep -c "adaptor") != 0 ]]; then
-    pip install git+https://github.com/intel/auto-round.git@e24b9074af6cdb099e31c92eb81b7f5e9a4a244e
+    pip install git+https://github.com/intel/auto-round.git@5dd16fc34a974a8c2f5a4288ce72e61ec3b1410f
 fi

 # test deps

neural_compressor/torch/algorithms/weight_only/autoround.py
Lines changed: 7 additions & 1 deletion

@@ -61,6 +61,7 @@ def __init__(
         act_sym: bool = None,
         act_dynamic: bool = True,
         low_cpu_mem_usage: bool = False,
+        export_format: str = "itrex",
         **kwargs,
     ):
         """Init a AutQRoundQuantizer object.
@@ -152,6 +153,7 @@ def __init__(
         self.act_sym = act_sym
         self.act_dynamic = act_dynamic
         self.low_cpu_mem_usage = low_cpu_mem_usage
+        self.export_format = export_format

     def prepare(self, model: torch.nn.Module, *args, **kwargs):
         """Prepares a given model for quantization.
@@ -211,7 +213,11 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
         )
         model, weight_config = rounder.quantize()
         model.autoround_config = weight_config
-        model = pack_model(model, weight_config, device=self.device, inplace=True)
+        if "itrex" in self.export_format:
+            model = pack_model(model, weight_config, device=self.device, inplace=True)
+        else:  # pragma: no cover
+            model = rounder.save_quantized(output_dir=None, format=self.export_format, device=self.device, inplace=True)
+
         return model

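For context, the flow this new branch enables looks roughly like the sketch below. It mirrors the commented-out test_autoround_format_export test added later in this commit; it is a hedged illustration rather than part of the diff, the tiny GPT-J checkpoint is only borrowed from the test suite, and run_fn here is a stand-in for the test's calibration helper (the real test feeds a NeelNanda/pile-10k dataloader instead).

# Hedged sketch, not part of this diff; assumes transformers and auto_round are installed.
import transformers
from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare

model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM"
fp32_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

def run_fn(model):
    # Minimal stand-in calibration: a single forward pass on a short prompt.
    sample = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
    model(sample)

# export_format="itrex" (the default) keeps the previous behavior: pack_model() rewrites the
# tuned weights into INC WeightOnlyLinear modules. Any auto-round exporter name, e.g.
# "auto_round:gptq", makes convert() call rounder.save_quantized() instead.
quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp32",
                               export_format="auto_round:gptq")
prepared_model = prepare(model=fp32_model, quant_config=quant_config)
run_fn(prepared_model)
q_model = convert(prepared_model)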

neural_compressor/torch/algorithms/weight_only/save_load.py
Lines changed: 29 additions & 4 deletions

@@ -40,14 +40,32 @@
 device_woqlinear_mapping = {"cpu": INCWeightOnlyLinear, "hpu": HPUWeightOnlyLinear}


-def save(model, output_dir="./saved_results"):
+def save(model, output_dir="./saved_results", format=LoadFormat.DEFAULT, **kwargs):
     """Save the quantized model and config to the output path.

     Args:
         model (torch.nn.module): raw fp32 model or prepared model.
         output_dir (str, optional): output path to save.
+        format (str, optional): The format in which to save the model. Options include "default" and "huggingface". Defaults to "default".
+        kwargs: Additional arguments for specific formats. For example:
+            - safe_serialization (bool): Whether to use safe serialization when saving (only applicable for 'huggingface' format). Defaults to True.
+            - tokenizer (Tokenizer, optional): The tokenizer to be saved along with the model (only applicable for 'huggingface' format).
+            - max_shard_size (str, optional): The maximum size for each shard (only applicable for 'huggingface' format). Defaults to "5GB".
     """
     os.makedirs(output_dir, exist_ok=True)
+    if format == LoadFormat.HUGGINGFACE:  # pragma: no cover
+        config = model.config
+        quantization_config = config.quantization_config if hasattr(config, "quantization_config") else None
+        if "backend" in quantization_config and "auto_round" in quantization_config["backend"]:
+            safe_serialization = kwargs.get("safe_serialization", True)
+            tokenizer = kwargs.get("tokenizer", None)
+            max_shard_size = kwargs.get("max_shard_size", "5GB")
+            if tokenizer is not None:
+                tokenizer.save_pretrained(output_dir)
+            del model.save
+            model.save_pretrained(output_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
+        return
+
     qmodel_weight_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
     qconfig_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), QCONFIG_NAME)
     # saving process
@@ -203,8 +221,15 @@ def load_hf_format_woq_model(self):

         # get model class and config
         model_class, config = self._get_model_class_and_config()
-        self.quantization_config = config.quantization_config
-
+        self.quantization_config = config.quantization_config if hasattr(config, "quantization_config") else None
+        if (
+            "backend" in self.quantization_config and "auto_round" in self.quantization_config["backend"]
+        ):  # pragma: no cover
+            # load autoround format quantized model
+            from auto_round import AutoRoundConfig
+
+            model = model_class.from_pretrained(self.model_name_or_path)
+            return model
         # get loaded state_dict
         self.loaded_state_dict = self._get_loaded_state_dict(config)
         self.loaded_state_dict_keys = list(set(self.loaded_state_dict.keys()))
@@ -400,7 +425,7 @@ def _get_model_class_and_config(self):
         trust_remote_code = self.kwargs.pop("trust_remote_code", None)
         kwarg_attn_imp = self.kwargs.pop("attn_implementation", None)

-        config = AutoConfig.from_pretrained(self.model_name_or_path)
+        config = AutoConfig.from_pretrained(self.model_name_or_path, trust_remote_code=trust_remote_code)
         # quantization_config = config.quantization_config

         if kwarg_attn_imp is not None and config._attn_implementation != kwarg_attn_imp:  # pragma: no cover

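Taken together with the load path, the new format argument gives a save/load round trip for auto_round-exported models. The snippet below is a hedged sketch of that round trip, not part of the diff: q_model and tokenizer are assumed to come from an AutoRound run with a non-"itrex" export_format (see the sketch after autoround.py above), the output directory name is arbitrary, and passing tokenizer through model.save is assumed to forward it to save()'s kwargs as the docstring describes.

# Hedged round-trip sketch for the extended save()/load() pair.
from neural_compressor.torch.quantization import load

q_model.save(output_dir="saved_results_autoround", format="huggingface", tokenizer=tokenizer)
# load() now checks quantization_config for an "auto_round" backend and, when found, reloads
# the model directly with transformers' from_pretrained instead of INC's default WOQ checkpoint.
loaded_model = load("saved_results_autoround", format="huggingface", trust_remote_code=True)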

neural_compressor/torch/quantization/algorithm_entry.py
Lines changed: 2 additions & 0 deletions

@@ -609,6 +609,7 @@ def autoround_quantize_entry(
     scale_dtype = quant_config.scale_dtype
     quant_block_list = quant_config.quant_block_list
     low_cpu_mem_usage = quant_config.use_layer_wise
+    export_format = quant_config.export_format

     kwargs.pop("example_inputs")
@@ -636,6 +637,7 @@ def autoround_quantize_entry(
         scale_dtype=scale_dtype,
         quant_block_list=quant_block_list,
         low_cpu_mem_usage=low_cpu_mem_usage,
+        export_format=export_format,
     )
     model = quantizer.execute(model=model, mode=mode, *args, **kwargs)
     model.qconfig = configs_mapping

neural_compressor/torch/quantization/config.py
Lines changed: 3 additions & 0 deletions

@@ -939,6 +939,7 @@ def __init__(
         scale_dtype: str = "fp16",
         use_layer_wise: bool = False,
         quant_block_list: list = None,
+        export_format: str = "itrex",
         white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST,
     ):
         """Init AUTOROUND weight-only quantization config.
@@ -973,6 +974,7 @@ def __init__(
                 have different choices.
             use_layer_wise (bool): Enables quantize model per layer. Defaults to False.
             quant_block_list (list): A list whose elements are list of block's layer names to be quantized.
+            export_format (str, optional): The format used for exporting the quantized model. Defaults to "itrex".
             white_list (Optional[List[OP_NAME_OR_MODULE_TYPE]]): White list of operator names or module types.
                 Default is DEFAULT_WHITE_LIST.
         """
@@ -1005,6 +1007,7 @@ def __init__(
         self.scale_dtype = scale_dtype
         self.use_layer_wise = use_layer_wise
         self.quant_block_list = quant_block_list
+        self.export_format = export_format
         self._post_init()

     @classmethod

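A short, hedged illustration of the new config field (assuming AutoRoundConfig is exported by neural_compressor.torch.quantization, as it is used in the test file); the nsamples/seqlen/iters values are only borrowed from the tests.

# Hedged illustration of the new export_format config field.
from neural_compressor.torch.quantization import AutoRoundConfig

cfg_default = AutoRoundConfig(nsamples=32, seqlen=10, iters=10)
assert cfg_default.export_format == "itrex"  # default: repack into INC WeightOnlyLinear modules

# Non-default values are read by autoround_quantize_entry and forwarded to AutoRoundQuantizer,
# e.g. the GPTQ-compatible auto_round exporter used by the commented-out new test.
cfg_gptq = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, export_format="auto_round:gptq")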

test/3x/torch/quantization/weight_only/test_autoround.py
Lines changed: 17 additions & 0 deletions

@@ -40,6 +40,7 @@ def run_fn(model, dataloader):

 @pytest.mark.skipif(not auto_round_installed, reason="auto_round module is not installed")
 class TestAutoRound:
+    @classmethod
     def setup_class(self):
         self.gptj = transformers.AutoModelForCausalLM.from_pretrained(
             "hf-internal-testing/tiny-random-GPTJForCausalLM",
@@ -52,6 +53,7 @@ def setup_class(self):
         self.dataloader = get_dataloader(tokenizer, 32, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=10)
         self.label = self.gptj(self.inp)[0]

+    @classmethod
     def teardown_class(self):
         shutil.rmtree("saved_results", ignore_errors=True)

@@ -159,3 +161,18 @@ def test_conv1d(self):
         out2 = q_model(**encoded_input)[0]
         assert torch.allclose(out2, out1, atol=0.01), "Accuracy gap atol > 0.01 is unexpected."
         assert isinstance(q_model.h[0].attn.c_attn, WeightOnlyLinear), "loading compressed model failed."
+
+    # def test_autoround_format_export(self):
+    #     from neural_compressor.torch.quantization import load
+    #     from auto_gptq.nn_modules.qlinear.qlinear_triton import QuantLinear
+    #     gpt_j_model = copy.deepcopy(self.gptj)
+    #     quant_config = AutoRoundConfig(nsamples=32, seqlen=10, iters=10, scale_dtype="fp32", export_format="auto_round:gptq")
+    #     logger.info(f"Test AutoRound with config {quant_config}")
+    #     model = prepare(model=gpt_j_model, quant_config=quant_config)
+    #     run_fn(model, self.dataloader)
+    #     q_model = convert(model)
+    #     out = q_model(self.inp)[0]
+    #     assert torch.allclose(out, self.label, atol=1e-1)
+    #     assert isinstance(q_model.transformer.h[0].attn.k_proj, QuantLinear), "packing model failed."
+    #     q_model.save(output_dir="saved_results_tiny-random-GPTJForCausalLM", format="huggingface")
+    #     loaded_model = load("saved_results_tiny-random-GPTJForCausalLM", format="huggingface", trust_remote_code=True)

test/3x/torch/requirements.txt
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-auto_round @ git+https://github.com/intel/auto-round.git@e24b9074af6cdb099e31c92eb81b7f5e9a4a244e
+auto_round @ git+https://github.com/intel/auto-round.git@5dd16fc34a974a8c2f5a4288ce72e61ec3b1410f
 expecttest
 intel_extension_for_pytorch
 numpy

test/requirements.txt
Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 --find-links https://download.pytorch.org/whl/torch_stable.html
 accelerate==0.21.0
-auto-round @ git+https://github.com/intel/auto-round.git@e24b9074af6cdb099e31c92eb81b7f5e9a4a244e
+auto-round @ git+https://github.com/intel/auto-round.git@5dd16fc34a974a8c2f5a4288ce72e61ec3b1410f
 dynast==1.6.0rc1
 horovod
 intel-extension-for-pytorch
