Commit cf1de74

Support automatic detection of amp and device for AutoRound [2.x] (#1649)
Signed-off-by: Kaihui-intel <[email protected]>
1 parent e7dfa63 commit cf1de74
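
This commit removes the explicit amp and device knobs from INC's AutoRound path and lets the pinned auto-round library detect both at runtime. For intuition, a minimal sketch of that kind of detection logic (detect_device and detect_amp are hypothetical names, not auto-round's actual API):

import torch

def detect_device(device=None):
    # Hypothetical: honor an explicit choice, otherwise prefer CUDA when present.
    if device is not None:
        return device
    return "cuda:0" if torch.cuda.is_available() else "cpu"

def detect_amp(device):
    # Hypothetical assumption: mixed precision only pays off on CUDA devices.
    return str(device).startswith("cuda")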

File tree

7 files changed (+7 -110 lines)

.azure-pipelines/scripts/ut/env_setup.sh

+1 -1

@@ -99,7 +99,7 @@ elif [[ $(echo "${test_case}" | grep -c "tf pruning") != 0 ]]; then
 fi

 if [[ $(echo "${test_case}" | grep -c "api") != 0 ]] || [[ $(echo "${test_case}" | grep -c "adaptor") != 0 ]]; then
-    pip install git+https://github.com/intel/auto-round.git@b65830f3f6cb32d92a5c8ba5f80ace12d517357b
+    pip install git+https://github.com/intel/auto-round.git@6815f8b66be456ecbef2d0beb33dbc4efeefdc04
 fi

 # test deps

neural_compressor/adaptor/pytorch.py

-8

@@ -4918,8 +4918,6 @@ def autoround_quantize(self, model, tune_cfg, dataloader):
         # auto round recipes
         enable_full_range = self.recipes["autoround_args"].get("enable_full_range", False)
         bs = self.recipes["autoround_args"].get("bs", 8)
-        amp = self.recipes["autoround_args"].get("amp", True)
-        device = self.recipes["autoround_args"].get("device", "cpu")
         lr_scheduler = self.recipes["autoround_args"].get("lr_scheduler", None)
         dataset_name = self.recipes["autoround_args"].get("dataset_name", "NeelNanda/pile-10k")
         dataset_split = self.recipes["autoround_args"].get("dataset_split", "train")
@@ -4939,8 +4937,6 @@ def autoround_quantize(self, model, tune_cfg, dataloader):
         dynamic_max_gap = self.recipes["autoround_args"].get("dynamic_max_gap", -1)
         data_type = self.recipes["autoround_args"].get("data_type", "int")  ##only support data_type
         scale_dtype = self.recipes["autoround_args"].get("scale_dtype", "fp16")
-        # autoround export
-        export_args = self.recipes["autoround_args"].get("export_args", {"format": None})

         model, autoround_config = autoround_quantize(
             model=model,
@@ -4951,8 +4947,6 @@ def autoround_quantize(self, model, tune_cfg, dataloader):
             weight_config=weight_config,
             enable_full_range=enable_full_range,
             bs=bs,
-            amp=amp,
-            device=device,
             lr_scheduler=lr_scheduler,
             dataloader=dataloader,
             dataset_name=dataset_name,
@@ -4973,8 +4967,6 @@ def autoround_quantize(self, model, tune_cfg, dataloader):
             dynamic_max_gap=dynamic_max_gap,
             data_type=data_type,
             scale_dtype=scale_dtype,
-            # export arguments
-            export_args=export_args,
         )
         return model, autoround_config
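
With the recipe keys gone, a 2.x user config shrinks accordingly; a minimal sketch mirroring the values in the updated unit test below (amp and device are simply omitted and resolved automatically):

from neural_compressor import PostTrainingQuantConfig

conf = PostTrainingQuantConfig(
    approach="weight_only",
    recipes={
        "autoround_args": {
            "n_samples": 20,   # calibration samples
            "seq_len": 10,     # calibration sequence length
            "iters": 10,       # tuning iterations
            "scale_dtype": "fp32",
        },
    },
)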

neural_compressor/adaptor/torch_utils/weight_only.py

+3 -13

@@ -682,7 +682,7 @@ def autoround_quantize(
     enable_full_range: bool = False,  ##for symmetric, TODO support later
     bs: int = 8,
     amp: bool = True,
-    device="cuda:0",
+    device=None,
     lr_scheduler=None,
     dataloader=None,  ## to support later
     dataset_name: str = "NeelNanda/pile-10k",
@@ -703,7 +703,6 @@ def autoround_quantize(
     dynamic_max_gap: int = -1,
     data_type: str = "int",  ##only support data_type
     scale_dtype="fp16",
-    export_args: dict = {"format": None, "inplace": True},
     **kwargs,
 ):
     """Run autoround weight-only quantization.
@@ -726,8 +725,8 @@ def autoround_quantize(
         }
         enable_full_range (bool): Whether to enable full range quantization (default is False).
         bs (int): Batch size for training (default is 8).
-        amp (bool): Whether to use automatic mixed precision (default is True).
-        device: The device to be used for tuning (default is "cuda:0").
+        amp (bool): Whether to use automatic mixed precision (default is True). Automatically detected and set.
+        device: The device to be used for tuning (default is None). Automatically detected and set.
         lr_scheduler: The learning rate scheduler to be used.
         dataloader: The dataloader for input data (to be supported in future).
         dataset_name (str): The default dataset name (default is "NeelNanda/pile-10k").
@@ -747,8 +746,6 @@ def autoround_quantize(
         not_use_best_mse (bool): Whether to use mean squared error (default is False).
         dynamic_max_gap (int): The dynamic maximum gap (default is -1).
         data_type (str): The data type to be used (default is "int").
-        export_args (dict): The arguments for exporting compressed model, default is {"format": None, "inplace": True}.
-            Supported format: "itrex", "auto_gptq".
         **kwargs: Additional keyword arguments.

     Returns:
@@ -790,11 +787,4 @@ def autoround_quantize(
         **kwargs,
     )
     qdq_model, weight_config = rounder.quantize()
-    if export_args["format"] is not None:
-        output_dir = export_args.get("output_dir", None)
-        format = export_args["format"]
-        inplace = export_args.get("inplace", True)
-        use_triton = export_args.get("use_triton", False)
-        model = rounder.save_quantized(output_dir=output_dir, format=format, inplace=inplace, use_triton=use_triton)
-        return model, weight_config
     return qdq_model, weight_config
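
Since device now defaults to None and amp is auto-detected, direct calls also drop both arguments; a sketch mirroring the updated test (model, tokenizer, and lm_input are assumed to exist already):

from neural_compressor.adaptor.torch_utils.weight_only import autoround_quantize

qdq_model, weight_config = autoround_quantize(
    model=model,
    tokenizer=tokenizer,
    n_samples=20,
    seqlen=10,
    iters=10,
    scale_dtype="fp32",
)
out = qdq_model(lm_input)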

neural_compressor/model/torch_model.py

+2 -2

@@ -559,9 +559,9 @@ def export_compressed_model(
             new_module.pack(int_weight, gptq_scale, gptq_zp, m.bias, gptq_perm)
             set_module(self.model, k, new_module)
         elif autoround_config:
-            from auto_round.export.export_to_itrex import compress_model  # pylint: disable=E0401
+            from auto_round.export.export_to_itrex.export import _pack_model  # pylint: disable=E0401

-            self.model = compress_model(
+            self.model = _pack_model(
                 self.model,
                 weight_config=autoround_config,
                 enable_full_range=enable_full_range,
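
Note: _pack_model lives in auto-round's ITREX export module and replaces the older compress_model entry point; it only exists at the newer pin, which is presumably why the CI script above and test/requirements.txt below bump the auto-round commit in the same change.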

test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py

-55

@@ -778,11 +778,9 @@ def test_AutoRound_quant(self):
             recipes={
                 "autoround_args": {
                     "n_samples": 20,
-                    "amp": False,
                     "seq_len": 10,
                     "iters": 10,
                     "scale_dtype": "fp32",
-                    "device": "cpu",
                 },
             },
         )
@@ -809,59 +807,6 @@ def test_AutoRound_quant(self):
         self.assertTrue(isinstance(q_model.model.transformer.h[0].attn.k_proj, WeightOnlyLinear))
         self.assertTrue(isinstance(export_model.transformer.h[0].attn.k_proj, WeightOnlyLinear))

-        fp32_model = copy.deepcopy(self.gptj)
-
-        conf = PostTrainingQuantConfig(
-            approach="weight_only",
-            op_type_dict={
-                ".*": {  # re.match
-                    "weight": {
-                        "dtype": "int",
-                        "bits": 4,
-                        "group_size": 32,  # -1 (per-channel)
-                        "scheme": "sym",
-                        "algorithm": "AUTOROUND",
-                    },
-                },
-            },
-            op_name_dict={
-                ".*lm_head": {  # re.match
-                    "weight": {"dtype": "fp32"},
-                },
-            },
-            recipes={
-                "autoround_args": {
-                    "n_samples": 20,
-                    "amp": False,
-                    "seq_len": 10,
-                    "iters": 10,
-                    "scale_dtype": "fp32",
-                    "device": "cpu",
-                    "export_args": {"format": "itrex", "inplace": False},
-                },
-            },
-        )
-        """All export arguments.
-
-        "export_args": {
-            "format": "itrex",  # "itrex", "auto_gptq", default is None
-            "output_dir": None,  # saved path
-            "inplace": False,
-            "use_triton": False,
-        }
-        """
-        input = torch.ones([1, 512], dtype=torch.long)
-        fp32_model = copy.deepcopy(self.gptj)
-        out1 = fp32_model(input)
-        export_model = quantization.fit(
-            fp32_model,
-            conf,
-            calib_dataloader=dataloader,
-        )
-        out2 = export_model.model(input)
-        self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-01))
-        self.assertTrue(isinstance(export_model.model.transformer.h[0].attn.k_proj, WeightOnlyLinear))
-


 if __name__ == "__main__":
     unittest.main()

test/quantization/test_weight_only_quantization.py

-30

@@ -278,8 +278,6 @@ def test_autoround_int_quant(self):
             model=model,
             tokenizer=self.tokenizer,
             n_samples=20,
-            device=device,
-            amp=False,
             seqlen=10,
             iters=10,
             scale_dtype="fp32",
@@ -292,34 +290,6 @@ def test_autoround_int_quant(self):
         self.assertFalse(torch.all(out1[0] == out2[0]))
         self.assertTrue(torch.all(out2[0] == out3[0]))

-    def test_autoround_export(self):
-        model = copy.deepcopy(self.gptj)
-        device = "cpu"
-        model = model
-        out1 = model(self.lm_input)
-        export_model, weight_config1 = autoround_quantize(
-            model=model,
-            tokenizer=self.tokenizer,
-            n_samples=20,
-            device=device,
-            amp=False,
-            seqlen=10,
-            iters=10,
-            scale_dtype="fp32",
-            export_args={"format": "itrex", "inplace": True},
-        )
-        export_model = export_model
-        model = model
-        out2 = model(self.lm_input)
-        out3 = export_model(self.lm_input)
-        self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1)))
-        self.assertFalse(torch.all(out1[0] == out2[0]))
-        self.assertTrue(torch.all(out2[0] == out3[0]))
-
-        from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear
-
-        self.assertTrue(isinstance(export_model.transformer.h[0].attn.k_proj, WeightOnlyLinear))
-


 if __name__ == "__main__":
     unittest.main()

test/requirements.txt

+1 -1

@@ -1,7 +1,7 @@
 --find-links https://download.pytorch.org/whl/torch_stable.html
 accelerate==0.21.0
 dynast==1.6.0rc1
-git+https://github.com/intel/auto-round.git@b65830f3f6cb32d92a5c8ba5f80ace12d517357b
+git+https://github.com/intel/auto-round.git@6815f8b66be456ecbef2d0beb33dbc4efeefdc04
 horovod
 intel-extension-for-pytorch
 intel-tensorflow>=2.12.0
