Commit bacc164

Support save/load API for WOQ (#1786)
Signed-off-by: Kaihui-intel <[email protected]>
Signed-off-by: chensuyue <[email protected]>
1 parent ca9f8eb commit bacc164

19 files changed: +462 -232 lines

.azure-pipelines/scripts/models/run_model_trigger_common.sh

+5

@@ -88,6 +88,11 @@ elif [ "${mode}" == "tuning" ]; then
     [[ ${output_model} ]] && tuning_cmd="${tuning_cmd} --output_model=${output_model}"

     cd ${WORK_SOURCE_DIR}/${model_src_dir}
+    # for int4 models add "--accuracy" to run tuning after quantize
+    if [[ "${model}" == *"int4"* ]]; then
+        sed -i "s|--quantize|--quantize --accuracy --int8|g" run_quant.sh
+    fi
+
     $BOLD_YELLOW && echo "workspace ${WORK_SOURCE_DIR}/${model_src_dir}" && $RESET
     $BOLD_YELLOW && echo "tuning_cmd is === ${tuning_cmd}" && $RESET
     $BOLD_YELLOW && echo "======== run tuning ========" && $RESET

New file: benchmark driver script (+159 lines)

@@ -0,0 +1,159 @@
#!/bin/bash
set -x

function main {

  init_params "$@"
  run_benchmark

}

# init params
function init_params {
  iters=100
  batch_size=16
  approach=static
  tuned_checkpoint=saved_results
  task=lambada_openai
  echo ${max_eval_samples}
  for var in "$@"
  do
    case $var in
      --topology=*)
          topology=$(echo $var |cut -f2 -d=)
      ;;
      --dataset_location=*)
          dataset_location=$(echo $var |cut -f2 -d=)
      ;;
      --input_model=*)
          input_model=$(echo $var |cut -f2 -d=)
      ;;
      --mode=*)
          mode=$(echo $var |cut -f2 -d=)
      ;;
      --batch_size=*)
          batch_size=$(echo $var |cut -f2 -d=)
      ;;
      --iters=*)
          iters=$(echo ${var} |cut -f2 -d=)
      ;;
      --int8=*)
          int8=$(echo ${var} |cut -f2 -d=)
      ;;
      --config=*)
          tuned_checkpoint=$(echo $var |cut -f2 -d=)
      ;;
      *)
          echo "Error: No such parameter: ${var}"
          exit 1
      ;;
    esac
  done

}


# run_benchmark
function run_benchmark {
    extra_cmd=''

    if [[ ${mode} == "accuracy" ]]; then
        mode_cmd=" --accuracy "
    elif [[ ${mode} == "performance" ]]; then
        mode_cmd=" --performance --iters "${iters}
    else
        echo "Error: No such mode: ${mode}"
        exit 1
    fi

    if [[ ${int8} == "true" ]]; then
        extra_cmd=$extra_cmd" --int8"
    fi
    echo $extra_cmd

    if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
        model_name_or_path="facebook/opt-125m"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
    elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_bnb" ]; then
        model_name_or_path="facebook/opt-125m"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
    elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_ggml" ]; then
        model_name_or_path="facebook/opt-125m"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length --gptq_percdamp 0.1 --gptq_actorder"
        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
    elif [ "${topology}" = "opt_125m_ipex" ]; then
        model_name_or_path="facebook/opt-125m"
        extra_cmd=$extra_cmd" --ipex"
    elif [ "${topology}" = "opt_125m_ipex_sq" ]; then
        model_name_or_path="facebook/opt-125m"
        extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5"
    elif [ "${topology}" = "llama2_7b_gptq_int4" ]; then
        model_name_or_path="meta-llama/Llama-2-7b-hf"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
    elif [ "${topology}" = "llama2_7b_gptq_int4_dq_bnb" ]; then
        model_name_or_path="meta-llama/Llama-2-7b-hf"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
    elif [ "${topology}" = "llama2_7b_gptq_int4_dq_ggml" ]; then
        model_name_or_path="meta-llama/Llama-2-7b-hf"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
    elif [ "${topology}" = "llama2_7b_ipex" ]; then
        model_name_or_path="meta-llama/Llama-2-7b-hf"
        extra_cmd=$extra_cmd" --ipex"
    elif [ "${topology}" = "llama2_7b_ipex_sq" ]; then
        model_name_or_path="meta-llama/Llama-2-7b-hf"
        extra_cmd=$extra_cmd" --ipex --sq --alpha 0.8"
    elif [ "${topology}" = "gpt_j_woq_rtn_int4" ]; then
        model_name_or_path="EleutherAI/gpt-j-6b"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
    elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then
        model_name_or_path="EleutherAI/gpt-j-6b"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
    elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then
        model_name_or_path="EleutherAI/gpt-j-6b"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
    elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then
        model_name_or_path="EleutherAI/gpt-j-6b"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
    elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_bnb" ]; then
        model_name_or_path="EleutherAI/gpt-j-6b"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
        extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
    elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_ggml" ]; then
        model_name_or_path="EleutherAI/gpt-j-6b"
        approach="weight_only"
        extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
        extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
    elif [ "${topology}" = "gpt_j_ipex" ]; then
        model_name_or_path="EleutherAI/gpt-j-6b"
        extra_cmd=$extra_cmd" --ipex"
    elif [ "${topology}" = "gpt_j_ipex_sq" ]; then
        model_name_or_path="EleutherAI/gpt-j-6b"
        extra_cmd=$extra_cmd" --ipex --sq --alpha 1.0"
    fi

    python -u run_clm_no_trainer.py \
        --model ${model_name_or_path} \
        --approach ${approach} \
        --output_dir ${tuned_checkpoint} \
        --task ${task} \
        --batch_size ${batch_size} \
        ${extra_cmd} ${mode_cmd}
}

main "$@"
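Each topology above simply picks a model and a bundle of --woq_* / --gptq_* flags for run_clm_no_trainer.py, which maps them onto a weight-only quantization config. For the GPTQ int4 topologies the mapping looks roughly like the sketch below, based on the argument names visible in the next diff; treat the GPTQConfig keyword names as assumptions about the 3.x API rather than a verbatim excerpt of the example script.

from neural_compressor.torch.quantization import GPTQConfig

# opt_125m_woq_gptq_int4:
#   --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym
#   --woq_use_mse_search --gptq_use_max_length
quant_config = GPTQConfig(
    bits=4,
    use_sym=False,        # "asym" weight scheme
    group_size=128,
    use_mse_search=True,
    use_max_length=True,  # from --gptq_use_max_length
)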

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py

+11 -21

@@ -11,6 +11,7 @@
 import datasets
 from torch.nn.functional import pad
 from torch.utils.data import DataLoader
+from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer

 parser = argparse.ArgumentParser()
 parser.add_argument(
@@ -66,7 +67,6 @@
 parser.add_argument("--woq_scheme", default="sym")
 parser.add_argument("--woq_use_mse_search", action="store_true")
 parser.add_argument("--woq_use_full_range", action="store_true")
-parser.add_argument("--woq_export_compressed_model", action="store_true")
 # =============GPTQ configs====================
 parser.add_argument("--gptq_actorder", action="store_true",
                     help="Whether to apply the activation order GPTQ heuristic.")
@@ -192,7 +192,6 @@ def evaluate(self, model):


 def get_user_model():
-    from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer
     torchscript = False
     if args.sq or args.ipex or args.woq_algo in ['AWQ', 'TEQ']:
         torchscript = True
@@ -248,7 +247,6 @@ def get_user_model():
             # TODO: add group_dim into double quant config?
             "use_full_range": args.woq_use_full_range,
             "use_mse_search": args.woq_use_mse_search,
-            "export_compressed_model": args.woq_export_compressed_model,
         }
     )
     quant_config = RTNConfig.from_dict(double_quant_config_dict)
@@ -261,7 +259,6 @@
         group_dim=args.woq_group_dim,
         use_full_range=args.woq_use_full_range,
         use_mse_search=args.woq_use_mse_search,
-        export_compressed_model=args.woq_export_compressed_model,
         use_double_quant=False,
         double_quant_bits=args.double_quant_bits,
         double_quant_dtype=args.double_quant_dtype,
@@ -298,7 +295,6 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
     double_quant_config_dict.update(
         {
             "use_mse_search": args.woq_use_mse_search,
-            "export_compressed_model": args.woq_export_compressed_model,
             "percdamp": args.gptq_percdamp,
             "act_order": args.gptq_actorder,
             "block_size": args.gptq_block_size,
@@ -313,7 +309,6 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
         use_sym=weight_sym,
         group_size=args.woq_group_size,
         use_mse_search=args.woq_use_mse_search,
-        export_compressed_model=args.woq_export_compressed_model,
         percdamp=args.gptq_percdamp,
         act_order=args.gptq_actorder,
         block_size=args.gptq_block_size,
@@ -380,24 +375,19 @@ def run_fn(model):
     user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
     run_fn(user_model)
     user_model = convert(user_model)
-    user_model.save(args.output_dir)
+    user_model.save(args.output_dir)


 # TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result
-# if args.int8 or args.int8_bf16_mixed:
-#     print("load int8 model")
-
-#     # TODO: from neural_compressor.torch.quantization import load
-#     from neural_compressor.torch.algorithms.static_quant import load
-
-#     if args.ipex:
-#         user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
-#     else:
-#         # TODO: WOQ save&load
-#         print("Int8 model loading does not support WeightOnlyQuant now.")
-#         pass
-# else:
-user_model, _ = get_user_model()
+
+if args.int8 or args.int8_bf16_mixed:
+    print("load int8 model")
+
+    from neural_compressor.torch.quantization import load
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
+else:
+    user_model, tokenizer = get_user_model()


 if args.accuracy:
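This is the user-facing half of the new WOQ save/load API: after convert(), the quantized model exposes save(), and a later benchmark run restores it with load() instead of re-quantizing. A minimal round-trip sketch, assuming the 3.x PyTorch API used in this example script (RTNConfig, prepare, convert, and load under neural_compressor.torch.quantization); the directory name is arbitrary.

import os
from transformers import AutoModelForCausalLM, AutoTokenizer
from neural_compressor.torch.quantization import RTNConfig, prepare, convert, load

output_dir = "./saved_results"

# Quantize once (RTN 4-bit weight-only, no calibration needed) and save.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
model = prepare(model, quant_config=RTNConfig(bits=4, group_size=128, use_sym=False))
model = convert(model)
model.save(output_dir)  # writes the quantized weights and the qconfig mapping

# Later, e.g. from run_benchmark.sh with --int8=true: load instead of re-quantizing.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
user_model = load(os.path.abspath(os.path.expanduser(output_dir)))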

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_quant.sh

-1

@@ -122,7 +122,6 @@ function run_tuning {
         --model ${model_name_or_path} \
         --dataset ${DATASET_NAME} \
         --quantize \
-        --accuracy \
         --approach ${approach} \
         --output_dir ${tuned_checkpoint} \
         --tasks "lambada_openai" \

neural_compressor/torch/algorithms/weight_only/gptq.py

+1 -1

@@ -1146,7 +1146,7 @@ def prepare(
     max_seq_length=2048,
     use_max_length=True,
     device=None,
-    export_compressed_model=False,
+    export_compressed_model=True,
     use_layer_wise=False,
     model_path=None,
     *args,

neural_compressor/torch/algorithms/weight_only/modules.py

+1 -1

@@ -194,7 +194,7 @@ def pack(self, int_weight, scale, zp, bias, g_idx=None):
         invperm = torch.argsort(self.g_idx)
         self.g_idx = invperm // self.group_size
         self.g_idx = self.g_idx.type(torch.int32).to(self.device)
-        assert scale.shape == self.scales.shape, "Scale shape is mismatched."
+        assert scale.shape == self.scales.shape, f"{scale.shape} != {self.scales.shape} Scale shape is mismatched."
         self.scales = scale.type(self.float_type).to(self.device)
         if not self.use_optimum_format and self.compression_dim == 0:
             int_weight = int_weight.t_().contiguous()

neural_compressor/torch/algorithms/weight_only/rtn.py

+1 -2

@@ -59,7 +59,7 @@ def convert(
     group_size=32,
     group_dim=1,
     quantile=1.0,
-    export_compressed_model=False,
+    export_compressed_model=True,
     use_full_range=False,
     use_mse_search=False,
     *args,
@@ -128,7 +128,6 @@
         use_full_range = weight_config[name]["use_full_range"]
         use_mse_search = weight_config[name]["use_mse_search"]
         use_layer_wise = weight_config[name]["use_layer_wise"]
-        export_compressed_model = weight_config[name]["export_compressed_model"]
         if export_compressed_model:
             use_optimum_format = kwargs.get("use_optimum_format", True)
         # double quant config
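Both the RTN and GPTQ entry points now default to export_compressed_model=True, and the per-layer export_compressed_model entry is dropped from the weight config, so converted WOQ models come back in the packed representation that the new save/load path serializes rather than as dequantized fp32 Linear modules. A quick, illustrative way to confirm this on a converted model; the check below is a sketch, and it assumes the packed layers are instances of a WeightOnlyLinear module (the class name is not shown in this diff).

from transformers import AutoModelForCausalLM
from neural_compressor.torch.quantization import RTNConfig, prepare, convert

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
model = prepare(model, quant_config=RTNConfig(bits=4, group_size=128))
model = convert(model)  # export_compressed_model now defaults to True

# Packed WOQ layers carry int-packed weights plus per-group scales.
packed = [n for n, m in model.named_modules() if type(m).__name__ == "WeightOnlyLinear"]
print(f"{len(packed)} packed weight-only layers, e.g. {packed[:1]}")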

New file: weight-only save/load helpers (+51 lines)

@@ -0,0 +1,51 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint:disable=import-error

import json
import os

import torch

from neural_compressor.common.utils import load_config_mapping, save_config_mapping
from neural_compressor.torch.utils import QCONFIG_NAME, WEIGHT_NAME, logger


def save(model, output_dir="./saved_results"):
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
    qconfig_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), QCONFIG_NAME)
    # saving process
    save_config_mapping(model.qconfig, qconfig_file_path)

    if hasattr(model, "gptq_config") and model.gptq_config:
        gptq_config_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), "gptq_config.json")
        with open(gptq_config_path, "w") as f:
            json.dump(model.gptq_config, f, indent=4)

    # MethodType 'save' not in state_dict
    del model.save
    torch.save(model, qmodel_file_path)

    logger.info("Save quantized model to {}.".format(qmodel_file_path))
    logger.info("Save configuration of quantized model to {}.".format(qconfig_file_path))


def load(output_dir="./saved_results"):
    qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME)
    model = torch.load(qmodel_file_path)
    logger.info("Quantized model loading successful.")
    return model
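save() persists the whole quantized model object with torch.save (after detaching the bound save method so it does not end up in the pickle) along with the qconfig mapping; GPTQ runs additionally dump their per-layer gptq_config to gptq_config.json, and load() restores the model with torch.load. A small sanity check of that contract, as a sketch that assumes the same 3.x API used in the example above; a reloaded WOQ model should produce the same outputs as the in-memory one.

import torch
from transformers import AutoModelForCausalLM
from neural_compressor.torch.quantization import RTNConfig, prepare, convert, load

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
q_model = convert(prepare(model, quant_config=RTNConfig(bits=4)))
q_model.save("./saved_results")  # whole-object torch.save plus qconfig mapping

reloaded = load("./saved_results")
input_ids = torch.ones(1, 16, dtype=torch.long)  # dummy token ids for a causal LM
with torch.no_grad():
    ref = q_model(input_ids).logits
    out = reloaded(input_ids).logits
assert torch.allclose(ref, out), "reloaded WOQ model should match the in-memory one"
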

neural_compressor/torch/algorithms/weight_only/utility.py

+2 -1

@@ -356,7 +356,8 @@ def quant_tensor(
     scale_bits = kwargs.get("double_quant_bits", 8)
     scale_scheme = kwargs.get("double_quant_scheme", "asym")
     scale_group_size = kwargs.get("double_quant_group_size", 256)
-    scale_return_int = kwargs.get("double_quant_return_int", return_int)
+    # TODO: kwargs.get("double_quant_return_int", return_int)
+    scale_return_int = kwargs.get("double_quant_return_int", False)
     orig_scale_shape = scale.shape
     scale = scale.reshape(1, -1)
     # pre-process: scale_mean
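For context, "double quant" means the per-group weight scales are themselves quantized (controlled by double_quant_bits, double_quant_scheme, and double_quant_group_size) and dequantized again before use; with this change the helper returns floating-point scales by default unless double_quant_return_int is passed explicitly. A standalone torch illustration of the idea, not the library's quant_tensor, just asymmetric 8-bit quantization of a scale vector in groups of 256:

import torch

def double_quant_scales(scale: torch.Tensor, bits: int = 8, group_size: int = 256) -> torch.Tensor:
    """Quantize a 1-D scale vector per group, then return the dequantized floats."""
    qmax = 2 ** bits - 1
    out = torch.empty_like(scale)
    for start in range(0, scale.numel(), group_size):
        group = scale[start:start + group_size]
        lo, hi = group.min(), group.max()
        step = (hi - lo).clamp(min=1e-12) / qmax                      # asymmetric: cover [lo, hi]
        q = torch.clamp(torch.round((group - lo) / step), 0, qmax)    # integer codes
        out[start:start + group_size] = q * step + lo                 # dequantized scales (new default)
    return out

scales = torch.rand(4096) * 0.05 + 1e-3   # synthetic per-group weight scales
approx = double_quant_scales(scales)
print((scales - approx).abs().max())      # small reconstruction error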
