
Commit d2bce6a

yiliu30 authored
Move autoround from generate.py to eval.py (#868)
* move autoround from generate to eval
* add llama3 back
* update the scripts
* update the scripts
* rename eval_acc.sh -> evals.sh
* update
* update

---------

Signed-off-by: yiliu30 <[email protected]>
1 parent 90c8cbd commit d2bce6a

File tree: 4 files changed (+67, −62 lines)

* torchao/_models/llama/benchmarks.sh
* torchao/_models/llama/eval.py
* torchao/_models/llama/evals.sh
* torchao/_models/llama/generate.py


torchao/_models/llama/benchmarks.sh

Lines changed: 1 addition & 7 deletions
@@ -49,9 +49,6 @@ python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --co
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization sparse-marlin --precision float16 --write_result benchmark_results.txt
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-4-64 --write_result benchmark_results.txt
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-2-8 --write_result benchmark_results.txt
-# TODO: this is an accuracy technique with same perf as int4, should be in evaluations instead of generate.py
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoround # auto-round w/o quant_lm_head
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoround-cuda-0 # auto-round w/o quant_lm_head
 
 export MODEL_REPO=meta-llama/Meta-Llama-3-8B
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision torch.float32 --write_result benchmark_results.txt
@@ -61,7 +58,4 @@ python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --co
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization fp6 --write_result benchmark_results.txt --precision float16
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization sparse-marlin --precision float16 --write_result benchmark_results.txt
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-4-64 --write_result benchmark_results.txt
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-2-8 --write_result benchmark_results.txt
-# TODO: this is an accuracy technique with same perf as int4, should be in evaluations instead of generate.py
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoround # auto-round w/o quant_lm_head
-python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoround-cuda-0 # auto-round w/o quant_lm_head
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization uintx-2-8 --write_result benchmark_results.txt

torchao/_models/llama/eval.py

Lines changed: 54 additions & 5 deletions
@@ -26,7 +26,7 @@
 from tokenizer import get_tokenizer
 import time
 from torchao.quantization.GPTQ import Int4WeightOnlyGPTQQuantizer
-from torchao._models.llama.model import prepare_inputs_for_model
+from torchao._models.llama.model import prepare_inputs_for_model, TransformerBlock
 from torchao.utils import TORCH_VERSION_AT_LEAST_2_5
 
 def run_evaluation(
@@ -122,6 +122,51 @@ def run_evaluation(
         else:
             if not TORCH_VERSION_AT_LEAST_2_5:
                 unwrap_tensor_subclass(model)
+        if "autoround" in quantization:
+            from torchao.prototype.autoround.autoround_llm import quantize_model_with_autoround_
+            from transformers import AutoTokenizer
+
+            _tokenizer = AutoTokenizer.from_pretrained(checkpoint_path.parent)
+            # parse args from quantization string:
+            # autoround-<model_device>-<quant_lm_head>-<iters>-<groupsize>-<batch_size>-<seqlen>-<nsamples>
+            _quant_args = quantization.split("-")
+            _default_quant_args = [False, 200, 128, 8, 2048, 128]
+            _model_devie = _quant_args[1] if len(_quant_args) > 1 else device
+            _quant_args = _quant_args[2:]
+            quant_lm_head, iters, groupsize, batch_size, seqlen, nsamples = [
+                int(x) for x in _quant_args
+            ] + _default_quant_args[len(_quant_args) :]
+            model = model.to(_model_devie)
+            print(
+                (
+                    f"Quantizing model with autoround(iters={iters}, groupsize={groupsize}, "
+                    f"quant_lm_head={quant_lm_head}, batch_size={batch_size}, seqlen={seqlen}, nsamples={nsamples})"
+                )
+            )
+            with torch.device(_model_devie):
+                model.setup_caches(
+                    max_batch_size=batch_size, max_seq_length=seqlen, training=True
+                )
+
+            if quant_lm_head:
+                is_target_module = (
+                    lambda mod, fqn: isinstance(mod, TransformerBlock)
+                    or "output" in fqn
+                )
+            else:
+                is_target_module = lambda mod, fqn: isinstance(mod, TransformerBlock)
+            quantize_model_with_autoround_(
+                model=model,
+                tokenizer=_tokenizer,
+                is_target_module=is_target_module,
+                bits=4,
+                seqlen=seqlen,
+                bs=batch_size,
+                iters=iters,
+                nsamples=nsamples,
+            )
+            model.to(device)
+            model.reset_caches()
 
     if compile:
         model = torch.compile(model, mode="max-autotune", fullgraph=True)
@@ -145,11 +190,15 @@ def run_evaluation(
     parser.add_argument('--limit', type=int, default=None, help='Number of eval samples to evaluate')
     parser.add_argument('--precision', type=lambda x: getattr(torch, x.split(".")[-1]), default=torch.bfloat16, help='dtype precision to use')
     parser.add_argument('--device', type=str, default="cuda", help='Device to use for evaluation')
-    parser.add_argument('-q', '--quantization', type=str,
+    parser.add_argument(
+        "-q",
+        "--quantization",
+        type=str,
         help=(
-            'Which quantization techniques to apply: int8dq, int8wo, fp6, int4wo-<groupsize>, int4wo-<groupsize>-gptq, autoquant, autoquant-int4, '+
-            'int4wo-<groupsize>-hqq, uintx-<nbits>-<groupsize>, uintx-<nbits>-<groupsize>-hqq, sparse-marlin'
-        )
+            "Which quantization techniques to apply: int8dq, int8wo, fp6, int4wo-<groupsize>, int4wo-<groupsize>-gptq, "
+            "autoquant, autoquant-int4, int4wo-<groupsize>-hqq, uintx-<nbits>-<groupsize>, uintx-<nbits>-<groupsize>-hqq, "
+            "sparse-marlin, autoround-<model_device>-<quant_lm_head>-<iters>-<groupsize>-<batch_size>-<seqlen>-<nsamples>"
+        ),
    )
    parser.add_argument('--compile', action='store_true', help='Whether to compile the model.')
    parser.add_argument('--max_length', type=int, default=None, help='Length of text to process at one time')
torchao/_models/llama/evals.sh

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder
+
+export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
+python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround # auto-round w/o quant_lm_head
+python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cuda-1 # auto-round w/ quant_lm_head
+
+export MODEL_REPO=meta-llama/Meta-Llama-3-8B
+python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cpu # auto-round w/o quant_lm_head
+python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cpu-1 # auto-round w/ quant_lm_head
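
For reference, and assuming the parsing rule shown in the eval.py hunk above, the four specs in this script decode as sketched below; this mapping is illustrative and not part of the commit.

# Decoded autoround specs from evals.sh. Fields omitted from a spec fall back to
# the eval.py defaults: iters=200, groupsize=128, batch_size=8, seqlen=2048, nsamples=128.
decoded = {
    "autoround":        {"model_device": "cuda (from --device default)", "quant_lm_head": False},
    "autoround-cuda-1": {"model_device": "cuda", "quant_lm_head": True},
    "autoround-cpu":    {"model_device": "cpu", "quant_lm_head": False},
    "autoround-cpu-1":  {"model_device": "cpu", "quant_lm_head": True},
}
for spec, cfg in decoded.items():
    print(f"{spec:<18} -> {cfg}")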

torchao/_models/llama/generate.py

Lines changed: 3 additions & 50 deletions
@@ -30,7 +30,7 @@ def device_sync(device):
 wd = Path(__file__).parent.parent.resolve()
 sys.path.append(str(wd))
 
-from torchao._models.llama.model import Transformer, prepare_inputs_for_model, TransformerBlock
+from torchao._models.llama.model import Transformer, prepare_inputs_for_model
 from torchao._models.llama.tokenizer import get_tokenizer
 
 def multinomial_sample_one_no_sync(probs_sort): # Does multinomial sampling without a cuda synchronization
@@ -227,53 +227,7 @@ def main(
         if "marlin" in quantization:
             from torchao.dtypes import MarlinSparseLayoutType
             quantize_(model, int4_weight_only(layout_type=MarlinSparseLayoutType()))
-        if "autoround" in quantization:
-            from torchao.prototype.autoround.autoround_llm import quantize_model_with_autoround_
-            from transformers import AutoTokenizer
-
-            _tokenizer = AutoTokenizer.from_pretrained(checkpoint_path.parent)
-            # parse args from quantization string:
-            # autoround-<model_device>-<quant_lm_head>-<iters>-<groupsize>-<batch_size>-<seqlen>-<nsamples>
-            # A lightweight configuration for generation benchmarking.
-            _quant_args = quantization.split("-")
-            _default_quant_args = [True, 1, 128, 1, 512, 32]
-            _model_devie = _quant_args[1] if len(_quant_args) > 1 else device
-            _quant_args = _quant_args[2:]
-            quant_lm_head, iters, groupsize, batch_size, seqlen, nsamples = [
-                int(x) for x in _quant_args
-            ] + _default_quant_args[len(_quant_args) :]
-            model = model.to(_model_devie)
-            print(
-                (
-                    f"Quantizing model with autoround(iters={iters}, groupsize={groupsize}, "
-                    f"quant_lm_head={quant_lm_head}, batch_size={batch_size}, seqlen={seqlen}, nsamples={nsamples})"
-                )
-            )
-            with torch.device(_model_devie):
-                model.setup_caches(
-                    max_batch_size=batch_size, max_seq_length=seqlen, training=True
-                )
-
-            if quant_lm_head:
-                is_target_module = (
-                    lambda mod, fqn: isinstance(mod, TransformerBlock) or "output" in fqn
-                )
-            else:
-                is_target_module = lambda mod, fqn: isinstance(mod, TransformerBlock)
-            quantize_model_with_autoround_(
-                model=model,
-                tokenizer=_tokenizer,
-                is_target_module=is_target_module,
-                bits=4,
-                seqlen=seqlen,
-                bs=batch_size,
-                iters=iters,
-                nsamples=nsamples,
-            )
-            model.to(device)
-            model.reset_caches()
-        # TODO this needs to be expanded to all of fpx so they can
-        if "fp6" in quantization:
+        if "fp6" in quantization:
             quantize_(model, fpx_weight_only(3, 2))
         if "uintx" in quantization:
             # uintx-nbits-groupsize, e.g. "uintx-2-64"
@@ -461,8 +415,7 @@ def callback(x):
     parser.add_argument('-q', '--quantization', type=str,
         help=(
             'Which quantization techniques to apply: int8dq, int8wo, fp6, int4wo-<groupsize>, int4wo-<groupsize>-hqq, autoquant, '
-            +'autoquant-int4, autoround-<model_device>-<quant_lm_head>-<iters>-<groupsize>-<batch_size>-<seqlen>-<nsamples>, '
-            +'uintx-<nbits>-<groupsize>, uintx-<nbits>-<groupsize>-hqq, sparse-marlin'
+            +'autoquant-int4, uintx-<nbits>-<groupsize>, uintx-<nbits>-<groupsize>-hqq, sparse-marlin'
         )
     )
     parser.add_argument('--kv_cache_quantization', action='store_true', help='Whether to quantize the KV cache')

0 commit comments
