
Commit 7214dff

qnn end to end flow for stories model (#3038)
Summary:
Pull Request resolved: #3038

Patch a few changes, including:

- support the bool tensor type
- support fp16 and fix the 8a8w quantization
- add two non-supported ops (slice_scatter and index_put) in common_defs.py

The stories model now works end to end.

AOT:

fp16:
```
python -m examples.models.llama2.export_llama -kv --qnn -c stories110M.pt -p params.json
```

quantized:
```
python -m examples.models.llama2.export_llama -kv --qnn --pt2e_quantize qnn_8a8w -c stories110M.pt -p params.json
```

Runtime:
```
/llama_main --model_path=llama2_fp16_qnn_2.21.pte --tokenizer_path=tokenizer.bin --prompt="Once"
```

Output:
```
Once upon a time, there was a little girl named Lily. She loved to play outside and explore the world around her. One day, she went on a walk with her mommy and they found a beautiful landscape with lots of trees and flowers. Lily said, "Mommy, this place is so pretty! Can we take a picture?" Mommy replied, "Of course, Lily! Let's take a picture to remember the original place we found." After they took the picture, they continued their walk and saw a bird flying in the sky. Lily said, "Mom
PyTorchObserver {"prompt_tokens":2,"generated_tokens":125,"model_load_start_ms":1713226585936,"model_load_end_ms":1713226586909,"inference_start_ms":1713226586909,"inference_end_ms":1713226590363,"prompt_eval_end_ms":1713226586966,"first_token_ms":1713226586994,"aggregate_sampling_time_ms":23,"SCALING_FACTOR_UNITS_PER_SECOND":1000}
I 00:00:04.436699 executorch:runner.cpp:414] Prompt Tokens: 2 Generated Tokens: 125
I 00:00:04.436703 executorch:runner.cpp:420] Model Load Time: 0.973000 (seconds)
I 00:00:04.436732 executorch:runner.cpp:430] Total inference time: 3.454000 (seconds) Rate: 36.189925 (tokens/second)
I 00:00:04.436735 executorch:runner.cpp:438] Prompt evaluation: 0.057000 (seconds) Rate: 35.087719 (tokens/second)
I 00:00:04.436739 executorch:runner.cpp:449] Generated 125 tokens: 3.397000 (seconds) Rate: 36.797174 (tokens/second)
I 00:00:04.436742 executorch:runner.cpp:457] Time to first generated token: 0.085000 (seconds)
I 00:00:04.436744 executorch:runner.cpp:464] Sampling time over 127 tokens: 0.023000 (seconds)
[INFO] [Qnn ExecuTorch]: Destroy Qnn backend parameters
[INFO] [Qnn ExecuTorch]: Destroy Qnn context
```

Note: the stories model is too small and is sensitive to quantization.

ghstack-source-id: 223199545
exported-using-ghexport

Reviewed By: mergennachin, kirklandsign

Differential Revision: D56119738

fbshipit-source-id: daf5563fe51a677f302e09ae8a9fb80e6bda72c5
(cherry picked from commit 3257c66)
1 parent 36eb9c8 commit 7214dff

File tree

3 files changed: +70 −10 lines


backends/qualcomm/builders/node_visitor.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -29,6 +29,7 @@
     QNN_uint16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UFIXED_POINT_16,
 }
 QNN_TENSOR_TYPE_MAP = {
+    torch.bool: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8,
     torch.float32: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32,
     torch.int8: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_8,
     torch.int16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_16,
```
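
For context, here is a minimal self-contained sketch of how a dtype map like `QNN_TENSOR_TYPE_MAP` is typically consumed when defining QNN tensors. The enum stand-ins and the `get_qnn_dtype` helper are illustrative assumptions, not the actual `PyQnnWrapper` bindings or `node_visitor` internals.

```python
import torch

# Stand-ins for PyQnnWrapper.Qnn_DataType_t enum members (hypothetical values;
# the real bindings expose these as enum attributes).
QNN_DATATYPE_BOOL_8 = "QNN_DATATYPE_BOOL_8"
QNN_DATATYPE_FLOAT_32 = "QNN_DATATYPE_FLOAT_32"

QNN_TENSOR_TYPE_MAP = {
    torch.bool: QNN_DATATYPE_BOOL_8,  # new in this patch
    torch.float32: QNN_DATATYPE_FLOAT_32,
}

def get_qnn_dtype(tensor: torch.Tensor) -> str:
    """Map a torch dtype to its QNN data type, failing loudly on gaps."""
    try:
        return QNN_TENSOR_TYPE_MAP[tensor.dtype]
    except KeyError:
        raise NotImplementedError(f"No QNN mapping for {tensor.dtype}") from None

# Before this patch, a bool tensor (e.g. an attention mask) had no mapping.
assert get_qnn_dtype(torch.zeros(4, dtype=torch.bool)) == QNN_DATATYPE_BOOL_8
```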

backends/qualcomm/partition/common_defs.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -13,6 +13,8 @@
     exir_ops.edge.aten.clone.default,
     exir_ops.edge.aten.index.Tensor,
     exir_ops.edge.aten.full.default,
+    exir_ops.edge.aten.slice_scatter.default,
+    exir_ops.edge.aten.index_put.default,
 ]
 
 allow_list_operator = [
```
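
Ops on this deny-list stay on the CPU fallback path instead of being delegated to QNN. A hedged sketch of that gating pattern follows; `is_node_supported` is a hypothetical name for illustration, not the real QnnPartitioner internals.

```python
# Hypothetical sketch: a node whose target appears on the deny-list is kept
# on the portable (CPU) path rather than lowered to the QNN backend.
not_supported_operator = [
    "aten.clone.default",
    "aten.index.Tensor",
    "aten.full.default",
    "aten.slice_scatter.default",  # new in this patch
    "aten.index_put.default",      # new in this patch
]

def is_node_supported(op_name: str) -> bool:
    return op_name not in not_supported_operator

assert not is_node_supported("aten.index_put.default")  # falls back to CPU
assert is_node_supported("aten.add.Tensor")             # eligible for QNN
```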

examples/models/llama2/export_llama_lib.py

Lines changed: 67 additions & 10 deletions
```diff
@@ -355,6 +355,13 @@ def build_args_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--pt2e_quantize",
         default=None,
+        choices=[
+            "xnnpack_dynamic",
+            "xnnpack_dynamic_qc4",
+            "qnn_8a8w",
+            "qnn_16a16w",
+            "qnn_16a4w",
+        ],
         help="Use PT2E quantization. Comma separated options. e.g. xnnpack_dynamic (for per channel 8 bit weight), xnnpack_dynamic_qc4 (for per channel 4 bit weight), embedding.",
     )
     parser.add_argument(
```
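
The new `choices` constraint makes argparse reject unknown quantization modes at parse time, and the `qnn_*` values are shaped so a later hunk can split them into a backend prefix and a quant config. A quick standalone check (the flag name and values match the diff; the rest is scaffolding):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--pt2e_quantize",
    default=None,
    choices=[
        "xnnpack_dynamic",
        "xnnpack_dynamic_qc4",
        "qnn_8a8w",
        "qnn_16a16w",
        "qnn_16a4w",
    ],
)

args = parser.parse_args(["--pt2e_quantize", "qnn_8a8w"])
# The QNN path later splits the value into a backend and a quant config:
backend, quant_config = args.pt2e_quantize.split("_")
assert (backend, quant_config) == ("qnn", "8a8w")

# An unknown mode now fails fast instead of being silently ignored:
# parser.parse_args(["--pt2e_quantize", "qnn_4a4w"])  # -> SystemExit
```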
```diff
@@ -615,6 +622,9 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
     if args.use_sdpa_with_kv_cache:
         transforms.append(replace_sdpa_with_custom_op)
 
+    if args.qnn and args.use_kv_cache:
+        transforms.append(replace_sdpa_with_simple_sdpa)
+        transforms.append(replace_causal_mask)
     return (
         load_llama_model(
             checkpoint=checkpoint_path,
```
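
The transform list follows the usual module-to-module rewrite contract, so order matters (SDPA is simplified before the causal mask is replaced). A minimal sketch of that contract, with hypothetical no-op transforms standing in for the real rewrites:

```python
import torch.nn as nn

# Each transform takes a model and returns a (possibly rewritten) model,
# mirroring how replace_sdpa_with_simple_sdpa and replace_causal_mask
# are appended above.
def apply_transforms(model: nn.Module, transforms) -> nn.Module:
    for transform in transforms:
        model = transform(model)  # applied left to right; order matters
    return model

# Hypothetical no-op stand-ins, for illustration only.
def replace_sdpa_with_simple_sdpa(model: nn.Module) -> nn.Module:
    return model

def replace_causal_mask(model: nn.Module) -> nn.Module:
    return model

model = apply_transforms(
    nn.Linear(4, 4), [replace_sdpa_with_simple_sdpa, replace_causal_mask]
)
```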
```diff
@@ -636,13 +646,16 @@ def _export_llama(modelname, args) -> str:  # noqa: C901
     # export_to_edge
     pt2e_quant_params = _get_pt2e_quantization_params(args)
     quantizers = get_pt2e_quantizers(pt2e_quant_params, args)
-    if args.qnn:
-        assert (
-            args.quantization_mode is None
-        ), "Currently qnn backend only supports QnnQuantizer via pt2e flow"
+    quant_dtype = None
+    if args.qnn and args.pt2e_quantize:
         try:
             # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.quantizer.quantizer`
-            from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer
+            from executorch.backends.qualcomm.quantizer.quantizer import (
+                get_16a4w_qnn_ptq_config,
+                get_default_16bit_qnn_ptq_config,
+                QnnQuantizer,
+                QuantDtype,
+            )
 
             # reset quantizers and pt2e_quant_params from xnnpack backend
             pt2e_quant_params = None
@@ -652,10 +665,41 @@ def _export_llama(modelname, args) -> str:  # noqa: C901
                 "Please install the Qualcomm backend follwing https://pytorch.org/executorch/main/build-run-qualcomm.html"
             )
 
+        backend, quant_config = args.pt2e_quantize.split("_")
+        assert (
+            backend == "qnn"
+        ), f"The quantization config is for backend {backend} instead of qnn."
         # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
         qnn_quantizer = QnnQuantizer()
         # more custom quantization are supported including 16a4w etc. default to 8bit quantized
         custom_annotations = ()
+        if quant_config == "8a8w":
+            # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
+            quant_dtype = QuantDtype.use_8a8w
+            pass
+        elif quant_config == "16a16w":
+            # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
+            quant_dtype = QuantDtype.use_16a16w
+            qnn_quantizer.add_16bit_quant_ops(qnn_quantizer.SUPPORTED_OPS)
+            # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
+            qnn_quantizer.set_bit16_op_quant_config(get_default_16bit_qnn_ptq_config())
+        elif quant_config == "16a4w":
+            # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
+            quant_dtype = QuantDtype.use_16a4w
+            qnn_quantizer.add_16bit_quant_ops(qnn_quantizer.SUPPORTED_OPS)
+            # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
+            qnn_quantizer.set_bit16_op_quant_config(get_16a4w_qnn_ptq_config())
+            qnn_quantizer.set_per_channel_weight_dtype(
+                weight_dtype_for_16bit_act="int4"
+            )
+        else:
+            raise AssertionError(
+                f"No support for quant type {quant_config}. Support 8a8w, 16a16w and 16a4w."
+            )
+
+        assert (
+            args.quantization_mode is None
+        ), "Currently qnn backend only supports QnnQuantizer via pt2e flow"
         qnn_quantizer.add_custom_quant_annotations(custom_annotations)
         quantizers.append(qnn_quantizer)
 
```

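Condensing the three branches above: 8a8w keeps the default 8-bit config, 16a16w enables 16-bit quantization on all supported ops, and 16a4w additionally narrows weights to int4 under 16-bit activations. A small restatement of that mapping (the dataclass is illustrative only; the real code configures a QnnQuantizer instance):

```python
from dataclasses import dataclass

# Illustrative summary of what each quant_config branch sets up; the real
# code calls QnnQuantizer methods rather than building a record like this.
@dataclass
class QnnQuantSetting:
    activation_bits: int
    weight_bits: int
    needs_16bit_op_config: bool

QNN_QUANT_SETTINGS = {
    "8a8w": QnnQuantSetting(8, 8, needs_16bit_op_config=False),
    "16a16w": QnnQuantSetting(16, 16, needs_16bit_op_config=True),
    "16a4w": QnnQuantSetting(16, 4, needs_16bit_op_config=True),
}

assert QNN_QUANT_SETTINGS["16a4w"].weight_bits == 4
```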
```diff
@@ -769,25 +813,38 @@ def _export_llama(modelname, args) -> str:  # noqa: C901
                 "Please install the Qualcomm backend follwing https://pytorch.org/executorch/main/build-run-qualcomm.html"
             )
 
-        # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`
-        backend_options = generate_htp_compiler_spec(use_fp16=False)
+        use_fp16 = True
+        skip_node_op_set = {}
+        if args.pt2e_quantize:
+            use_fp16 = False
+            # TODO: fix the lowering error without skipping nodes
+            # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
+            if quant_dtype == QuantDtype.use_8a8w:
+                raise NotImplementedError("8a8w for llama is still under development")
+            # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
+            elif quant_dtype == QuantDtype.use_16a16w:
+                raise NotImplementedError("16a16w for llama is still under development")
+            # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
+            elif quant_dtype == QuantDtype.use_16a4w:
+                raise NotImplementedError("16a4w for llama is still under development")
         partitioners.append(
             # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`
             QnnPartitioner(
                 # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`
                 generate_qnn_executorch_compiler_spec(
                     # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
                     soc_model=QcomChipset.SM8650,  # default to SM8650
-                    backend_options=backend_options,
+                    # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
+                    backend_options=generate_htp_compiler_spec(use_fp16=use_fp16),
                     debug=False,
                     saver=False,
                 ),
                 skip_node_id_set={},
-                skip_node_op_set={},
+                skip_node_op_set=skip_node_op_set,
             )
         )
         # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`
-        _transform(builder_exported_to_edge.export_program())
+        _transform(builder_exported_to_edge.edge_manager.exported_program())
 
         if args.generate_etrecord:
             if not builder_exported_to_edge.edge_manager:
```