intel · chensuyue · Dec 6, 2023 · Nov 28, 2023 · Nov 30, 2023 · Nov 30, 2023
diff --git a/README.md b/README.md
@@ -122,7 +122,8 @@ q_model = fit(
       </tr>
       <tr>
           <td colspan="4" align="center"><a href="./docs/source/quantization_weight_only.md">Weight-Only Quantization (INT8/INT4/FP4/NF4) </td>
-          <td colspan="4" align="center"><a href="https://github.com/intel/neural-compressor/blob/fp8_adaptor/docs/source/fp8.md">FP8 Quantization </td>
+          <td colspan="2" align="center"><a href="https://github.com/intel/neural-compressor/blob/fp8_adaptor/docs/source/fp8.md">FP8 Quantization</a></td>
+          <td colspan="2" align="center"><a href="./docs/source/quantization_layer_wise.md">Layer-Wise Quantization</a></td>
       </tr>
   </tbody>
   <thead>

diff --git a/docs/source/imgs/lwq_ort.png b/docs/source/imgs/lwq_ort.png
diff --git a/docs/source/quantization_layer_wise.md b/docs/source/quantization_layer_wise.md
@@ -0,0 +1,98 @@
+Layer Wise Quantization (LWQ)
+=====
+
+1. [Introduction](#introduction)
+
+2. [Supported Framework Model Matrix](#supported-framework-model-matrix)
+
+3. [Examples](#examples)
+
+## Introduction
+
+Large language models (LLMs) have shown exceptional performance across various tasks; meanwhile, their substantial parameter size poses significant challenges for deployment. Layer-wise quantization (LWQ) can greatly reduce the memory footprint of LLMs, usually by 80-90%, which means that users can quantize LLMs even on a single node using a GPU or CPU. The model can be quantized on memory-constrained devices, which makes quantization of huge LLMs possible.
+
+<img src="./imgs/lwq.png" width=780 height=429>
+
+*Figure 1: The process of layer-wise quantization for a PyTorch model. Grey denotes empty parameters and blue denotes parameters that need to be quantized. Every rectangle inside the model represents one layer.*
+
+<img src="./imgs/lwq_ort.png" width=900 height=400>
+
+*Figure 2: The process of layer-wise quantization for an ONNX model. The graph of the LLM is split into several parts, and each subgraph is quantized in turn.*
+
+## Supported Framework Model Matrix
+
+
+<table class="tg">
+<thead>
+  <tr>
+    <th colspan="2" style="text-align:center;vertical-align:middle">Types/Framework</th>
+    <th style="text-align:center;vertical-align:middle">PyTorch</th>
+    <th style="text-align:center;vertical-align:middle">ONNX Runtime</th>
+  </tr>
+</thead>
+<tbody>
+  <tr>
+    <td style="text-align:center;vertical-align:middle" colspan="2">W8A8 Post Training Static Quantization</td>
+    <td style="text-align:center;vertical-align:middle">&#10004;</td>
+    <td style="text-align:center;vertical-align:middle">&#10004;</td>
+  </tr>
+  <tr>
+    <td style="text-align:center;vertical-align:middle" rowspan="4">Weight-only Quantization</td>
+    <td style="text-align:center;vertical-align:middle">RTN</td>
+    <td style="text-align:center;vertical-align:middle">&#10004;</td>
+    <td style="text-align:center;vertical-align:middle" rowspan="4">&#10005;</td>
+  </tr>
+  <tr>
+    <td style="text-align:center;vertical-align:middle">AWQ</td>
+    <td style="text-align:center;vertical-align:middle">&#10005;</td>
+  </tr>
+  <tr>
+    <td style="text-align:center;vertical-align:middle">GPTQ</td>
+    <td style="text-align:center;vertical-align:middle">&#10004;</td>
+  </tr>
+  <tr>
+    <td style="text-align:center;vertical-align:middle">TEQ</td>
+    <td style="text-align:center;vertical-align:middle">&#10005;</td>
+  </tr>
+</tbody>
+</table>
+
+## Examples
+
+#### PyTorch framework example
+
+```python
+from neural_compressor import PostTrainingQuantConfig, quantization
+from neural_compressor.adaptor.torch_utils.layer_wise_quant import load_empty_model
+
+fp32_model = load_empty_model(model_name_or_path, torchscript=True)
+conf = PostTrainingQuantConfig(
+    approach="weight_only",
+    recipes={
+        "layer_wise_quant": True,
+        "rtn_args": {"enable_full_range": True},
+    },
+)
+
+q_model = quantization.fit(
+    fp32_model,
+    conf,
+    calib_dataloader=eval_dataloader,
+    eval_func=lambda x: 0.1,
+)
+output_dir = "./saved_model"
+q_model.save(output_dir)
+q_model = load(output_dir, fp32_model, weight_only=True, layer_wise=True)
+```
+
+#### ONNX Runtime framework example
+
+```python
+from neural_compressor import quantization, PostTrainingQuantConfig
+
+conf = PostTrainingQuantConfig(recipes={"layer_wise_quant": True})
+q_model = quantization.fit(fp32_model_path, conf, calib_dataloader=dataloader)
+q_model.save(int8_model_path)
+```
+
+Refer to [ONNX Runtime llama-2 LWQ example](../../examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static)
diff --git a/docs/source/quantization_weight_only.md b/docs/source/quantization_weight_only.md
@@ -7,9 +7,7 @@ Weight Only Quantization (WOQ)
 
 3. [Examples](#examples)
 
-4. [Layer Wise Quantization](#layer-wise-quantization)
-
-5. [WOQ Algorithms Tuning](#woq-algorithms-tuning)
+4. [WOQ Algorithms Tuning](#woq-algorithms-tuning)
 
 
 ## Introduction
@@ -143,50 +141,6 @@ The saved_results folder contains two files: `best_model.pt` and `qconfig.json`,
 
 To seek the performance of weight-only quantized models, Please go to [Intel Extension for Transformers](https://github.com/intel/intel-extension-for-transformers/tree/main/examples/huggingface/pytorch/text-generation/quantization#1-performance) to quantize and deploy the model.
 
-
-## Layer Wise Quantization
-
-Large language models (LLMs) have shown exceptional performance across various tasks, meanwhile, the substantial parameter size poses significant challenges for deployment. Layer-wise quantization(LWQ) can greatly reduce the memory footprint of LLMs, usually 80-90% reduction, which means that users can quantize LLMs even on single node using GPU or CPU.  We can quantize the model under memory-constrained devices, therefore making the huge-sized LLM quantization possible.
-
-<img src="./imgs/lwq.png">
-
-*Figure 1: The process of layer-wise quantization. The color grey means empty parameters and the color blue represents parameters need to be quantized. Every rectangle inside model represents one layer.*
-
-### Supported Matrix
-
-| Algorithms/Framework |   PyTorch  |
-|:--------------:|:----------:|
-|       RTN      |  &#10004;  | 
-|       AWQ      |  &#10005;  |
-|      GPTQ      | &#10004; | 
-|      TEQ      | &#10005; |
-
-### Example
-```python
-from neural_compressor import PostTrainingQuantConfig, quantization
-from neural_compressor.adaptor.torch_utils.layer_wise_quant import load_empty_model
-
-fp32_model = load_empty_model(model_name_or_path, torchscript=True)
-conf = PostTrainingQuantConfig(
-    approach="weight_only",
-    recipes={
-        "layer_wise_quant": True,
-        "rtn_args": {"enable_full_range": True},
-    },
-)
-
-q_model = quantization.fit(
-    fp32_model,
-    conf,
-    calib_dataloader=eval_dataloader,
-    eval_func=lambda x: 0.1,
-)
-ouput_dir = "./saved_model"
-q_model.save(ouput_dir)
-q_model = load(ouput_dir, fp32_model, weight_only=True, layer_wise=True)
-```
-
-
 ## WOQ Algorithms Tuning
 
 To find the best algorithm, users can omit specifying a particular algorithm. In comparison to setting a specific algorithm, this tuning process will traverse through a set of pre-defined WOQ configurations and identify the optimal one with the best result. For details usage, please refer to the [tuning strategy](./tuning_strategies.md#Basic).

diff --git a/docs/source/user_guide.md b/docs/source/user_guide.md
@@ -81,9 +81,10 @@ This part provides the advanced topics that help user dive deep into Intel® Neu
 <td colspan="4" align="center"><a href="add_new_adaptor.md">Add New Adaptor</a></td>
 </tr>
 <tr>
-<td colspan="4" align="center"><a href="distillation_quantization.md">Distillation for Quantization</a></td>
-<td colspan="4" align="center"><a href="smooth_quant.md">SmoothQuant</a></td>
-<td colspan="4" align="center"><a href="quantization_weight_only.md">Weight-Only Quantization</a></td>
+<td colspan="3" align="center"><a href="distillation_quantization.md">Distillation for Quantization</a></td>
+<td colspan="3" align="center"><a href="smooth_quant.md">SmoothQuant</a></td>
+<td colspan="3" align="center"><a href="quantization_weight_only.md">Weight-Only Quantization</a></td>
+<td colspan="3" align="center"><a href="quantization_layer_wise.md">Layer-Wise Quantization</a></td>
 </tr>
 </tbody>
 </table>

diff --git a/examples/.config/model_params_onnxrt.json b/examples/.config/model_params_onnxrt.json
@@ -763,6 +763,13 @@
       "main_script": "main.py",
       "batch_size": 1
     },
+    "llama-2-7b-lwq": {
+      "model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/ptq_static",
+      "dataset_location": "",
+      "input_model": "/tf_dataset2/models/onnx/llama-2-7b",
+      "main_script": "main.py",
+      "batch_size": 1
+    },
     "llama-2-7b-rtn": {
       "model_src_dir": "nlp/huggingface_model/text_generation/llama/quantization/weight_only",
       "dataset_location": "",

diff --git a/...t/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/README.md b/...t/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/README.md
@@ -27,13 +27,15 @@ Note that this README.md uses meta-llama/Llama-2-7b-hf as an example. There are
 
 Export to ONNX model:
 ```bash
-optimum-cli export onnx --model meta-llama/Llama-2-7b-hf --task text-generation-with-past ./Llama-2-7b-hf
+python prepare_model.py  --input_model="meta-llama/Llama-2-7b-hf" --output_model="./llama-2-7b-hf"
 ```
 
 # Run
 
 ## 1. Quantization
 
+### Run SmoothQuant
+
 ```bash
 bash run_quant.sh --input_model=/path/to/model \ # folder path of onnx model
                   --output_model=/path/to/model_tune \ # folder path to save onnx model
@@ -44,6 +46,20 @@ bash run_quant.sh --input_model=/path/to/model \ # folder path of onnx model
                   --quant_format="QOperator" # or QDQ, optional
 ```
 
+### Run layer-wise quantization
+Set `--layer_wise=True` to use layer-wise quantization and reduce memory usage. Please note that layer-wise quantization for ONNX models is still under development and only supports W8A8 quantization now. For more details, please refer to [layer-wise quantization](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_layer_wise.md).
+
+```bash
+bash run_quant.sh --input_model=/path/to/model \ # folder path of onnx model
+                  --output_model=/path/to/model_tune \ # folder path to save onnx model
+                  --batch_size=batch_size \ # optional
+                  --dataset NeelNanda/pile-10k \
+                  --tokenizer=meta-llama/Llama-2-7b-hf \ # model name or folder path containing all relevant files for model's tokenizer
+                  --quant_format="QOperator" \ # or QDQ, optional
+                  --layer_wise=True
+```
+
+
 ## 2. Benchmark
 
 Accuracy:

diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py
@@ -17,6 +17,7 @@
 # pylint:disable=redefined-outer-name,logging-format-interpolation
 import os
 import onnx
+import json
 import torch
 import logging
 import argparse
@@ -116,6 +117,11 @@
     type=int,
     default=4
 )
+parser.add_argument(
+    '--layer_wise',
+    action='store_true', \
+    default=False,
+)
 args = parser.parse_args()
 
 # load model
@@ -131,16 +137,26 @@ def benchmark(model):
     config = LlamaConfig.from_pretrained(args.model_path)
     sess_options = ort.SessionOptions()
     sess_options.intra_op_num_threads = args.intra_op_num_threads
-    sessions = ORTModelForCausalLM.load_model(
-            os.path.join(model, 'decoder_model.onnx'), 
-            os.path.join(model, 'decoder_with_past_model.onnx'), 
+
+    if os.path.exists(os.path.join(model, "decoder_with_past_model.onnx")):
+        sessions = ORTModelForCausalLM.load_model(  # pylint: disable=E1123
+            os.path.join(model, "decoder_model.onnx"),
+            os.path.join(model, "decoder_with_past_model.onnx"),
             session_options=sess_options)
-    model = ORTModelForCausalLM(
-                sessions[0],
-                config, 
-                model, 
-                sessions[1],
-                use_cache=True)
+        model = ORTModelForCausalLM(sessions[0],  # pylint: disable=E1121
+                                    config,
+                                    model,
+                                    sessions[1],
+                                    use_cache=True)
+    else:
+        sessions = ORTModelForCausalLM.load_model(  # pylint: disable=E1123
+            os.path.join(model, "decoder_model.onnx"),
+            session_options=sess_options)
+        model = ORTModelForCausalLM(sessions[0],  # pylint: disable=E1121
+                                    config,
+                                    model,
+                                    use_cache=False,
+                                    use_io_binding=False)
 
     input_tokens = '32'
     max_new_tokens = 32
@@ -173,23 +189,50 @@ def benchmark(model):
             total_time += toc - tic
 
     print("\n", "-" * 10, "Summary:", "-" * 10)
-    latency = total_time / (num_iter - num_warmup)
     print(args)
-    print("Inference latency: %.3f sec." % latency)
+    throughput = (num_iter - num_warmup) / total_time
+    print("Throughput: {} samples/s".format(throughput))
+
+
+def replace_architectures(json_path):
+    # overwrite the 'architectures' field of the config file with ['LlamaForCausalLM']
+    # to avoid bug 'Tokenizer class LLaMATokenizer does not exist or is not currently imported.'
+    # refer to https://github.com/huggingface/transformers/issues/22222#issuecomment-1477171703
+    with open(json_path, "r") as file:
+        data = json.load(file)
+        data["architectures"] = ["LlamaForCausalLM"]
+
+    with open(json_path, 'w') as file:
+        json.dump(data, file, indent=4)
 
 def eval_func(model):
+    model_dir = model
+    if isinstance(model, str) and model.endswith(".onnx"):
+        model_dir = os.path.dirname(model)
+
+    replace_architectures(os.path.join(model_dir, "config.json"))
+
     results = evaluate(
         model="hf-causal",
-        model_args='pretrained=' + model + ',tokenizer='+ args.tokenizer,
+        model_args="pretrained=" + model_dir + ",tokenizer="+ args.tokenizer,
         batch_size=args.batch_size,
         tasks=args.tasks,
-        model_format="onnx"
+        model_format="onnx",
     )
+
+    eval_acc = 0
     for task_name in args.tasks:
         if task_name == "wikitext":
             print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity"]))
+            eval_acc += results["results"][task_name]["word_perplexity"]
         else:
             print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc"]))
+            eval_acc += results["results"][task_name]["acc"]
+
+    if len(args.tasks) != 0:
+        eval_acc /= len(args.tasks)
+
+    return eval_acc
 
 class KVDataloader:
     def __init__(self, model_path, pad_max=196, batch_size=1, sub_folder='train'):
@@ -258,15 +301,36 @@ def __iter__(self):
 
     if args.tune:
         from neural_compressor import quantization, PostTrainingQuantConfig
-        config = PostTrainingQuantConfig(
-            calibration_sampling_size=[8],
-            recipes={'optypes_to_exclude_output_quant': ['MatMul'],
-                     'smooth_quant': True,
-                     'smooth_quant_args': {'alpha': args.smooth_quant_alpha}},
-            op_type_dict={'^((?!(MatMul|Gather|Conv)).)*$': {'weight': {'dtype': ['fp32']}, 'activation': {'dtype': ['fp32']}}})
-        for model in ['decoder_model.onnx', 'decoder_with_past_model.onnx']:
-            q_model = quantization.fit(
-                    os.path.join(args.model_path, model),
-                    config,
-                    calib_dataloader=KVDataloader(os.path.join(args.model_path, model), pad_max=args.pad_max, batch_size=1))
-            q_model.save(os.path.join(args.output_model, model))
+        if args.layer_wise:
+            # layer-wise quantization for ONNX models is still under development and only supports W8A8 quantization now
+            config = PostTrainingQuantConfig(
+                calibration_sampling_size=[8],
+                recipes={'optypes_to_exclude_output_quant': ['MatMul'],
+                        'layer_wise_quant': True},
+                op_type_dict={'^((?!(MatMul|Gather|Conv)).)*$': {'weight': {'dtype': ['fp32']}, 'activation': {'dtype': ['fp32']}}})
+            for model in ['decoder_model.onnx']:
+                # only test decoder_model
+                q_model = quantization.fit(
+                        os.path.join(args.model_path, model),
+                        config,
+                        calib_dataloader=KVDataloader(os.path.join(args.model_path, model), pad_max=args.pad_max, batch_size=1))
+                q_model.save(os.path.join(args.output_model, model))
+
+            tokenizer.save_pretrained(args.output_model)
+
+        else:
+            config = PostTrainingQuantConfig(
+                calibration_sampling_size=[8],
+                recipes={'optypes_to_exclude_output_quant': ['MatMul'],
+                        'smooth_quant': True,
+                        'smooth_quant_args': {'alpha': args.smooth_quant_alpha},
+                        },
+                op_type_dict={'^((?!(MatMul|Gather|Conv)).)*$': {'weight': {'dtype': ['fp32']}, 'activation': {'dtype': ['fp32']}}})
+            for model in ['decoder_model.onnx', 'decoder_with_past_model.onnx']:
+                q_model = quantization.fit(
+                        os.path.join(args.model_path, model),
+                        config,
+                        calib_dataloader=KVDataloader(os.path.join(args.model_path, model), pad_max=args.pad_max, batch_size=1))
+                q_model.save(os.path.join(args.output_model, model))
+
+            tokenizer.save_pretrained(args.output_model)