
Commit aadda53

remove experiment scripts

1 parent 8e1d8d2 commit aadda53

3 files changed: +98 -62 lines changed
Lines changed: 46 additions & 0 deletions

@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+
+import os
+import sys
+# append the path to the naive_intNwo.py file
+sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), "torchao/quantization/prototype/mixed_precision/scripts"))
+from naive_intNwo import intN_weight_only
+
+from torchao.quantization import quantize_, int8_weight_only, int4_weight_only
+
+from torchao.quantization.utils import (
+    _apply_logging_hook,
+    compute_error,
+    compute_error as SQNR,
+    _fqn_to_op_to_shape_to_count,
+    LoggingTensorMode,
+)
+
+def test_weight_only_quant(quantization_bit=2, symmetric=False):
+    for x_shape in [[2, 4], [5, 5, 5, 4], [1, 4, 4]]:
+        x = torch.randn(*x_shape)
+        m = nn.Sequential(nn.Linear(4, 5))
+        y_ref = m(x)
+        quantize_(m, intN_weight_only(n=quantization_bit, group_size=2, symmetric=symmetric))
+        y_wo = m(x)
+        sqnr = compute_error(y_ref, y_wo)
+        print(sqnr)
+        assert sqnr > 44.0, "sqnr: {} is too low".format(sqnr)
+
+
+# test if the asymmetric and symmetric quantization API works with different bit widths
+for i in range(2, 9):
+    # test for asymmetric quantization
+    try:
+        test_weight_only_quant(i, False)
+        print(f"Test passed for {i}-bit using naive intNwo asymmetric quantization implementation")
+    except Exception as e:
+        print(f"Exception handled in test loop for {i}-bit asymmetric quantization. Details: {e}")
+
+    # test for symmetric quantization
+    try:
+        test_weight_only_quant(i, True)
+        print(f"Test passed for {i}-bit using naive intNwo symmetric quantization implementation")
+    except Exception as e:
+        print(f"Exception handled in test loop for {i}-bit symmetric quantization. Details: {e}")
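For reference on the assertion above, a minimal sketch of the SQNR metric, assuming compute_error reports the signal-to-quantization-noise ratio in decibels (the helper name sqnr_db below is illustrative, not a torchao API):

import torch

def sqnr_db(reference: torch.Tensor, quantized_output: torch.Tensor) -> torch.Tensor:
    # ratio of reference signal energy to quantization error energy, in dB
    error = reference - quantized_output
    return 20 * torch.log10(torch.linalg.norm(reference) / torch.linalg.norm(error))

Under this reading, the 44.0 dB threshold requires the quantization error to be roughly 160x smaller than the reference output in L2 norm.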
Lines changed: 18 additions & 48 deletions

@@ -1,7 +1,7 @@
 import torch
 import torch.nn as nn
 
-from naive_intNwo import intN_weight_only_asym, intN_weight_only_sym
+from naive_intNwo import intN_weight_only
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from lm_eval.models.huggingface import HFLM
@@ -28,63 +28,33 @@ def run_evaluation(repo_id, tasks, limit, device, precision, quantization, compi
     tokenizer = AutoTokenizer.from_pretrained(repo_id)
     model = AutoModelForCausalLM.from_pretrained(repo_id).to(device="cpu", dtype=precision)
 
-    if quantization == "int8dq":
-        quantize_(model.to(device=device), int8_dynamic_activation_int4_weight())
-
-    elif quantization == "int8wo":
-        quantize_(model.to(device=device), int8_weight_only())
-
-    elif quantization == "int4wo":
-        quantize_(model.to(device=device), int4_weight_only(group_size=group_size))
-
-    elif quantization == "autoquant":
+    if quantization == "autoquant":
         model = autoquant(model.to(device=device))
 
     # naive implementation of uniform precision quantization all layers
     elif quantization in ["2","3","4","5","6","8"]:
-        if quant_sym == "asym":
-            quantize_(model.to(device=device), intN_weight_only_asym(n=int(quantization), group_size=group_size))
-        elif quant_sym == "sym":
-            quantize_(model.to(device=device), intN_weight_only_sym(n=int(quantization), group_size=group_size))
-
+        quantize_(model.to(device=device), intN_weight_only(n=int(quantization), group_size=group_size, symmetric=quant_sym))
+
+    # mix precision quantization for Llama3
     elif quantization == "MP_llama3":
 
-        # filter for sensitive layers
+        # filter for sensitive layers (the first 3 and last 2 layers for Llama3)
         def filter_fn_sen(child: torch.nn.Module, cur_fqn:str) -> bool:
            return isinstance(child, nn.Linear) and any(skiplayer in cur_fqn for skiplayer in ['.0.', '.1.', '.2.', '.30.', '.31.'])
 
-        # filter for non-sensitive layers
+        # filter for non-sensitive layers (other 27 layers for Llama3)
         def filter_fn_nonsen(child: torch.nn.Module, cur_fqn:str) -> bool:
             return isinstance(child, nn.Linear) and not(any(skiplayer in cur_fqn for skiplayer in ['.0.', '.1.', '.2.', '.30.', '.31.']))
 
+        # quantize the sensitive layers
         if sensi_bit != 16:
-            # quantize the sensitive layers
-            if sensi_bit == 8:
-                quantize_(model.to(device=device), int8_weight_only(), filter_fn_sen)
-            elif sensi_bit == 4:
-                quantize_(model.to(device=device), int4_weight_only(group_size=group_size), filter_fn_sen)
-            elif sensi_bit in [6,5,3,2]:
-                if quant_sym == "asym":
-                    quantize_(model.to(device=device), intN_weight_only_asym(n=sensi_bit, group_size=group_size), filter_fn_sen)
-                elif quant_sym == "sym":
-                    quantize_(model.to(device=device), intN_weight_only_sym(n=sensi_bit, group_size=group_size), filter_fn_sen)
+            quantize_(model.to(device=device), intN_weight_only(n=sensi_bit, group_size=group_size, symmetric=quant_sym), filter_fn_sen)
 
         # quantize the less-sensitive layers
-        if non_sensi_bit == 8:
-            quantize_(model.to(device=device), int8_weight_only(), filter_fn_nonsen)
-        elif non_sensi_bit == 4:
-            quantize_(model.to(device=device), int4_weight_only(group_size=group_size), filter_fn_nonsen)
-        elif non_sensi_bit in [6,5,3,2]:
-            if sensi_bit == 4:
-                if quant_sym == "asym":
-                    quantize_(model, intN_weight_only_asym(n=non_sensi_bit, group_size=group_size), filter_fn_nonsen)
-                elif quant_sym == "sym":
-                    quantize_(model, intN_weight_only_sym(n=non_sensi_bit, group_size=group_size), filter_fn_nonsen)
-            else:
-                if quant_sym == "asym":
-                    quantize_(model.to(device=device), intN_weight_only_asym(n=non_sensi_bit, group_size=group_size), filter_fn_nonsen)
-                elif quant_sym == "sym":
-                    quantize_(model.to(device=device), intN_weight_only_sym(n=non_sensi_bit, group_size=group_size), filter_fn_nonsen)
+        if sensi_bit == 4:
+            quantize_(model, intN_weight_only(n=non_sensi_bit, group_size=group_size, symmetric=quant_sym), filter_fn_nonsen)
+        else:
+            quantize_(model.to(device=device), intN_weight_only(n=non_sensi_bit, group_size=group_size, symmetric=quant_sym), filter_fn_nonsen)
 
     if compile:
         model = torch.compile(model, mode="max-autotune", fullgraph=True)
@@ -113,13 +83,13 @@ def filter_fn_nonsen(child: torch.nn.Module, cur_fqn:str) -> bool:
     parser.add_argument('--limit', type=int, default=None, help='Number of eval samples to evaluate')
     parser.add_argument('--precision', type=lambda x: getattr(torch, x.split(".")[-1]), default=torch.bfloat16, help='dtype precision to use')
     parser.add_argument('--device', type=str, default="cuda", help='Device to use for evaluation')
-    parser.add_argument('-q', '--quantization', default = "None", help='Which quantization technique to apply')
+    parser.add_argument('-q', '--quantization', default = "None", choices = ["2", "3", "4", "5", "6", "8", "MP_llama3", "None"], help='Which quantization technique to apply, choose from ["2", "3", "4", "5", "6", "8"] for uniform quantizatoin, choose "MP_llama3" for mixed-precision for Llama3 and need to set corresponding sensi_bit and non_sensi_bit, choose "None" for no quantization')
     parser.add_argument('--compile', action='store_true', help='Whether to compile the model.')
     parser.add_argument('--batch_size', type=int, default=1, help='Batch size to use for evaluation, note int8wo and int4wo work best with small batchsizes, int8dq works better with large batchsizes')
     parser.add_argument('--max_length', type=int, default=None, help='Length of text to process at one time')
-    parser.add_argument('--sensi_bit', type=int, default=16, help='Bit setting for sensitive layers')
-    parser.add_argument('--non_sensi_bit', type=int, default=16, help='Bit setting for non-sensitive layers')
-    parser.add_argument('--quant_sym', type=str, default="asym", help='symmetric or asymmetric quantization')
-    parser.add_argument('--group_size', type=int, default=32, help='group size to perform quantization on')
+    parser.add_argument('--sensi_bit', type=int, default=16, choices = [16, 8, 6, 5, 4, 3], help='Bit setting for sensitive layers')
+    parser.add_argument('--non_sensi_bit', type=int, default=8, choices = [8, 6, 5, 4, 3, 2], help='Bit setting for non-sensitive layers')
+    parser.add_argument('--quant_sym', type=bool, default=False, help='Symmetric or asymmetric quantization, asymmetric by default')
+    parser.add_argument('--group_size', type=int, default=32, help='Group size to perform quantization on')
     args = parser.parse_args()
     run_evaluation(args.repo_id, args.tasks, args.limit, args.device, args.precision, args.quantization, args.compile, args.batch_size, args.max_length, args.sensi_bit, args.non_sensi_bit, args.quant_sym, args.group_size)
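To make the filter-based mixed-precision flow above concrete, here is a minimal sketch using only the public torchao APIs already referenced in this diff (quantize_, int8_weight_only); the toy module and the choice of which layer counts as sensitive are illustrative, not the eval script's defaults:

import torch
import torch.nn as nn
from torchao.quantization import quantize_, int8_weight_only

# toy stack whose linear layers get fully-qualified names like layers.0.0, layers.1.0, ...
model = nn.ModuleDict({"layers": nn.ModuleList([nn.Sequential(nn.Linear(64, 64)) for _ in range(4)])})

# treat layer 0 as "sensitive": the filter matches only its nn.Linear modules
def filter_fn_sen(child: torch.nn.Module, cur_fqn: str) -> bool:
    return isinstance(child, nn.Linear) and ".0." in cur_fqn

# only the layers selected by the filter get their weights quantized
quantize_(model, int8_weight_only(), filter_fn_sen)

The eval script follows the same pattern twice: one quantize_ call with filter_fn_sen and the sensi_bit configuration, then a second call with the complementary filter_fn_nonsen and the lower non_sensi_bit configuration.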

torchao/quantization/prototype/mixed_precision/scripts/naive_intNwo.py

Lines changed: 34 additions & 14 deletions

@@ -5,13 +5,26 @@
     ZeroPointDomain,
 )
 
-def intN_weight_only_asym(group_size=32, n=8):
+from torchao.quantization import int8_weight_only, int4_weight_only
+
+
+def intN_weight_only(group_size=32, n=8, symmetric=False):
+    '''
+    Apply int N-bit weight only quantization to a linear layer.
+    Args:
+        `groupsize`: parameter for quantization, controls the granularity of quantization, smaller size is more fine grained, choices are [512, 256, 128, 64, 32]
+        `n`: number of bits to quantize to, choices are [8, 6, 5, 4, 3, 2]
+    Usage:
+        from torchao.quantization import quantize_
+        quantize_(model, intN_weight_only(n=your_bit_choice, group_size=group_size), optional_filter_func_for_desired_layers_to_quantize)
+    '''
+    # for asymmetric quantization
     def apply_intN_weight_only_quant_asym(weight):
-        # avoid circular dep
+        # avoid circular dependency
         from torchao.dtypes import to_affine_quantized
         mapping_type = MappingType.ASYMMETRIC
         block_size = (1, group_size)
-        target_dtype = torch.int8
+        target_dtype = torch.uint8
         quant_min = 0
         quant_max = 2**n-1
         eps = 1e-6
@@ -20,21 +33,28 @@ def apply_intN_weight_only_quant_asym(weight):
         zero_point_domain = ZeroPointDomain.FLOAT
         return to_affine_quantized(weight, mapping_type, block_size, target_dtype, quant_min, quant_max, eps, zero_point_dtype=zero_point_dtype, preserve_zero=preserve_zero,zero_point_domain=zero_point_domain)
 
-    return apply_intN_weight_only_quant_asym
-
-def intN_weight_only_sym(group_size=32, n=8):
+    # for symmetric quantization
     def apply_intN_weight_only_quant_sym(weight):
-        # avoid circular dep
+        # avoid circular dependency
         from torchao.dtypes import to_affine_quantized
         mapping_type = MappingType.SYMMETRIC
         block_size = (1, group_size)
         target_dtype = torch.int8
-        quant_min = -2**(n-1)
-        quant_max = 2**(n-1)-1
         eps = 1e-6
-        preserve_zero = True
-        zero_point_dtype = torch.bfloat16
-        zero_point_domain = ZeroPointDomain.INT
-        return to_affine_quantized(weight, mapping_type, block_size, target_dtype, quant_min, quant_max, eps, zero_point_dtype=zero_point_dtype, preserve_zero=preserve_zero,zero_point_domain=zero_point_domain)
+        zero_point_dtype = torch.int64
+        return to_affine_quantized(weight, mapping_type, block_size, target_dtype, eps=eps, zero_point_dtype=zero_point_dtype)
 
-    return apply_intN_weight_only_quant_sym
+    try:
+        assert n in [8, 6, 5, 4, 3, 2], "n must be one of [8, 6, 5, 4, 3, 2]"
+        if n == 8:
+            return int8_weight_only()
+        elif n == 4:
+            return int4_weight_only(group_size=group_size)
+        else:
+            if symmetric:
+                return apply_intN_weight_only_quant_sym
+            else:
+                return apply_intN_weight_only_quant_asym
+    except Exception as e:
+        raise
+
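As a usage note, the refactored intN_weight_only dispatches on n: 8 and 4 fall through to the existing int8_weight_only and int4_weight_only configs, while the remaining bit widths return the nested asymmetric or symmetric apply function depending on the symmetric flag. A minimal sketch, assuming naive_intNwo.py is importable (for example by appending its directory to sys.path, as the test script above does); the 3-bit setting and toy model are illustrative:

import torch.nn as nn
from torchao.quantization import quantize_
from naive_intNwo import intN_weight_only

model = nn.Sequential(nn.Linear(64, 64))

# 3-bit asymmetric weight-only quantization, grouped along the input dimension
quantize_(model, intN_weight_only(n=3, group_size=32, symmetric=False))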
