You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
# --- Evaluation / quantization CLI arguments ---
# NOTE(review): reconstructed from a diff view; duplicate pre-diff lines and
# diff-marker residue removed so the block parses as Python.
parser.add_argument('--limit', type=int, default=None, help='Number of eval samples to evaluate')
# Fixed: original read `type=lambdax:` (missing space) — a SyntaxError.
# Maps strings like "torch.bfloat16" to the corresponding torch dtype attribute.
parser.add_argument('--precision', type=lambda x: getattr(torch, x.split(".")[-1]), default=torch.bfloat16, help='dtype precision to use')
parser.add_argument('--device', type=str, default="cuda", help='Device to use for evaluation')
# Post-diff version kept; pre-diff duplicate (no `choices`) dropped to avoid
# argparse.ArgumentError from registering `-q` twice. Typo "quantizatoin" fixed.
parser.add_argument('-q', '--quantization', default="None", choices=["2", "3", "4", "5", "6", "8", "MP_llama3", "None"], help='Which quantization technique to apply, choose from ["2", "3", "4", "5", "6", "8"] for uniform quantization, choose "MP_llama3" for mixed-precision for Llama3 and need to set corresponding sensi_bit and non_sensi_bit, choose "None" for no quantization')
parser.add_argument('--compile', action='store_true', help='Whether to compile the model.')
parser.add_argument('--batch_size', type=int, default=1, help='Batch size to use for evaluation, note int8wo and int4wo work best with small batchsizes, int8dq works better with large batchsizes')
parser.add_argument('--max_length', type=int, default=None, help='Length of text to process at one time')
# Mixed-precision ("MP_llama3") settings: per-layer bit widths for sensitive
# vs. non-sensitive layers; 16 means "leave at full/half precision".
parser.add_argument('--sensi_bit', type=int, default=16, help='Bit setting for sensitive layers')
parser.add_argument('--non_sensi_bit', type=int, default=16, help='Bit setting for non-sensitive layers')
parser.add_argument('--quant_sym', type=str, default="asym", help='symmetric or asymmetric quantization')
parser.add_argument('--group_size', type=int, default=32, help='group size to perform quantization on')
Apply int N-bit weight-only quantization to a linear layer.

Args:
    `groupsize`: parameter for quantization, controls the granularity of quantization; a smaller size is more fine-grained. Choices are [512, 256, 128, 64, 32].
    `n`: number of bits to quantize to. Choices are [8, 6, 5, 4, 3, 2].
0 commit comments