@@ -250,23 +250,6 @@ def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: i
         return False
 
     def write_tensors(self):
-        # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
-        def np_fp32_to_bf16(n: np.ndarray):
-            # force nan to quiet
-            n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
-            # flush subnormals to zero
-            n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
-            # round to nearest even
-            n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
-            return n.astype(np.int16)
-
-        # Doing this row-wise is much, much faster than element-wise, hence the signature
-        v_fp32_to_bf16 = np.vectorize(np_fp32_to_bf16, otypes=[np.int16], signature="(n)->(n)")
-        if self.lazy:
-            # TODO: find a way to implicitly wrap np.vectorize functions
-            # NOTE: the type is changed to reflect otypes passed to np.vectorize above
-            v_fp32_to_bf16 = gguf.LazyNumpyTensor._wrap_fn(v_fp32_to_bf16, meta_noop=np.int16)
-
         max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
 
         for name, data_torch in self.get_tensors():
@@ -319,27 +302,31 @@ def np_fp32_to_bf16(n: np.ndarray):
                 ))
 
                 if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
-                    if self.ftype == gguf.LlamaFileType.MOSTLY_F16:
+                    if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data = gguf.quantize_bf16(data)
+                        assert data.dtype == np.int16
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
+                        data = gguf.quantize_q8_0(data)
+                        assert data.dtype == np.uint8
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+
+                    else:  # default to float16 for quantized tensors
                         if data_dtype != np.float16:
                             data = data.astype(np.float16)
                         data_qtype = gguf.GGMLQuantizationType.F16
 
-                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
-                        if data_dtype != np.float32:
-                            data = data.astype(np.float32)
-                        data = v_fp32_to_bf16(data.view(np.int32))
-                        assert data.dtype == np.int16
-                        data_qtype = gguf.GGMLQuantizationType.BF16
-
-                    else:  # by default, convert to float32
+                if data_qtype is None:  # by default, convert to float32
                     if data_dtype != np.float32:
                         data = data.astype(np.float32)
                     data_qtype = gguf.GGMLQuantizationType.F32
 
-                assert data_qtype is not None
-
+                block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
                 # reverse shape to make it similar to the internal ggml dimension order
-                shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
+                shape_str = f"""{{{', '.join(str(n) for n in reversed(
+                    (*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
+                )}}}"""
 
                 # n_dims is implicit in the shape
                 logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
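Note on the new shape_str expression above: once a tensor has been quantized, the numpy array holds packed bytes, so its last dimension no longer equals the logical element count. The expression recovers the logical width by converting the last dimension to bytes (itemsize) and rescaling by the block layout taken from GGML_QUANT_SIZES. A minimal sketch of that arithmetic, assuming ggml's layouts of 34 bytes per 32-element Q8_0 block and 2 bytes per BF16 element (the helper name below is only illustrative, not part of the patch):

import numpy as np

def logical_last_dim(packed_last_dim: int, itemsize: int, block_size: int, type_size: int) -> int:
    # same expression as in write_tensors: row bytes // bytes-per-block * elements-per-block
    return packed_last_dim * itemsize // type_size * block_size

# Q8_0: a 4096-wide float32 row packs into 4096 // 32 * 34 = 4352 uint8 bytes
assert logical_last_dim(4352, np.uint8().itemsize, block_size=32, type_size=34) == 4096
# BF16: stored as int16, one element per 2-byte item
assert logical_last_dim(4096, np.int16().itemsize, block_size=1, type_size=2) == 4096
# F32: unquantized, the expression is the identity
assert logical_last_dim(4096, np.float32().itemsize, block_size=1, type_size=4) == 4096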
@@ -881,6 +868,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_head_count(head_count)
         self.gguf_writer.add_head_count_kv(head_count_kv)
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_file_type(self.ftype)
 
         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
             if self.hparams["rope_scaling"].get("type") == "linear":
@@ -1003,6 +991,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_head_count(head_count)
         self.gguf_writer.add_head_count_kv(head_count_kv)
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_file_type(self.ftype)
 
         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
             if self.hparams["rope_scaling"].get("type") == "linear":
@@ -1237,6 +1226,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
         self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
         self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
+        self.gguf_writer.add_file_type(self.ftype)
 
     _q_norms: list[dict[str, Tensor]] | None = None
     _k_norms: list[dict[str, Tensor]] | None = None
@@ -1613,6 +1603,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
         self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
 
 
 @Model.register("Qwen2ForCausalLM")
@@ -1850,6 +1841,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_head_count(hparams["num_attention_heads"])
         self.gguf_writer.add_head_count_kv(5)  # hparams["num_key_value_heads"]) is wrong
         self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+        self.gguf_writer.add_file_type(self.ftype)
 
     def shuffle_attn_q_weight(self, data_torch):
         assert data_torch.size() == (5120, 5120)
@@ -2029,6 +2021,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
+        self.gguf_writer.add_file_type(self.ftype)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         num_heads = self.hparams["num_attention_heads"]
@@ -2437,25 +2430,15 @@ class LazyTorchTensor(gguf.LazyBase):
     def numpy(self) -> gguf.LazyNumpyTensor:
         dtype = self._dtype_map[self.dtype]
         return gguf.LazyNumpyTensor(
-            meta=np.lib.stride_tricks.as_strided(np.zeros(1, dtype), self.shape, (0 for _ in self.shape)),
+            meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
             lazy=self._lazy,
             args=(self,),
             func=(lambda s: s[0].numpy())
         )
 
     @classmethod
-    def eager_to_meta(cls, t: Tensor) -> Tensor:
-        if t.is_meta:
-            return t
-        return t.detach().to("meta")
-
-    @classmethod
-    def meta_with_dtype(cls, m: Tensor, dtype: torch.dtype) -> Tensor:
-        m = m.detach()
-        if not m.is_meta:
-            m = m.to("meta")
-        m.dtype = dtype
-        return m
+    def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor:
+        return torch.empty(size=shape, dtype=dtype, device="meta")
 
     @classmethod
     def __torch_function__(cls, func, types, args=(), kwargs=None):
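The replacement meta_with_dtype_and_shape above leans on PyTorch's "meta" device: such tensors carry shape and dtype but allocate no storage, which is all the lazy conversion path needs in order to plan tensor names, types, and sizes before any real data is read. A small sketch of that behaviour, assuming only stock PyTorch:

import torch

# A meta tensor knows its shape and dtype but owns no data.
m = torch.empty(size=(4096, 4096), dtype=torch.float16, device="meta")
assert m.is_meta and m.dtype == torch.float16 and tuple(m.shape) == (4096, 4096)

# Shape-level ops still work without touching real memory.
r = m.reshape(-1)
assert r.is_meta and tuple(r.shape) == (4096 * 4096,)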
@@ -2486,8 +2469,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
    )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -2545,6 +2528,7 @@ def main() -> None:
         "f32": gguf.LlamaFileType.ALL_F32,
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
+        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
         "auto": gguf.LlamaFileType.GUESSED,
     }
 
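Taken together with the new "q8_0" entry in ftype_map, the extended --outtype choice means a direct Q8_0 conversion can be requested straight from the converter, for example (model and output paths here are only illustrative):

python convert-hf-to-gguf.py ./my-hf-model --outtype q8_0 --outfile ./my-model-Q8_0.gguf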