@@ -1089,6 +1089,8 @@ def __init__(self, *args, **kwargs):
             raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION")
 
         # get n_embd of the text model
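+        # some configs have no "text_config" at all; default to {} so the merge below falls back to top-level keys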
+        if "text_config" not in self.hparams:
+            self.hparams["text_config"] = {}
         text_config = {**self.hparams, **self.hparams["text_config"]}
         self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
         assert self.n_embd_text > 0, "n_embd not found in hparams"
@@ -2583,6 +2585,82 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
+class Qwen2VLVisionModel(VisionModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hparams["image_size"] = self.hparams.get("image_size", 560)
+        # rename config.json values
+        self.hparams["num_attention_heads"] = self.hparams.get("num_heads")
+        self.hparams["num_hidden_layers"] = self.hparams.get("depth")
+        if "embed_dim" in self.hparams:  # qwen2vl
+            self.hparams["intermediate_size"] = self.hparams.get("hidden_size")
+            self.hparams["hidden_size"] = self.hparams.get("embed_dim")
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if self.global_config['model_type'] == 'qwen2_vl':
+            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN2VL)
+        elif self.global_config['model_type'] == 'qwen2_5_vl':
+            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN25VL)
+            self.gguf_writer.add_vision_use_silu(True)
+            # find n_wa_pattern (window attention pattern)
+            fullatt_block_indexes = hparams.get("fullatt_block_indexes")
+            assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl"
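+            # full-attention blocks recur at a fixed stride, so e.g. indexes [7, 15, 23, 31] yield a pattern of 8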
+            n_wa_pattern = fullatt_block_indexes[0] + 1
+            # validate n_wa_pattern
+            for i in range(1, len(fullatt_block_indexes)):
+                if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
+                    raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
+            self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
+        else:
+            raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}")
+        # default values below are taken from HF transformers code
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6))
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, name, n_dims  # unused
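+        # force fixed precision for the embedding tensors, presumably because they are precision-sensitive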
+        if ".patch_embd." in new_name:
+            return gguf.GGMLQuantizationType.F16
+        if ".position_embd." in new_name:
+            return gguf.GGMLQuantizationType.F32
+        return False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+        if name.startswith("visual."):
+            # process visual tensors
+            # split QKV tensors if needed
+            if ".qkv." in name:
+                if data_torch.ndim == 2:  # weight
+                    c3, _ = data_torch.shape
+                else:  # bias
+                    c3 = data_torch.shape[0]
+                assert c3 % 3 == 0
+                c = c3 // 3
+                wq = data_torch[:c]
+                wk = data_torch[c: c * 2]
+                wv = data_torch[c * 2:]
+                return [
+                    (self.map_tensor_name(name.replace("qkv", "q")), wq),
+                    (self.map_tensor_name(name.replace("qkv", "k")), wk),
+                    (self.map_tensor_name(name.replace("qkv", "v")), wv),
+                ]
+            elif 'patch_embed.proj.weight' in name:
+                # split Conv3D into Conv2Ds
+                c1, c2, kt, kh, kw = data_torch.shape
+                del c1, c2, kh, kw  # unused
+                assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
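+                # each temporal slice of the 3D kernel becomes its own 2D conv weight; the second slice gets the ".weight.1" suffix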
+                return [
+                    (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...]),
+                    (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]),
+                ]
+            else:
+                return [(self.map_tensor_name(name), data_torch)]
+        return []  # skip other tensors
+
+
 @ModelBase.register("WavTokenizerDec")
 class WavTokenizerDecModel(TextModel):
     model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC