@@ -126,7 +126,7 @@ def get_weights(fn):
def quantize_q8_0(tensor: torch.Tensor) -> torch.CharTensor:
    # equivalent to ggml_quantize_q8_0 in ggml.c
    assert tensor.shape[1] % GGML_QK8_0 == 0
-    tensor = tensor.view(-1, GGML_QK8_0)
+    tensor = tensor.reshape(-1, GGML_QK8_0)
    scale = tensor.abs().max(dim=-1, keepdim=True).values / ((1 << 7) - 1)
    tensor = (tensor / scale).round().clamp(min=-128, max=127).char()
    # add scale into each block
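The switch from .view to .reshape presumably guards against non-contiguous tensors reaching the quantizers, since .view refuses to operate on those while .reshape falls back to a copy. A minimal standalone sketch of the difference, with made-up shapes:

import torch

x = torch.randn(4, 32).t()   # transposing makes the tensor non-contiguous
# x.view(-1, 32)             # would raise RuntimeError on a non-contiguous tensor
y = x.reshape(-1, 32)        # reshape copies when needed, so it always succeeds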
@@ -152,7 +152,7 @@ def quantize_q4_0(tensor: torch.Tensor) -> torch.CharTensor:
def quantize_q4_1(tensor: torch.Tensor) -> torch.CharTensor:
    # equivalent to ggml_quantize_q4_1 in ggml.c
    assert tensor.shape[1] % GGML_QK4_1 == 0
-    tensor = tensor.view(-1, GGML_QK4_1)
+    tensor = tensor.reshape(-1, GGML_QK4_1)
    abs_max_indices = tensor.max(dim=-1, keepdim=True).indices
    max_values = torch.take_along_dim(tensor, abs_max_indices, dim=-1)
    abs_min_indices = tensor.min(dim=-1, keepdim=True).indices
@@ -185,15 +185,13 @@ def maybe_quantize_tensor(tensor, ggml_type):
        raise NotImplementedError(f"Cannot quantize tensor of dtype {tensor.dtype} ({ggml_type})")


-def get_dtype_and_ggml_type(tensor, ggml_type):
-    if tensor.ndim in (2, 3):
+def get_dtype_and_ggml_type(name, tensor, ggml_type):
+    if tensor.ndim in (2, 3) and "ffn_gate_inp" not in name:
        if tensor.shape[1] % GGML_QK8_0 == 0:
            return np.int8, ggml_type
        else:
            return np.float16, gguf.GGMLQuantizationType.F16
    else:
-        # 1d weight: convert it to float32
-        assert tensor.ndim == 1, tensor
        return np.float32, gguf.GGMLQuantizationType.F32
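Passing the tensor name lets the 2-D MoE router weights ("ffn_gate_inp") skip quantization. A rough usage sketch of the new dispatch, assuming the converter's own scope (the tensor names and shapes below are invented for illustration):

# router matrix: 2-D, but the substring check on the name forces it to stay F32
get_dtype_and_ggml_type("blk.0.ffn_gate_inp", torch.zeros(8, 6144), ggml_type)
# -> (np.float32, gguf.GGMLQuantizationType.F32)

# ordinary 2-D weight with a block-aligned inner dimension: still quantized
get_dtype_and_ggml_type("blk.0.ffn_up.0", torch.zeros(32768, 6144), ggml_type)
# -> (np.int8, requested ggml_type)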
@@ -205,7 +203,7 @@ def dump_state_dict(f, ggml_type, input_dir, config):
    for idx, name in enumerate(weight_names):
        weight, scales = get_weights(f"{input_dir}/tensor{idx:05}_000")
        meta_tensor = convert_weight(name, weight, scales, config, device="meta")
-        dtype, tensor_ggml_type = get_dtype_and_ggml_type(meta_tensor, ggml_type)
+        dtype, tensor_ggml_type = get_dtype_and_ggml_type(name, meta_tensor, ggml_type)
        quantized_meta_tensor = maybe_quantize_tensor(meta_tensor, tensor_ggml_type)
        f.add_tensor_info(
            f"{name}.weight",
@@ -227,7 +225,7 @@ def dump_state_dict(f, ggml_type, input_dir, config):
    for name in weight_names:
        weight, scales = weights.pop(name)
        tensor = convert_weight(name, weight, scales, config)
-        _, tensor_ggml_type = get_dtype_and_ggml_type(tensor, ggml_type)
+        _, tensor_ggml_type = get_dtype_and_ggml_type(name, tensor, ggml_type)
        array = maybe_quantize_tensor(tensor, tensor_ggml_type).numpy()

    logging.info(
@@ -317,7 +315,10 @@ def get_weight_names(num_hidden_layers=64):
        gguf.MODEL_TENSOR.FFN_GATE_INP,
    )

-    for bid in range(num_hidden_layers):
+    layers = [str(bid) for bid in range(64)]
+    layers.sort()  # Lexicographic sort: 0 < 1 < 10 < 11 ... < 2 < 20 < ...
+
+    for bid in layers[:num_hidden_layers]:
        for key in layer:
            weight_names.append(gguf.TENSOR_NAMES[key].format(bid=bid))
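A quick check of what this lexicographic ordering produces (plain Python, independent of the converter). Note that the [:num_hidden_layers] slice then takes a lexicographic prefix, so a reduced layer count selects ids 0, 1, 10, 11, ... rather than 0..7:

layers = sorted(str(bid) for bid in range(64))
print(layers[:8])    # ['0', '1', '10', '11', '12', '13', '14', '15']
print(layers[-4:])   # ['63', '7', '8', '9']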
@@ -333,7 +334,6 @@ def ffn_size(emb_size, widening_factor):
    return _ffn_size

config = {
-    "vocab_size": 128 * 1024,
    "hidden_act": "gelu",
    "pad_token_id": 0,
    "eos_token_id": 2,
@@ -366,8 +366,7 @@ def ffn_size(emb_size, widening_factor):

f = gguf.GGUFWriter(args.save_path, "grok", endianess=gguf.GGUFEndian.LITTLE)

-f.add_name("grok")
-f.add_vocab_size(config.vocab_size)
+f.add_name("grok-1")
f.add_context_length(config.max_position_embeddings)
f.add_embedding_length(config.hidden_size)
f.add_block_count(config.num_hidden_layers)
@@ -389,6 +388,8 @@ def ffn_size(emb_size, widening_factor):
f.add_token_scores(scores)
f.add_token_types(toktypes)

+f.add_quantization_version(ggml_type)
+
dump_state_dict(f, ggml_type, args.input_dir, config)
f.close()