
Commit 04e48c0 (parent: 69a2045)
Author: junhuihe

Enable conversion from .safetensors checkpoints to gguf files

File tree

5 files changed: +232 additions, −0 deletions

README.md

Lines changed: 11 additions & 0 deletions

Added after the e2e benchmark example (around README line 295), just before the "FAQ (Frequently Asked Questions)📌" section:

### Convert from `.safetensors` Checkpoints

```sh
# Prepare the .safetensors model file
huggingface-cli download microsoft/bitnet-b1.58-2B-4T-bf16 --local-dir ./models/bitnet-b1.58-2B-4T-bf16

# Convert to gguf model
python ./utils/convert-helper-bitnet.py ./models/bitnet-b1.58-2B-4T-bf16
```
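For a quick sanity check of the converted model, the GGUF header can be read back with the `gguf` Python package — a minimal sketch, assuming the package is installed (`pip install gguf`) and the output path the helper produces:

```python
# Sketch: read back the converted GGUF's metadata and first few tensors.
# Assumes the `gguf` package is installed and the helper's default output path.
from gguf import GGUFReader

reader = GGUFReader("./models/bitnet-b1.58-2B-4T-bf16/ggml-model-i2s-bitnet.gguf")
print(f"{len(reader.fields)} metadata fields, {len(reader.tensors)} tensors")
for tensor in reader.tensors[:5]:
    print(tensor.name, tensor.shape, tensor.tensor_type.name)
```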

utils/convert-helper-bitnet.py

Lines changed: 134 additions & 0 deletions

```python
#!/usr/bin/env python3

import sys
import shutil
import subprocess
from pathlib import Path


def run_command(command_list, cwd=None, check=True):
    print(f"Executing: {' '.join(map(str, command_list))}")
    try:
        process = subprocess.run(command_list, cwd=cwd, check=check, capture_output=False, text=True)
        return process
    except subprocess.CalledProcessError as e:
        print(f"Error executing command: {' '.join(map(str, e.cmd))}")
        print(f"Return code: {e.returncode}")
        raise


def main():
    if len(sys.argv) < 2:
        script_name = Path(sys.argv[0]).name
        print(f"Usage: python {script_name} <model-directory>")
        sys.exit(1)

    model_dir_arg = sys.argv[1]
    model_dir = Path(model_dir_arg).resolve()

    if not model_dir.is_dir():
        print(f"Error: Model directory '{model_dir}' not found or is not a directory.")
        sys.exit(1)

    utils_dir = Path(__file__).parent.resolve()
    project_root_dir = utils_dir.parent

    preprocess_script = utils_dir / "preprocess-huggingface-bitnet.py"
    convert_script = utils_dir / "convert-ms-to-gguf-bitnet.py"

    llama_quantize_binary = project_root_dir / "build" / "bin" / "llama-quantize"

    input_file = model_dir / "model.safetensors"
    input_backup_file = model_dir / "model.safetensors.backup"
    preprocessed_output_file = model_dir / "model.safetensors"

    gguf_f32_output = model_dir / "ggml-model-f32-bitnet.gguf"
    gguf_i2s_output = model_dir / "ggml-model-i2s-bitnet.gguf"

    if not preprocess_script.is_file():
        print(f"Error: Preprocess script not found at '{preprocess_script}'")
        sys.exit(1)
    if not convert_script.is_file():
        print(f"Error: Convert script not found at '{convert_script}'")
        sys.exit(1)
    if not llama_quantize_binary.is_file():
        print(f"Error: llama-quantize binary not found at '{llama_quantize_binary}'")
        sys.exit(1)

    if not input_file.is_file():
        print(f"Error: Input safetensors file not found at '{input_file}'")
        sys.exit(1)

    try:
        print(f"Backing up '{input_file}' to '{input_backup_file}'")
        if input_backup_file.exists():
            print(f"Warning: Removing existing backup file '{input_backup_file}'")
            input_backup_file.unlink()
        shutil.move(input_file, input_backup_file)

        print("Preprocessing huggingface checkpoint...")
        cmd_preprocess = [
            sys.executable,
            str(preprocess_script),
            "--input", str(input_backup_file),
            "--output", str(preprocessed_output_file)
        ]
        run_command(cmd_preprocess)

        print("Converting to GGUF (f32)...")
        cmd_convert = [
            sys.executable,
            str(convert_script),
            str(model_dir),
            "--vocab-type", "bpe",
            "--outtype", "f32",
            "--concurrency", "1",
            "--outfile", str(gguf_f32_output)
        ]
        run_command(cmd_convert)

        print("Quantizing model to I2_S...")
        cmd_quantize = [
            str(llama_quantize_binary),
            str(gguf_f32_output),
            str(gguf_i2s_output),
            "I2_S",
            "1"
        ]
        run_command(cmd_quantize)

        print("Converted successfully.")

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        print("Cleaning up intermediate files...")
        if preprocessed_output_file.exists() and preprocessed_output_file != input_backup_file:
            print(f"Removing preprocessed file: {preprocessed_output_file}")
            try:
                preprocessed_output_file.unlink()
            except OSError as e:
                print(f"Warning: Could not remove {preprocessed_output_file}: {e}")

        if gguf_f32_output.exists():
            print(f"Removing f32 GGUF: {gguf_f32_output}")
            try:
                gguf_f32_output.unlink()
            except OSError as e:
                print(f"Warning: Could not remove {gguf_f32_output}: {e}")

        if input_backup_file.exists():
            if not input_file.exists():
                print(f"Restoring original '{input_file}' from '{input_backup_file}'")
                try:
                    shutil.move(input_backup_file, input_file)
                except Exception as e:
                    print(f"Warning: Could not restore {input_file} from backup: {e}")
            else:
                print(f"Removing backup '{input_backup_file}' as original '{input_file}' should be present.")
                try:
                    input_backup_file.unlink()
                except OSError as e:
                    print(f"Warning: Could not remove backup {input_backup_file}: {e}")


if __name__ == "__main__":
    main()
```
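The helper's control flow is a backup/work/restore pattern: the original `model.safetensors` is moved aside, the preprocessed copy takes its place for conversion, and the `finally` block removes intermediates and restores the original whether or not the pipeline succeeded. A distilled sketch of that pattern, with illustrative names not taken from the script:

```python
# Distilled sketch of the backup/restore pattern used above (names illustrative).
import shutil
from pathlib import Path

def with_backup(path: Path, work) -> None:
    backup = path.with_name(path.name + ".backup")
    shutil.move(path, backup)           # keep the pristine original aside
    try:
        work(backup, path)              # reads the backup, writes a derived file at `path`
    finally:
        if backup.exists():
            if path.exists():
                path.unlink()           # drop the derived/intermediate file
            shutil.move(backup, path)   # always put the original back
```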

utils/convert-ms-to-gguf-bitnet.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -1417,6 +1417,9 @@ def write_all(

     of = OutputFile(fname_out, endianess=endianess)

+    if 'bitnet' in of.gguf.arch:
+        svocab.chat_template = "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nBITNETAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}"
+
     # meta data
     of.add_meta_arch(params)
     if isinstance(vocab, Vocab):
```
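The added chat template is ordinary Jinja, so its effect can be previewed outside the converter. A minimal sketch using `jinja2` directly; the `bos_token`/`eos_token` strings here are placeholders, since the converter supplies the real ones from the vocab:

```python
# Sketch: render the chat template added above with jinja2 (placeholder tokens).
from jinja2 import Template

chat_template = (
    "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}"
    "{% if message['role'] == 'user' %}"
    "{{ 'Human: ' + message['content'] + '\\n\\nBITNETAssistant: ' + eos_token }}"
    "{% elif message['role'] == 'assistant' %}"
    "{{ message['content'] + eos_token }}{% endif %}{% endfor %}"
)
prompt = Template(chat_template).render(
    messages=[
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hi there"},
    ],
    bos_token="<s>",
    eos_token="</s>",
)
print(prompt)
# <s>Human: Hello
#
# BITNETAssistant: </s>Hi there</s>
```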

utils/convert.sh

Lines changed: 34 additions & 0 deletions

```bash
#!/bin/bash

if [ -z "$1" ]; then
    echo "Usage: $0 <model-directory>"
    exit 1
fi

MODEL_DIR=$(realpath "$1")

PREPROCESS_SCRIPT="./utils/preprocess-safetensors.py"
CONVERT_SCRIPT="./utils/convert-ms-to-gguf-bitnet.py"

INPUT_FILE="$MODEL_DIR/model.safetensors"
OUTPUT_FILE="$MODEL_DIR/model.safetensors"

echo "Preprocessing safetensors..."
mv "$INPUT_FILE" "${INPUT_FILE}.backup"
python "$PREPROCESS_SCRIPT" --input "${INPUT_FILE}.backup" --output "$OUTPUT_FILE"

GGUF_F32_OUTPUT="$MODEL_DIR/ggml-model-f32-bitnet.gguf"

echo "Converting to GGUF (f32)..."
python "$CONVERT_SCRIPT" "$MODEL_DIR" --vocab-type bpe --outtype f32 --concurrency 1 --outfile "$GGUF_F32_OUTPUT"

GGUF_I2S_OUTPUT="$MODEL_DIR/ggml-model-i2s-bitnet.gguf"

echo "Quantizing model to I2_S..."
./build/bin/llama-quantize "$GGUF_F32_OUTPUT" "$GGUF_I2S_OUTPUT" I2_S 1

echo "Cleaning up intermediate files..."
rm "$OUTPUT_FILE" "$GGUF_F32_OUTPUT"
mv "${INPUT_FILE}.backup" "$INPUT_FILE"

echo "Converted successfully."
```
utils/preprocess-huggingface-bitnet.py

Lines changed: 50 additions & 0 deletions

```python
from safetensors import safe_open
from safetensors.torch import save_file
import torch


def quant_weight_fp16(weight):
    weight = weight.to(torch.float)
    s = 1.0 / weight.abs().mean().clamp_(min=1e-5)
    new_weight = (weight * s).round().clamp(-1, 1) / s
    return new_weight


def quant_model(input, output):
    tensors = {}

    with safe_open(input, framework='pt') as f:
        for name in f.keys():
            tensors[name] = f.get_tensor(name)

    keyword_list = [
        'q_proj.weight',
        'k_proj.weight',
        'v_proj.weight',
        'o_proj.weight',
        'gate_proj.weight',
        'up_proj.weight',
        'down_proj.weight'
    ]

    for name in tensors:
        if any(keyword in name for keyword in keyword_list):
            print(f'[INFO] Quantizing {name}')
            tensors[name] = quant_weight_fp16(tensors[name])

    print(f'[INFO] Saving to {output}\nThis may take a while.')
    save_file(tensors, output)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Quantize the BitNet projection weights in a .safetensors checkpoint")
    parser.add_argument("--input", type=str, required=True)
    parser.add_argument("--output", type=str, required=True)
    args = parser.parse_args()

    quant_model(
        input=args.input,
        output=args.output,
    )
```
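In `quant_weight_fp16`, the scale `s` is the reciprocal of the mean absolute weight; scaling, rounding, and clamping maps every weight onto `{-1, 0, +1} / s`, i.e. fake-quantized ternary values at the original dtype. A small worked example with arbitrary numbers:

```python
# Worked example of the absmean ternary quantization above (arbitrary values).
import torch

w = torch.tensor([0.4, -1.2, 0.05, 2.0])
s = 1.0 / w.abs().mean().clamp_(min=1e-5)  # mean |w| = 0.9125, so s ≈ 1.0959
codes = (w * s).round().clamp(-1, 1)       # tensor([ 0., -1.,  0.,  1.])
print(codes / s)                           # tensor([ 0.0000, -0.9125,  0.0000,  0.9125])
```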
