Enable conversion from .safetensors checkpoints to gguf files #280

Merged · 1 commit · Jun 3, 2025

11 changes: 11 additions & 0 deletions README.md
@@ -292,6 +292,17 @@ python utils/generate-dummy-bitnet-model.py models/bitnet_b1_58-large --outfile
# Run benchmark with the generated model; use -m to specify the model path, -p the number of prompt tokens to process, -n the number of tokens to generate
python utils/e2e_benchmark.py -m models/dummy-bitnet-125m.tl1.gguf -p 512 -n 128
```

### Convert from `.safetensors` Checkpoints

```sh
# Prepare the .safetensors model file
huggingface-cli download microsoft/bitnet-b1.58-2B-4T-bf16 --local-dir ./models/bitnet-b1.58-2B-4T-bf16

# Convert to a GGUF model
python ./utils/convert-helper-bitnet.py ./models/bitnet-b1.58-2B-4T-bf16
```
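
The helper writes `ggml-model-i2s-bitnet.gguf` next to the checkpoint. Assuming the repository's `run_inference.py` entry point, the converted model can then be run directly:

```sh
# Run inference with the converted I2_S model (path follows the example above)
python run_inference.py -m ./models/bitnet-b1.58-2B-4T-bf16/ggml-model-i2s-bitnet.gguf -p "You are a helpful assistant" -cnv
```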

### FAQ (Frequently Asked Questions)📌

#### Q1: The build dies with errors building llama.cpp due to issues with std::chrono in log.cpp?
134 changes: 134 additions & 0 deletions utils/convert-helper-bitnet.py
@@ -0,0 +1,134 @@
#!/usr/bin/env python3

import sys
import os
import shutil
import subprocess
from pathlib import Path

def run_command(command_list, cwd=None, check=True):
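    # Print the command being executed and run it; with check=True a non-zero exit
    # raises CalledProcessError, which is logged and re-raised below.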
print(f"Executing: {' '.join(map(str, command_list))}")
try:
process = subprocess.run(command_list, cwd=cwd, check=check, capture_output=False, text=True)
return process
except subprocess.CalledProcessError as e:
print(f"Error executing command: {' '.join(map(str, e.cmd))}")
print(f"Return code: {e.returncode}")
raise

def main():
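    # Overall flow: validate paths, back up model.safetensors, absmean-quantize the
    # checkpoint in place, convert it to an f32 GGUF, quantize that to I2_S, then
    # clean up intermediates and restore the original checkpoint from the backup.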
if len(sys.argv) < 2:
script_name = Path(sys.argv[0]).name
print(f"Usage: python {script_name} <model-directory>")
sys.exit(1)

model_dir_arg = sys.argv[1]
model_dir = Path(model_dir_arg).resolve()

if not model_dir.is_dir():
print(f"Error: Model directory '{model_dir}' not found or is not a directory.")
sys.exit(1)

utils_dir = Path(__file__).parent.resolve()
project_root_dir = utils_dir.parent

preprocess_script = utils_dir / "preprocess-huggingface-bitnet.py"
convert_script = utils_dir / "convert-ms-to-gguf-bitnet.py"

llama_quantize_binary = project_root_dir / "build" / "bin" / "llama-quantize"

input_file = model_dir / "model.safetensors"
input_backup_file = model_dir / "model.safetensors.backup"
preprocessed_output_file = model_dir / "model.safetensors"

gguf_f32_output = model_dir / "ggml-model-f32-bitnet.gguf"
gguf_i2s_output = model_dir / "ggml-model-i2s-bitnet.gguf"

if not preprocess_script.is_file():
print(f"Error: Preprocess script not found at '{preprocess_script}'")
sys.exit(1)
if not convert_script.is_file():
print(f"Error: Convert script not found at '{convert_script}'")
sys.exit(1)
if not llama_quantize_binary.is_file():
print(f"Error: llama-quantize binary not found at '{llama_quantize_binary}'")
sys.exit(1)

if not input_file.is_file():
print(f"Error: Input safetensors file not found at '{input_file}'")
sys.exit(1)

try:
print(f"Backing up '{input_file}' to '{input_backup_file}'")
if input_backup_file.exists():
print(f"Warning: Removing existing backup file '{input_backup_file}'")
input_backup_file.unlink()
shutil.move(input_file, input_backup_file)

print("Preprocessing huggingface checkpoint...")
cmd_preprocess = [
sys.executable,
str(preprocess_script),
"--input", str(input_backup_file),
"--output", str(preprocessed_output_file)
]
run_command(cmd_preprocess)

print("Converting to GGUF (f32)...")
cmd_convert = [
sys.executable,
str(convert_script),
str(model_dir),
"--vocab-type", "bpe",
"--outtype", "f32",
"--concurrency", "1",
"--outfile", str(gguf_f32_output)
]
run_command(cmd_convert)

print("Quantizing model to I2_S...")
cmd_quantize = [
str(llama_quantize_binary),
str(gguf_f32_output),
str(gguf_i2s_output),
"I2_S",
"1"
]
run_command(cmd_quantize)

        print("Conversion completed successfully.")

    except Exception as e:
        print(f"An error occurred: {e}")
        sys.exit(1)
finally:
print("Cleaning up intermediate files...")
if preprocessed_output_file.exists() and preprocessed_output_file != input_backup_file:
print(f"Removing preprocessed file: {preprocessed_output_file}")
try:
preprocessed_output_file.unlink()
except OSError as e:
print(f"Warning: Could not remove {preprocessed_output_file}: {e}")

if gguf_f32_output.exists():
print(f"Removing f32 GGUF: {gguf_f32_output}")
try:
gguf_f32_output.unlink()
except OSError as e:
print(f"Warning: Could not remove {gguf_f32_output}: {e}")

if input_backup_file.exists():
if not input_file.exists():
print(f"Restoring original '{input_file}' from '{input_backup_file}'")
try:
shutil.move(input_backup_file, input_file)
except Exception as e:
print(f"Warning: Could not restore {input_file} from backup: {e}")
else:
print(f"Removing backup '{input_backup_file}' as original '{input_file}' should be present.")
try:
input_backup_file.unlink()
except OSError as e:
print(f"Warning: Could not remove backup {input_backup_file}: {e}")

if __name__ == "__main__":
main()
3 changes: 3 additions & 0 deletions utils/convert-ms-to-gguf-bitnet.py
@@ -1417,6 +1417,9 @@ def write_all(

of = OutputFile(fname_out, endianess=endianess)

if 'bitnet' in of.gguf.arch:
svocab.chat_template = "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nBITNETAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}"

# meta data
of.add_meta_arch(params)
if isinstance(vocab, Vocab):
50 changes: 50 additions & 0 deletions utils/preprocess-huggingface-bitnet.py
@@ -0,0 +1,50 @@
from safetensors import safe_open
from safetensors.torch import save_file
import torch

def quant_weight_fp16(weight):
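    # Absmean rounding (BitNet b1.58): scale by 1 / mean(|W|) (clamped away from zero),
    # round to {-1, 0, +1}, then rescale so the tensor keeps its original magnitude.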
weight = weight.to(torch.float)
s = 1.0 / weight.abs().mean().clamp_(min=1e-5)
new_weight = (weight * s).round().clamp(-1, 1) / s
return new_weight

def quant_model(input, output):
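    # Load every tensor from the input .safetensors file, apply absmean rounding to the
    # attention and MLP projection weights, and save the result to the output path.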
tensors = {}

with safe_open(input, framework='pt') as f:
for name in f.keys():
tensors[name] = f.get_tensor(name)

keyword_list = [
'q_proj.weight',
'k_proj.weight',
'v_proj.weight',
'o_proj.weight',
'gate_proj.weight',
'up_proj.weight',
'down_proj.weight'
]

if any(keyword in name for keyword in keyword_list):
print(f'[INFO] Quantizing {name}')
tensors[name] = quant_weight_fp16(tensors[name])

print(f'[INFO] Saving to {output}\nThis may take a while.')
save_file(tensors, output)


if __name__ == "__main__":
import argparse
    parser = argparse.ArgumentParser(description="Apply BitNet b1.58 absmean (ternary) rounding to the projection weights of a .safetensors checkpoint")
parser.add_argument(
"--input", type=str, required=True,
)
parser.add_argument(
"--output", type=str, required=True,
)
args = parser.parse_args()

quant_model(
input=args.input,
output=args.output,
)
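
For reference, the absmean rounding performed by `quant_weight_fp16` can be sanity-checked on a toy tensor; the values below are purely illustrative:

```python
import torch

# Illustrative weights; any small float tensor works.
w = torch.tensor([[0.42, -0.07, -0.90],
                  [0.01,  0.55, -0.33]])

s = 1.0 / w.abs().mean().clamp_(min=1e-5)   # one scale per tensor
w_q = (w * s).round().clamp(-1, 1) / s      # every entry becomes {-1, 0, +1} / s

print(torch.unique((w_q * s).round()))      # tensor([-1., 0., 1.])
```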