Commit c04a42d

Merge branch 'ggerganov:master' into fix-ggml-org#2023
2 parents: ac793a2 + 7d5f184

File tree

4 files changed: +268 -95 lines changed


examples/make-ggml.py

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
"""
This script converts Hugging Face llama models to GGML and quantizes them.

Usage:
python make-ggml.py --model {model_dir_or_hf_repo_name} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]

Arguments:
- --model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.

Quant types:
- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
- Q2_K: smallest, extreme quality loss - not recommended
- Q3_K: alias for Q3_K_M
- Q3_K_S: very small, very high quality loss
- Q3_K_M: very small, very high quality loss
- Q3_K_L: small, substantial quality loss
- Q4_K: alias for Q4_K_M
- Q4_K_S: small, significant quality loss
- Q4_K_M: medium, balanced quality - recommended
- Q5_K: alias for Q5_K_M
- Q5_K_S: large, low quality loss - recommended
- Q5_K_M: large, very low quality loss - recommended
- Q6_K: very large, extremely low quality loss
- Q8_0: very large, extremely low quality loss - not recommended
- F16: extremely large, virtually no quality loss - not recommended
- F32: absolutely huge, lossless - not recommended
"""
import subprocess
subprocess.run(f"pip install huggingface-hub==0.16.4", shell=True, check=True)

import argparse
import os
from huggingface_hub import snapshot_download

def main(model, outname, outdir, quants, keep_fp16):
    ggml_version = "v3"

    if not os.path.isdir(model):
        print(f"Model not found at {model}. Downloading...")
        try:
            if outname is None:
                outname = model.split('/')[-1]
            model = snapshot_download(repo_id=model, cache_dir='../models/hf_cache')
        except Exception as e:
            raise Exception(f"Could not download the model: {e}")

    if outdir is None:
        outdir = f'../models/{outname}'

    if not os.path.isfile(f"{model}/config.json"):
        raise Exception(f"Could not find config.json in {model}")

    os.makedirs(outdir, exist_ok=True)

    print("Building llama.cpp")
    subprocess.run(f"cd .. && make quantize", shell=True, check=True)

    fp16 = f"{outdir}/{outname}.ggml{ggml_version}.fp16.bin"

    print(f"Making unquantised GGML at {fp16}")
    if not os.path.isfile(fp16):
        subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
    else:
        print(f"Unquantised GGML already exists at: {fp16}")

    print("Making quants")
    for type in quants:
        outfile = f"{outdir}/{outname}.ggml{ggml_version}.{type}.bin"
        print(f"Making {type} : {outfile}")
        subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)

    if not keep_fp16:
        os.remove(fp16)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Convert/Quantize HF to GGML. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
    parser.add_argument('--model', required=True, help='Downloaded model dir or Hugging Face model repo name')
    parser.add_argument('--outname', default=None, help='Output model(s) name')
    parser.add_argument('--outdir', default=None, help='Output directory')
    parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
    parser.add_argument('--keep_fp16', action='store_true', help='Keep fp16 model', default=False)

    args = parser.parse_args()

    main(args.model, args.outname, args.outdir, args.quants, args.keep_fp16)
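
A concrete invocation, for illustration only (the repo name below is just an example; any Hugging Face llama repo or local model directory works), run from the examples/ folder as the script's own description requires:

python3 make-ggml.py --model meta-llama/Llama-2-7b-hf --outname llama-2-7b --quants Q4_K_M Q5_K_S --keep_fp16

This downloads the model if it is not present locally, builds the quantize tool, writes the FP16 GGML file, produces the requested quantizations, and keeps the intermediate FP16 file because of --keep_fp16.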

ggml-cuda.cu

Lines changed: 49 additions & 11 deletions
@@ -2423,20 +2423,53 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-
+#ifdef DEBUG_CUDA_MALLOC
+    int nnz = 0;
+    size_t max_size = 0, tot_size = 0;
+#endif
+    size_t best_diff = 1ull << 36;
+    int ibest = -1;
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
         cuda_buffer& b = g_cuda_buffer_pool[id][i];
-        if (b.size >= size && b.ptr != nullptr) {
-            void * ptr = b.ptr;
-            *actual_size = b.size;
-            b.ptr = nullptr;
-            b.size = 0;
-            return ptr;
+        if (b.ptr != nullptr) {
+#ifdef DEBUG_CUDA_MALLOC
+            ++nnz;
+            tot_size += b.size;
+            if (b.size > max_size) max_size = b.size;
+#endif
+            if (b.size >= size) {
+                size_t diff = b.size - size;
+                if (diff < best_diff) {
+                    best_diff = diff;
+                    ibest = i;
+                    if (!best_diff) {
+                        void * ptr = b.ptr;
+                        *actual_size = b.size;
+                        b.ptr = nullptr;
+                        b.size = 0;
+                        return ptr;
+                    }
+                }
+            }
         }
     }
+    if (ibest >= 0) {
+        cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
+        void * ptr = b.ptr;
+        *actual_size = b.size;
+        b.ptr = nullptr;
+        b.size = 0;
+        return ptr;
+    }
+#ifdef DEBUG_CUDA_MALLOC
+    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+            (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
+#endif
     void * ptr;
-    CUDA_CHECK(cudaMalloc((void **) &ptr, size));
-    *actual_size = size;
+    size_t look_ahead_size = (size_t) (1.05 * size);
+    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+    CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
+    *actual_size = look_ahead_size;
     return ptr;
 }
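
The hunk above changes the buffer-pool lookup from first-fit to best-fit (returning immediately on an exact size match) and, when no pooled buffer fits, over-allocates the new buffer by about 5%, rounded up to a multiple of 256 bytes, so slightly larger future requests can still reuse it. A minimal sketch of that selection and sizing logic, written in Python purely for illustration (the names and the pool representation are hypothetical, and alloc stands in for cudaMalloc):

# Illustrative sketch only, not the actual CUDA code.
def pool_malloc(pool, size, alloc):
    # pool: fixed-size list of {"ptr": ..., "size": ...} entries or None
    best_diff, ibest = 1 << 36, -1
    for i, b in enumerate(pool):
        if b is None:
            continue
        if b["size"] >= size:
            diff = b["size"] - size
            if diff < best_diff:
                best_diff, ibest = diff, i
                if best_diff == 0:          # exact fit: reuse it right away
                    pool[i] = None
                    return b["ptr"], b["size"]
    if ibest >= 0:                          # best fit = smallest wasted space
        b = pool[ibest]
        pool[ibest] = None
        return b["ptr"], b["size"]
    # nothing fits: allocate ~5% extra, rounded up to a 256-byte multiple
    look_ahead = 256 * ((int(1.05 * size) + 255) // 256)
    return alloc(look_ahead), look_ahead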

@@ -2955,8 +2988,13 @@ inline void ggml_cuda_op_rope(
     const int mode = ((int32_t *) src1->data)[2];
     const int n_ctx = ((int32_t *) src1->data)[3];
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
-    const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
+    // RoPE alteration for extended context
+    float freq_base, freq_scale;
+    memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
 
     bool is_glm = mode & 4;
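
The RoPE hunk replaces the hard-coded base of 10000 and the unscaled position with freq_base and freq_scale read from the op's parameters, so the angle for the i-th dimension pair becomes (position * freq_scale) * freq_base^(-2i/n_dims). A small Python sketch of how those two parameters enter the angles, illustrative only (the function name is mine):

def rope_angles(pos, n_dims, freq_base=10000.0, freq_scale=1.0):
    # theta_scale mirrors powf(freq_base, -2.0f/n_dims) in the kernel
    theta_scale = freq_base ** (-2.0 / n_dims)
    theta = pos * freq_scale                 # scaled position, like `p` above
    angles = []
    for _ in range(n_dims // 2):             # one angle per (x0, x1) pair
        angles.append(theta)
        theta *= theta_scale
    return angles

# e.g. freq_scale = 0.5 compresses positions so a model can attend over
# roughly twice its trained context (linear RoPE scaling).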

ggml-metal.m

Lines changed: 9 additions & 6 deletions
@@ -685,8 +685,8 @@ void ggml_metal_graph_compute(
                     GGML_ASSERT(ne02 == 1);
                     GGML_ASSERT(ne12 == 1);
 
-                    nth0 = 4;
-                    nth1 = 16;
+                    nth0 = 2;
+                    nth1 = 32;
                     [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
                 } break;
             case GGML_TYPE_Q4_K:
@@ -743,15 +743,18 @@ void ggml_metal_graph_compute(
                 src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
             }
+            else if (src0t == GGML_TYPE_Q3_K) {
+#ifdef GGML_QKK_64
+                [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+#else
+                [encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+#endif
+            }
             else if (src0t == GGML_TYPE_Q5_K) {
                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
             }
             else if (src0t == GGML_TYPE_Q6_K) {
                 [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-            }
-            else if (src0t == GGML_TYPE_Q3_K) {
-                [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
-                [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
             } else {
                 [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
                 [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
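
For Q3_K the threadgroup shape changes from 4x16 to 2x32 threads, and the new dispatch covers four matrix rows per threadgroup (two when GGML_QKK_64 is defined) instead of the previous one-threadgroup-per-row grid that also reserved threadgroup memory. A tiny illustrative helper, in Python with hypothetical names, showing the resulting grid:

# Illustrative only: grid launched by the new Q3_K mul-mat dispatch above.
def q3_k_grid(ne01, ne11, qkk_64=False):
    rows_per_tg = 2 if qkk_64 else 4                 # (ne01+1)/2 vs (ne01+3)/4
    threadgroups = ((ne01 + rows_per_tg - 1) // rows_per_tg, ne11, 1)
    threads_per_group = (2, 32, 1)                   # nth0 = 2, nth1 = 32
    return threadgroups, threads_per_group

# q3_k_grid(4096, 1) -> ((1024, 1, 1), (2, 32, 1))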
