
Commit ba11eb9

Merge branch 'master' of github.com:ggerganov/llama.cpp into grammar-example
* 'master' of github.com:ggerganov/llama.cpp:
  convert : remove bug in convert.py permute function (ggml-org#3364)
  make-ggml.py : compatibility with more models and GGUF (ggml-org#3290)
  gguf : fix a few general keys (ggml-org#3341)
  metal : reusing llama.cpp logging (ggml-org#3152)
  build : add ACCELERATE_NEW_LAPACK to fix warning on macOS Sonoma (ggml-org#3342)
  readme : add some recent perplexity and bpw measurements to READMES, link for k-quants (ggml-org#3340)
  cmake : fix build-info.h on MSVC (ggml-org#3309)
  docs: Fix typo CLBlast_DIR var. (ggml-org#3330)
  nix : add cuda, use a symlinked toolkit for cmake (ggml-org#3202)
2 parents 6ab1e63 + e519621 commit ba11eb9

19 files changed: +244, -132 lines

CMakeLists.txt

Lines changed: 3 additions & 1 deletion
@@ -118,7 +118,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
         add_custom_command(
             OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h"
             COMMENT "Generating build details from Git"
-            COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
+            COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
             DEPENDS "${GIT_DIR}/index"
             VERBATIM
@@ -162,6 +162,8 @@ if (APPLE AND LLAMA_ACCELERATE)
         message(STATUS "Accelerate framework found")

         add_compile_definitions(GGML_USE_ACCELERATE)
+        add_compile_definitions(ACCELERATE_NEW_LAPACK)
+        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
     else()
         message(WARNING "Accelerate framework not found")

Makefile

Lines changed: 2 additions & 0 deletions
@@ -305,6 +305,8 @@ ifndef LLAMA_NO_ACCELERATE
     # `-framework Accelerate` works both with Apple Silicon and Mac Intel
     ifeq ($(UNAME_S),Darwin)
         MK_CPPFLAGS += -DGGML_USE_ACCELERATE
+        MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
+        MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
         MK_LDFLAGS  += -framework Accelerate
     endif
 endif # LLAMA_NO_ACCELERATE

Package.swift

Lines changed: 2 additions & 0 deletions
@@ -45,6 +45,8 @@ let package = Package(
                 .unsafeFlags(["-Wno-shorten-64-to-32"]),
                 .define("GGML_USE_K_QUANTS"),
                 .define("GGML_USE_ACCELERATE")
+                .define("ACCELERATE_NEW_LAPACK")
+                .define("ACCELERATE_LAPACK_ILP64")
             ] + additionalSettings,
             linkerSettings: [
                 .linkedFramework("Accelerate")

README.md

Lines changed: 6 additions & 1 deletion
@@ -501,7 +501,7 @@ Building the program with BLAS support may lead to some performance improvements
     ```sh
     mkdir build
     cd build
-    cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
+    cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
     cmake --build . --config Release
     ```
   - CMake (Windows):
@@ -597,6 +597,11 @@ Several quantization methods are supported. They differ in the resulting model d
 | 13B | ms/tok @ 8th | - | 73 | 82 | 98 | 105 | 128 |
 | 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |

+- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684)
+- recent k-quants improvements
+  - [#2707](https://github.com/ggerganov/llama.cpp/pull/2707)
+  - [#2807](https://github.com/ggerganov/llama.cpp/pull/2807)
+
 ### Perplexity (measuring model quality)

 You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).

convert.py

Lines changed: 1 addition & 1 deletion
@@ -439,7 +439,7 @@ def __repr__(self) -> str:
 def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
     #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
     if n_head_kv is not None and n_head != n_head_kv:
-        n_head //= n_head_kv
+        n_head = n_head_kv
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
             .swapaxes(1, 2)
             .reshape(weights.shape))
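For readers following the permute fix: the change replaces an integer division with a straight assignment, so a grouped-query K/V tensor is reshaped with its own head count rather than the ratio of heads. Below is a minimal, self-contained sketch of the corrected logic using NumPy; the head counts and tensor sizes are illustrative only, not taken from a real checkpoint.

```python
# Minimal sketch of the corrected permute() from convert.py, assuming only NumPy.
# Shapes and head counts are illustrative examples, not values from this commit.
import numpy as np

def permute(weights: np.ndarray, n_head: int, n_head_kv: int) -> np.ndarray:
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv  # the K/V tensor carries n_head_kv heads, not n_head
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))

# Example: a fake K projection with 8 KV heads of dimension 4 (rows = 8 * 4 = 32).
w = np.arange(32 * 16, dtype=np.float32).reshape(32, 16)
out = permute(w, n_head=32, n_head_kv=8)
assert out.shape == w.shape  # the permutation only reorders rows, never changes the shape
```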

examples/gptneox-wip/falcon-main.cpp

Lines changed: 2 additions & 2 deletions
@@ -367,10 +367,10 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
     keyidx = gguf_find_key(ggufctx, "general.architecture");
     if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     keyidx = gguf_find_key(ggufctx, "general.file_type");
-    if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); }
     keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
     if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
-    keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
+    keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository");
     if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     }

examples/gptneox-wip/gptneox-main.cpp

Lines changed: 2 additions & 2 deletions
@@ -380,10 +380,10 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
     keyidx = gguf_find_key(ggufctx, "general.architecture");
     if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     keyidx = gguf_find_key(ggufctx, "general.file_type");
-    if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); }
     keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
     if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
-    keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
+    keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository");
     if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     }

examples/llama-bench/llama-bench.cpp

Lines changed: 1 addition & 1 deletion
@@ -903,7 +903,7 @@ static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads)
     }
 }

-static void llama_null_log_callback(enum llama_log_level level, const char * text, void * user_data) {
+static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
     (void) level;
     (void) text;
     (void) user_data;

examples/make-ggml.py

Lines changed: 19 additions & 14 deletions
@@ -1,22 +1,25 @@
 #!/usr/bin/env python3
 """
-This script converts Hugging Face llama models to GGML and quantizes them.
+This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them.

 Usage:
-python make-ggml.py --model {model_dir_or_hf_repo_name} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
+python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]

 Arguments:
-- --model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
+- model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
+- --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.
 - --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
 - --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
 - --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
 - --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.

-Quant types:
+Old quant types (some base model types require these):
 - Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
 - Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
 - Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
 - Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
+
+New quant types (recommended):
 - Q2_K: smallest, extreme quality loss - not recommended
 - Q3_K: alias for Q3_K_M
 - Q3_K_S: very small, very high quality loss
@@ -40,9 +43,7 @@
 import os
 from huggingface_hub import snapshot_download

-def main(model, outname, outdir, quants, keep_fp16):
-    ggml_version = "v3"
-
+def main(model, model_type, outname, outdir, quants, keep_fp16):
     if not os.path.isdir(model):
         print(f"Model not found at {model}. Downloading...")
         try:
@@ -63,31 +64,35 @@ def main(model, outname, outdir, quants, keep_fp16):
     print("Building llama.cpp")
     subprocess.run(f"cd .. && make quantize", shell=True, check=True)

-    fp16 = f"{outdir}/{outname}.ggml{ggml_version}.fp16.bin"
+    fp16 = f"{outdir}/{outname}.gguf.fp16.bin"

-    print(f"Making unquantised GGML at {fp16}")
+    print(f"Making unquantised GGUF at {fp16}")
     if not os.path.isfile(fp16):
-        subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
+        if model_type != "llama":
+            subprocess.run(f"python3 ../convert-{model_type}-hf-to-gguf.py {model} 1 --outfile {fp16}", shell=True, check=True)
+        else:
+            subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
     else:
         print(f"Unquantised GGML already exists at: {fp16}")

     print("Making quants")
     for type in quants:
-        outfile = f"{outdir}/{outname}.ggml{ggml_version}.{type}.bin"
+        outfile = f"{outdir}/{outname}.gguf.{type}.bin"
         print(f"Making {type} : {outfile}")
         subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)

     if not keep_fp16:
         os.remove(fp16)

 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Convert/Quantize HF to GGML. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
-    parser.add_argument('--model', required=True, help='Downloaded model dir or Hugging Face model repo name')
+    parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
+    parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name')
+    parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.')
     parser.add_argument('--outname', default=None, help='Output model(s) name')
     parser.add_argument('--outdir', default=None, help='Output directory')
     parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
     parser.add_argument('--keep_fp16', action='store_true', help='Keep fp16 model', default=False)

     args = parser.parse_args()

-    main(args.model, args.outname, args.outdir, args.quants, args.keep_fp16)
+    main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16)
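To make the new interface concrete, here is a hedged sketch of driving the updated script programmatically, mirroring its own subprocess style. The Hugging Face repo name and quant list are illustrative assumptions, not values from this commit, and the command must be run from the examples/ folder as the description notes.

```python
# Hypothetical invocation of the updated make-ggml.py (run from the examples/ folder).
# The repo name and quant choices are illustrative only.
import subprocess

subprocess.run(
    "python3 make-ggml.py tiiuae/falcon-7b --model_type falcon --quants Q4_K_M Q5_K_S",
    shell=True, check=True,
)
```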

examples/perplexity/README.md

Lines changed: 18 additions & 0 deletions
@@ -1,3 +1,21 @@
 # perplexity

 TODO
+
+## Llama 2 70B Scorechart
+Quantization | Model size (GiB) | Perplexity | Delta to fp16
+-- | -- | -- | --
+Q4_0 | 36.20 | 3.5550 | 3.61%
+Q4_1 | 40.20 | 3.5125 | 2.37%
+Q5_0 | 44.20 | 3.4744 | 1.26%
+Q2_K | 27.27 | 3.7339 | 8.82%
+Q3_K_S | 27.86 | 3.7019 | 7.89%
+Q3_K_M | 30.83 | 3.5932 | 4.72%
+Q3_K_L | 33.67 | 3.5617 | 3.80%
+Q4_K_S | 36.39 | 3.4852 | 1.57%
+Q4_K_M | 38.54 | 3.4725 | 1.20%
+Q5_K_S | 44.20 | 3.4483 | 0.50%
+Q5_K_M | 45.41 | 3.4451 | 0.40%
+Q6_K | 52.70 | 3.4367 | 0.16%
+fp16 | 128.5 | 3.4313 | -
+
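As a reading aid for the new table (not part of the commit): the Delta column appears to be the relative perplexity increase over the fp16 baseline, and a quick check under that assumption reproduces the Q4_0 entry.

```python
# Sketch of how the "Delta to fp16" column appears to be derived, assuming it is the
# relative perplexity increase over the fp16 baseline (values copied from the table above).
ppl_fp16 = 3.4313
ppl_q4_0 = 3.5550

delta_pct = (ppl_q4_0 - ppl_fp16) / ppl_fp16 * 100
print(f"{delta_pct:.2f}%")  # prints 3.61%, matching the Q4_0 row
```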

examples/quantize/README.md

Lines changed: 41 additions & 0 deletions
@@ -1,3 +1,44 @@
 # quantize

 TODO
+
+## Llama 2 7B
+
+Quantization | Bits per Weight (BPW)
+-- | --
+Q2_K | 3.35
+Q3_K_S | 3.50
+Q3_K_M | 3.91
+Q3_K_L | 4.27
+Q4_K_S | 4.58
+Q4_K_M | 4.84
+Q5_K_S | 5.52
+Q5_K_M | 5.68
+Q6_K | 6.56
+
+## Llama 2 13B
+Quantization | Bits per Weight (BPW)
+-- | --
+Q2_K | 3.34
+Q3_K_S | 3.48
+Q3_K_M | 3.89
+Q3_K_L | 4.26
+Q4_K_S | 4.56
+Q4_K_M | 4.83
+Q5_K_S | 5.51
+Q5_K_M | 5.67
+Q6_K | 6.56
+
+# Llama 2 70B
+
+Quantization | Bits per Weight (BPW)
+-- | --
+Q2_K | 3.40
+Q3_K_S | 3.47
+Q3_K_M | 3.85
+Q3_K_L | 4.19
+Q4_K_S | 4.53
+Q4_K_M | 4.80
+Q5_K_S | 5.50
+Q5_K_M | 5.65
+Q6_K | 6.56
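For orientation only (not part of the commit): bits per weight ties these tables to the file sizes added in examples/perplexity/README.md, since BPW times parameter count gives the model size. The sketch below assumes roughly 69 billion parameters for Llama 2 70B, a figure not stated in this diff.

```python
# Rough cross-check between bits-per-weight and file size for Llama 2 70B.
# The parameter count (~68.98e9) is an assumption, not a value from this commit.
n_params = 68.98e9
bpw_q4_k_m = 4.80                 # Q4_K_M from the 70B table above

size_gib = n_params * bpw_q4_k_m / 8 / 2**30
print(f"{size_gib:.2f} GiB")      # ~38.5 GiB, in line with the 38.54 GiB listed in examples/perplexity/README.md
```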

flake.nix

Lines changed: 21 additions & 0 deletions
@@ -35,6 +35,20 @@
         );
         pkgs = import nixpkgs { inherit system; };
         nativeBuildInputs = with pkgs; [ cmake ninja pkg-config ];
+        cudatoolkit_joined = with pkgs; symlinkJoin {
+          # HACK(Green-Sky): nix currently has issues with cmake findcudatoolkit
+          # see https://github.com/NixOS/nixpkgs/issues/224291
+          # copied from jaxlib
+          name = "${cudaPackages.cudatoolkit.name}-merged";
+          paths = [
+            cudaPackages.cudatoolkit.lib
+            cudaPackages.cudatoolkit.out
+          ] ++ lib.optionals (lib.versionOlder cudaPackages.cudatoolkit.version "11") [
+            # for some reason some of the required libs are in the targets/x86_64-linux
+            # directory; not sure why but this works around it
+            "${cudaPackages.cudatoolkit}/targets/${system}"
+          ];
+        };
         llama-python =
           pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
         postPatch = ''
@@ -70,6 +84,13 @@
           "-DLLAMA_CLBLAST=ON"
         ];
       };
+      packages.cuda = pkgs.stdenv.mkDerivation {
+        inherit name src meta postPatch nativeBuildInputs postInstall;
+        buildInputs = with pkgs; buildInputs ++ [ cudatoolkit_joined ];
+        cmakeFlags = cmakeFlags ++ [
+          "-DLLAMA_CUBLAS=ON"
+        ];
+      };
       packages.rocm = pkgs.stdenv.mkDerivation {
         inherit name src meta postPatch nativeBuildInputs postInstall;
         buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ];

ggml-metal.h

Lines changed: 4 additions & 0 deletions
@@ -19,6 +19,8 @@

 #pragma once

+#include "ggml.h"
+
 #include <stddef.h>
 #include <stdbool.h>

@@ -33,6 +35,8 @@ struct ggml_cgraph;
 extern "C" {
 #endif

+void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+
 struct ggml_metal_context;

 // number of command buffers to use
