
Commit 8f98035

Merge branch 'master' into HEAD

2 parents 67ba34e + 9225bae

30 files changed: +3524 −531 lines

CMakeLists.txt

Lines changed: 18 additions & 21 deletions
@@ -75,6 +75,7 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_METAL "llama: use Metal" OFF)
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)
+option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -225,6 +226,14 @@ if (LLAMA_BLAS)
     endif()
 endif()

+if (LLAMA_K_QUANTS)
+    set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
+    add_compile_definitions(GGML_USE_K_QUANTS)
+    if (LLAMA_QKK_64)
+        add_compile_definitions(GGML_QKK_64)
+    endif()
+endif()
+
 if (LLAMA_CUBLAS)
     cmake_minimum_required(VERSION 3.17)
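The k-quants block now lives before the backend sections and picks up the new `LLAMA_QKK_64` option. For reference, a minimal configure sketch from a clean build directory (generator and other options are up to you):

```sh
mkdir build && cd build
cmake .. -DLLAMA_K_QUANTS=ON -DLLAMA_QKK_64=ON
cmake --build . --config Release
```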

@@ -250,6 +259,15 @@ if (LLAMA_CUBLAS)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()

+    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        if (LLAMA_CUDA_DMMV_F16)
+            set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
+        else()
+            set(CMAKE_CUDA_ARCHITECTURES "52") # lowest CUDA 12 standard
+        endif()
+    endif()
+    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+
 else()
     message(WARNING "cuBLAS not found")
 endif()
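These defaults only apply when `CMAKE_CUDA_ARCHITECTURES` is not already set, so targeting other GPUs is a matter of passing an explicit list at configure time. A sketch (the architecture list here is illustrative, not a recommendation):

```sh
# semicolon-separated list of compute capabilities to build for
cmake .. -DLLAMA_CUBLAS=ON -DCMAKE_CUDA_ARCHITECTURES="61;70;75"
```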
@@ -280,11 +298,6 @@ if (LLAMA_METAL)
     )
 endif()

-if (LLAMA_K_QUANTS)
-    set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
-    add_compile_definitions(GGML_USE_K_QUANTS)
-endif()
-
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
@@ -493,22 +506,6 @@ if (BUILD_SHARED_LIBS)
     endif()
 endif()

-if (GGML_SOURCES_CUDA)
-    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
-    set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "native")
-    set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
-
-    set_property(TARGET ggml_static PROPERTY CUDA_ARCHITECTURES "native")
-    set_property(TARGET ggml_static PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
-
-    if (BUILD_SHARED_LIBS)
-        set_property(TARGET ggml_shared PROPERTY CUDA_ARCHITECTURES "native")
-        set_property(TARGET ggml_shared PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
-    endif()
-
-    set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES "native")
-endif()
-

 #
 # programs, examples and tests

Makefile

Lines changed: 8 additions & 1 deletion
@@ -43,8 +43,11 @@ endif

 # keep standard at C11 and C++11
 # -Ofast tends to produce faster code, but may not be available for some compilers.
-#OPT = -Ofast
+ifdef LLAMA_FAST
+OPT = -Ofast
+else
 OPT = -O3
+endif
 CFLAGS = -I. $(OPT) -std=c11 -fPIC
 CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
 LDFLAGS =
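This makes the faster but less strictly conforming `-Ofast` opt-in instead of a commented-out default. A possible invocation, assuming your compiler supports `-Ofast`:

```sh
make clean
make LLAMA_FAST=1
```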
@@ -131,6 +134,10 @@ ifndef LLAMA_NO_K_QUANTS
 CFLAGS += -DGGML_USE_K_QUANTS
 CXXFLAGS += -DGGML_USE_K_QUANTS
 OBJS += k_quants.o
+ifdef LLAMA_QKK_64
+CFLAGS += -DGGML_QKK_64
+CXXFLAGS += -DGGML_QKK_64
+endif
 endif

 ifndef LLAMA_NO_ACCELERATE
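This is the Makefile counterpart of the CMake `LLAMA_QKK_64` option added above; a sketch of building with it:

```sh
make clean
make LLAMA_QKK_64=1
```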

README.md

Lines changed: 19 additions & 10 deletions
@@ -5,16 +5,16 @@
 [![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)

+[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
+
 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 **Hot topics:**

-- Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729
-- GPU support with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642
-- High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684
-- Multi-GPU support: https://github.com/ggerganov/llama.cpp/pull/1607
-- Training LLaMA models from scratch: https://github.com/ggerganov/llama.cpp/pull/1652
-- CPU threading improvements: https://github.com/ggerganov/llama.cpp/pull/1632
+- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
+- New roadmap: https://github.com/users/ggerganov/projects/7
+- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
+- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1

 <details>
 <summary>Table of Contents</summary>
@@ -33,6 +33,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
     <li><a href="#quantization">Quantization</a></li>
     <li><a href="#interactive-mode">Interactive mode</a></li>
     <li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li>
+    <li><a href="#using-openllama">Using OpenLLaMA</a></li>
     <li><a href="#using-gpt4all">Using GPT4All</a></li>
     <li><a href="#using-pygmalion-7b--metharme-7b">Using Pygmalion 7B & Metharme 7B</a></li>
     <li><a href="#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data">Obtaining the Facebook LLaMA original model and Stanford Alpaca model data</a></li>
@@ -344,7 +345,7 @@ Building the program with BLAS support may lead to some performance improvements
 | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_Y | Positive integer | 1 | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
-| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value 2 1 can improve performance for slow GPUs. |
+| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
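A docs typo fix ("value 2 1" becomes "value to 1"). On a slow GPU the option might be set like this, a Makefile-style sketch (the CMake cache variable of the same name should work analogously):

```sh
make LLAMA_CUBLAS=1 LLAMA_CUDA_KQUANTS_ITER=1
```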

 - #### CLBlast
@@ -378,7 +379,7 @@ Building the program with BLAS support may lead to some performance improvements
   ```sh
   git clone https://github.com/CNugteren/CLBlast.git
   mkdir CLBlast/build
-  cd CLBLast/build
+  cd CLBlast/build
   cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
   cmake --build . --config Release
   cmake --install . --prefix /some/path
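This fixes the `CLBLast`/`CLBlast` case typo. Once CLBlast is installed, llama.cpp can be configured against it; a sketch, assuming the install prefix above and that CLBlast's CMake package files land under it:

```sh
# CLBlast_DIR path is an assumption; point it at your CLBlast CMake package
cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path/lib/cmake/CLBlast
```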
@@ -547,6 +548,13 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
 >
 ```

+### Using [OpenLLaMA](https://github.com/openlm-research/open_llama)
+
+OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights.
+
+- Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face.
+- Convert the model to ggml FP16 format using `python convert.py <path to OpenLLaMA directory>`
+
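For instance, a possible end-to-end sketch (directory and output file names are assumptions; adjust to what `convert.py` actually emits for you):

```sh
python convert.py models/open_llama_7b
# optionally quantize the resulting FP16 file (file names assumed)
./quantize models/open_llama_7b/ggml-model-f16.bin models/open_llama_7b/ggml-model-q4_0.bin q4_0
```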
 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)

 - Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
@@ -676,12 +684,13 @@ Upon completion of the aforementioned steps, you will have successfully compiled
 ```
 GGML_OPENCL_PLATFORM=0
 GGML_OPENCL_DEVICE=0
-export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH
-./main (...)
+export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
 ```

 For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.

+Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.
+
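Following the README's own suggestion, such a script might look like this (a sketch; the model file name and prompt are placeholders):

```sh
#!/bin/sh
# hypothetical run.sh; adjust the model path to the file you placed in models/
export GGML_OPENCL_PLATFORM=0
export GGML_OPENCL_DEVICE=0
export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
./main -m models/ggml-model-q4_0.bin -p "Hello"
```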
 ### Docker

 #### Prerequisites

build.zig

Lines changed: 44 additions & 47 deletions
@@ -1,61 +1,58 @@
 const std = @import("std");

+// Zig Version: 0.11.0-dev.3379+629f0d23b
 pub fn build(b: *std.build.Builder) void {
     const target = b.standardTargetOptions(.{});
-    const optimize = b.standardReleaseOptions();
-    const want_lto = b.option(bool, "lto", "Want -fLTO");
-
-    const lib = b.addStaticLibrary("llama", null);
-    lib.want_lto = want_lto;
-    lib.setTarget(target);
-    lib.setBuildMode(optimize);
+    const optimize = b.standardOptimizeOption(.{});
+    const lib = b.addStaticLibrary(.{
+        .name = "llama",
+        .target = target,
+        .optimize = optimize,
+    });
+    lib.linkLibC();
     lib.linkLibCpp();
     lib.addIncludePath(".");
-    lib.addIncludePath("examples");
+    lib.addIncludePath("./examples");
     lib.addCSourceFiles(&.{
         "ggml.c",
     }, &.{"-std=c11"});
     lib.addCSourceFiles(&.{
         "llama.cpp",
     }, &.{"-std=c++11"});
-    lib.install();
-
-    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };
-
-    const exe = build_example("main", build_args);
-    _ = build_example("quantize", build_args);
-    _ = build_example("perplexity", build_args);
-    _ = build_example("embedding", build_args);
-
-    // create "zig build run" command for ./main
-
-    const run_cmd = exe.run();
-    run_cmd.step.dependOn(b.getInstallStep());
-    if (b.args) |args| {
-        run_cmd.addArgs(args);
+    b.installArtifact(lib);
+
+    const examples = .{
+        "main",
+        "baby-llama",
+        "embedding",
+        // "metal",
+        "perplexity",
+        "quantize",
+        "quantize-stats",
+        "save-load-state",
+        // "server",
+        "simple",
+        "train-text-from-scratch",
+    };
+
+    inline for (examples) |example_name| {
+        const exe = b.addExecutable(.{
+            .name = example_name,
+            .target = target,
+            .optimize = optimize,
+        });
+        exe.addIncludePath(".");
+        exe.addIncludePath("./examples");
+        exe.addCSourceFiles(&.{
+            std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}),
+            "examples/common.cpp",
+        }, &.{"-std=c++11"});
+        exe.linkLibrary(lib);
+        b.installArtifact(exe);
+        const run_cmd = b.addRunArtifact(exe);
+        run_cmd.step.dependOn(b.getInstallStep());
+        if (b.args) |args| run_cmd.addArgs(args);
+        const run_step = b.step("run_" ++ example_name, "Run the app");
+        run_step.dependOn(&run_cmd.step);
     }
-
-    const run_step = b.step("run", "Run the app");
-    run_step.dependOn(&run_cmd.step);
-}
-
-fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
-    const b = args.b;
-    const lib = args.lib;
-    const want_lto = args.want_lto;
-
-    const exe = b.addExecutable(name, null);
-    exe.want_lto = want_lto;
-    lib.setTarget(args.target);
-    lib.setBuildMode(args.optimize);
-    exe.addIncludePath(".");
-    exe.addIncludePath("examples");
-    exe.addCSourceFiles(&.{
-        std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
-        "examples/common.cpp",
-    }, &.{"-std=c++11"});
-    exe.linkLibrary(lib);
-    exe.install();
-
-    return exe;
 }
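This ports the build script to the Zig 0.11 dev API (`standardOptimizeOption`, `addStaticLibrary` with an options struct, `b.installArtifact`, `b.addRunArtifact`) and gives every example its own `run_<name>` step via `"run_" ++ example_name`. Usage might look like this (the model path is a placeholder):

```sh
zig build
zig build run_main -- -m models/7B/ggml-model-q4_0.bin -p "Hello"
```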
