
Commit 5c23503

Merge of 2 parents: d3b06e3 + 0eb332a


61 files changed: +8626, -7579 lines

.github/ISSUE_TEMPLATE/custom.md renamed to .github/ISSUE_TEMPLATE/bug.md

Lines changed: 4 additions & 5 deletions
@@ -1,8 +1,7 @@
 ---
-name: Issue and enhancement template
-about: Used to report issues and request enhancements for llama.cpp
-title: "[User] Insert summary of your issue or enhancement.."
-labels: ''
+name: Bug template
+about: Used to report bugs in llama.cpp
+labels: ["bug-unconfirmed"]
 assignees: ''
 
 ---
@@ -46,7 +45,7 @@ $ g++ --version
 
 # Failure Information (for bugs)
 
-Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template.
+Please help provide information about the failure / bug.
 
 # Steps to Reproduce

.github/ISSUE_TEMPLATE/enhancement.md

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+---
+name: Enhancement template
+about: Used to request enhancements for llama.cpp
+labels: ["enhancement"]
+assignees: ''
+
+---
+
+# Prerequisites
+
+Please answer the following questions for yourself before submitting an issue.
+
+- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
+- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
+- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
+
+# Feature Description
+
+Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
+
+# Motivation
+
+Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
+
+# Possible Implementation
+
+If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@
 .DS_Store
 .build/
 .cache/
+.ccls-cache/
 .direnv/
 .envrc
 .swiftpm

CMakeLists.txt

Lines changed: 12 additions & 8 deletions
@@ -82,6 +82,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS "llama: use CUDA" OFF)
 #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
+option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
@@ -93,7 +94,6 @@ option(LLAMA_CLBLAST "llama: use CLBlast"
 option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
-option(LLAMA_K_QUANTS "llama: use k-quants" ON)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
@@ -277,13 +277,8 @@ if (LLAMA_BLAS)
     endif()
 endif()
 
-if (LLAMA_K_QUANTS)
-    set(GGML_HEADERS_EXTRA k_quants.h)
-    set(GGML_SOURCES_EXTRA k_quants.c)
-    add_compile_definitions(GGML_USE_K_QUANTS)
-    if (LLAMA_QKK_64)
-        add_compile_definitions(GGML_QKK_64)
-    endif()
+if (LLAMA_QKK_64)
+    add_compile_definitions(GGML_QKK_64)
 endif()
 
 if (LLAMA_CUBLAS)
@@ -305,6 +300,9 @@ if (LLAMA_CUBLAS)
     if (LLAMA_CUDA_FORCE_DMMV)
         add_compile_definitions(GGML_CUDA_FORCE_DMMV)
     endif()
+    if (LLAMA_CUDA_FORCE_MMQ)
+        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+    endif()
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -331,6 +329,7 @@ if (LLAMA_CUBLAS)
         set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
     else()
         set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+        #set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
     endif()
 endif()
 message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
@@ -404,6 +403,9 @@ if (LLAMA_HIPBLAS)
     if (LLAMA_CUDA_FORCE_DMMV)
         target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
     endif()
+    if (LLAMA_CUDA_FORCE_MMQ)
+        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
+    endif()
     target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
@@ -665,6 +667,8 @@ add_library(ggml OBJECT
             ggml-alloc.h
             ggml-backend.c
             ggml-backend.h
+            ggml-quants.c
+            ggml-quants.h
             ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
            ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
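
Note: with the LLAMA_K_QUANTS option removed, the k-quants code (now in ggml-quants.c) is always built, and GGML_CUDA_FORCE_MMQ becomes a configure-time switch. A minimal configure sketch under that reading (the build directory name and Release config are illustrative, not part of this commit):

$ cmake -B build -DLLAMA_CUBLAS=ON -DLLAMA_CUDA_FORCE_MMQ=ON
$ cmake --build build --config Release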

Makefile

Lines changed: 13 additions & 19 deletions
@@ -345,13 +345,9 @@ else
 MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
 
-ifndef LLAMA_NO_K_QUANTS
-MK_CPPFLAGS += -DGGML_USE_K_QUANTS
-OBJS += k_quants.o
 ifdef LLAMA_QKK_64
 MK_CPPFLAGS += -DGGML_QKK_64
 endif
-endif
 
 ifndef LLAMA_NO_ACCELERATE
 # Mac OS - include Accelerate framework.
@@ -368,7 +364,7 @@ ifdef LLAMA_MPI
 MK_CPPFLAGS += -DGGML_USE_MPI
 MK_CFLAGS += -Wno-cast-qual
 MK_CXXFLAGS += -Wno-cast-qual
-OBJS += ggml-mpi.o
+OBJS += ggml-mpi.o
 endif # LLAMA_MPI
 
 ifdef LLAMA_OPENBLAS
@@ -385,7 +381,7 @@ endif # LLAMA_BLIS
 ifdef LLAMA_CUBLAS
 MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
-OBJS += ggml-cuda.o
+OBJS += ggml-cuda.o
 NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
 ifdef LLAMA_CUDA_NVCC
 NVCC = $(LLAMA_CUDA_NVCC)
@@ -394,15 +390,15 @@ else
 endif #LLAMA_CUDA_NVCC
 ifdef CUDA_DOCKER_ARCH
 NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
-endif # CUDA_DOCKER_ARCH
-ifdef CUDA_NATIVE_ARCH
-NVCCFLAGS += -arch=$(CUDA_NATIVE_ARCH)
 else
 NVCCFLAGS += -arch=native
-endif # CUDA_NATIVE_ARCH
+endif # CUDA_DOCKER_ARCH
 ifdef LLAMA_CUDA_FORCE_DMMV
 NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
+ifdef LLAMA_CUDA_FORCE_MMQ
+NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
+endif # LLAMA_CUDA_FORCE_MMQ
 ifdef LLAMA_CUDA_DMMV_X
 NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
@@ -500,11 +496,6 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI
 
-ifndef LLAMA_NO_K_QUANTS
-k_quants.o: k_quants.c k_quants.h
-	$(CC) $(CFLAGS) -c $< -o $@
-endif # LLAMA_NO_K_QUANTS
-
 # combine build flags with cmdline overrides
 override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
 override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
@@ -545,15 +536,18 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
-OBJS += ggml-alloc.o ggml-backend.o
+ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
 
 llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h
-COMMON_DEPS = $(COMMON_H_DEPS) common.o sampling.o grammar-parser.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
+COMMON_DEPS = common.o sampling.o grammar-parser.o
 
-common.o: common/common.cpp $(COMMON_H_DEPS)
+common.o: common/common.cpp build-info.h $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 sampling.o: common/sampling.cpp $(COMMON_H_DEPS)
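
Note: the Makefile no longer recognizes LLAMA_NO_K_QUANTS and instead accepts LLAMA_CUDA_FORCE_MMQ. A hedged example invocation combining it with the existing cuBLAS switch (the job count is arbitrary):

$ make LLAMA_CUBLAS=1 LLAMA_CUDA_FORCE_MMQ=1 -j8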

Package.swift

Lines changed: 1 addition & 2 deletions
@@ -42,13 +42,12 @@ let package = Package(
                 "llama.cpp",
                 "ggml-alloc.c",
                 "ggml-backend.c",
-                "k_quants.c",
+                "ggml-quants.c",
             ] + additionalSources,
             resources: resources,
             publicHeadersPath: "spm-headers",
             cSettings: [
                 .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_K_QUANTS"),
                 .define("GGML_USE_ACCELERATE")
                 // NOTE: NEW_LAPACK will required iOS version 16.4+
                 // We should consider add this in the future when we drop support for iOS 14
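
Note: the Swift package swaps k_quants.c for ggml-quants.c and drops the GGML_USE_K_QUANTS define; nothing changes for package consumers. A standard SwiftPM build (not specific to this commit) should still work:

$ swift build -c release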

build.zig

Lines changed: 8 additions & 13 deletions
@@ -116,15 +116,10 @@ pub fn build(b: *std.build.Builder) !void {
     var make = try Maker.init(b);
     make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
 
-    if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
-        try make.addFlag("-DGGML_USE_K_QUANTS");
-        const k_quants = make.obj("k_quants", "k_quants.c");
-        try make.objs.append(k_quants);
-    }
-
     const ggml = make.obj("ggml", "ggml.c");
     const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
     const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
+    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
     const llama = make.obj("llama", "llama.cpp");
     const common = make.obj("common", "common/common.cpp");
     const console = make.obj("console", "common/console.cpp");
@@ -133,14 +128,14 @@ pub fn build(b: *std.build.Builder) !void {
     const train = make.obj("train", "common/train.cpp");
     const clip = make.obj("clip", "examples/llava/clip.cpp");
 
-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });
 
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser, clip });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, grammar_parser, clip });
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }
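
Note: build.zig no longer exposes a k-quants option; ggml-quants.c is compiled unconditionally and linked into every executable. The existing lto option is unchanged, e.g. (the value shown is only an example):

$ zig build -Dlto=true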
