
Commit cde52d6

Merge 'origin/master' into hipblas
2 parents: 8e8054a + 84e09a7


52 files changed: +3388 additions, -1554 deletions

.gitignore

Lines changed: 19 additions & 1 deletion
@@ -16,6 +16,8 @@ build/
 build-em/
 build-debug/
 build-release/
+build-ci-debug/
+build-ci-release/
 build-static/
 build-cublas/
 build-opencl/
@@ -25,9 +27,10 @@ build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
 out/
+tmp/
 
 models/*
-*.bin
+models-mnt
 
 /main
 /quantize
@@ -58,3 +61,18 @@ qnt-*.txt
 perf-*.txt
 
 examples/jeopardy/results.txt
+
+
+pyproject.toml
+poetry.lock
+poetry.toml
+
+# Test binaries
+tests/test-double-float
+tests/test-grad0
+tests/test-opt
+tests/test-quantize-fns
+tests/test-quantize-perf
+tests/test-sampling
+tests/test-tokenizer-0
+

CMakeLists.txt

Lines changed: 25 additions & 0 deletions
@@ -547,6 +547,7 @@ if (BUILD_SHARED_LIBS)
     set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
     add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
     target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
+    install(TARGETS ggml_shared LIBRARY)
 endif()
 
 add_library(llama
@@ -568,8 +569,32 @@ if (BUILD_SHARED_LIBS)
     if (LLAMA_METAL)
         set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
     endif()
+    install(TARGETS llama LIBRARY)
 endif()
 
+include(GNUInstallDirs)
+install(
+    FILES convert.py
+    PERMISSIONS
+        OWNER_READ
+        OWNER_WRITE
+        OWNER_EXECUTE
+        GROUP_READ
+        GROUP_EXECUTE
+        WORLD_READ
+        WORLD_EXECUTE
+    DESTINATION ${CMAKE_INSTALL_BINDIR})
+install(
+    FILES convert-lora-to-ggml.py
+    PERMISSIONS
+        OWNER_READ
+        OWNER_WRITE
+        OWNER_EXECUTE
+        GROUP_READ
+        GROUP_EXECUTE
+        WORLD_READ
+        WORLD_EXECUTE
+    DESTINATION ${CMAKE_INSTALL_BINDIR})
 
 #
 # programs, examples and tests
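
The new `install(...)` rules, together with `include(GNUInstallDirs)`, mean a standard CMake install now ships the shared libraries and the two conversion scripts. A minimal sketch of how this would be exercised (the prefix path is illustrative):

```bash
# configure a shared-library build, then build and install it
cmake -B build -DBUILD_SHARED_LIBS=ON
cmake --build build --config Release

# per the new rules: libllama / libggml_shared go to <prefix>/lib, and
# convert.py / convert-lora-to-ggml.py (marked executable) go to <prefix>/bin
cmake --install build --prefix "$HOME/.local"
```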

Makefile

Lines changed: 71 additions & 16 deletions
@@ -1,5 +1,8 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server libembdinput.so embd-input-test
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test
+
+# Binaries only useful for tests
+TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
 
 default: $(BUILD_TARGETS)
 
@@ -90,6 +93,28 @@ ifeq ($(UNAME_S),Haiku)
 	CXXFLAGS += -pthread
 endif
 
+# detect Windows
+ifneq ($(findstring _NT,$(UNAME_S)),)
+	_WIN32 := 1
+endif
+
+# library name prefix
+ifneq ($(_WIN32),1)
+	LIB_PRE := lib
+endif
+
+# Dynamic Shared Object extension
+ifneq ($(_WIN32),1)
+	DSO_EXT := .so
+else
+	DSO_EXT := .dll
+endif
+
+# Windows Sockets 2 (Winsock) for network-capable apps
+ifeq ($(_WIN32),1)
+	LWINSOCK2 := -lws2_32
+endif
+
 ifdef LLAMA_GPROF
 	CFLAGS   += -pg
 	CXXFLAGS += -pg
@@ -102,7 +127,7 @@ endif
 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
-ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
+ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	# Use all CPU extensions that are available:
 	CFLAGS   += -march=native -mtune=native
 	CXXFLAGS += -march=native -mtune=native
@@ -168,8 +193,12 @@ ifdef LLAMA_CUBLAS
 	CXXFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS      += ggml-cuda.o
-	NVCC      = nvcc
 	NVCCFLAGS = --forward-unknown-to-host-compiler
+ifdef LLAMA_CUDA_NVCC
+	NVCC = $(LLAMA_CUDA_NVCC)
+else
+	NVCC = nvcc
+endif # LLAMA_CUDA_NVCC
 ifdef CUDA_DOCKER_ARCH
 	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
 else
@@ -198,19 +227,23 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
 else
 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
-
+ifdef LLAMA_CUDA_CCBIN
+	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
+endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS
 
 ifdef LLAMA_CLBLAST
-	CFLAGS   += -DGGML_USE_CLBLAST
-	CXXFLAGS += -DGGML_USE_CLBLAST
+
+	CFLAGS   += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
+	CXXFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
+
 	# Mac provides OpenCL as a framework
 	ifeq ($(UNAME_S),Darwin)
 		LDFLAGS += -lclblast -framework OpenCL
 	else
-		LDFLAGS += -lclblast -lOpenCL
+		LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
 	endif
 	OBJS    += ggml-opencl.o
 
@@ -311,17 +344,20 @@ llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
 common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
+grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 clean:
-	rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h
+	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS)
 
 #
 # Examples
 #
 
-main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+main: examples/main/main.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
@@ -346,14 +382,14 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
-libembdinput.so: examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
 
 
-embd-input-test: libembdinput.so examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
+embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
 
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -370,13 +406,32 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
 # Tests
 #
 
+tests: $(TEST_TARGETS)
+
 benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	./$@
 
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-.PHONY: tests clean
-tests:
-	bash ./tests/run-tests.sh
+tests/test-double-float: tests/test-double-float.c build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-grad0: tests/test-grad0.c build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-opt: tests/test-opt.c build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
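
Taken together, the Makefile changes make `tests` a real build target, let the CUDA compiler and its host compiler be overridden, and parameterize the embd-input library name for Windows (`embdinput.dll`) versus Unix (`libembdinput.so`). A hedged sketch of using the new knobs (paths and versions illustrative):

```bash
# build all of the new test binaries, then run one of them
make -j tests
./tests/test-sampling

# cuBLAS build with an explicit nvcc and host compiler, via the new variables
make LLAMA_CUBLAS=1 LLAMA_CUDA_NVCC=/usr/local/cuda/bin/nvcc LLAMA_CUDA_CCBIN=g++-11 main
```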

README.md

Lines changed: 18 additions & 1 deletion
@@ -242,6 +242,23 @@ In order to build llama.cpp you have three different options.
     zig build -Doptimize=ReleaseFast
     ```
 
+- Using `gmake` (FreeBSD):
+
+    1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
+    2. Add your user to the **video** group
+    3. Install compilation dependencies.
+
+        ```bash
+        sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
+            opencl clblast openblas
+
+        gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
+        ```
+
+    **Notes:** With these packages you can build llama.cpp with OpenBLAS and
+    CLBlast support, enabling OpenCL GPU acceleration on FreeBSD. Please read
+    the instructions for enabling these options further below in this document.
+
 ### Metal Build
 
 Using Metal allows the computation to be executed on the GPU for Apple devices:
@@ -384,7 +401,7 @@ Building the program with BLAS support may lead to some performance improvements
 
 | Option                  | Legal values           | Default | Description |
 |-------------------------|------------------------|---------|-------------|
-| LLAMA_CUDA_FORCE_DMMV   | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 7.0/Turing/RTX 2000 or higher). Does not affect k-quants. |
+| LLAMA_CUDA_FORCE_DMMV   | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
 | LLAMA_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_F16     | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
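
A hedged sketch of setting these options at build time, assuming they are accepted as `make` variables in the same way as the `LLAMA_CUDA_KQUANTS_ITER` option handled in the Makefile's cuBLAS section (values illustrative, not tuned):

```bash
# rebuild with DMMV kernels forced and larger per-iteration block sizes
make clean
make LLAMA_CUBLAS=1 LLAMA_CUDA_FORCE_DMMV=1 LLAMA_CUDA_DMMV_X=64 LLAMA_CUDA_MMV_Y=2 main
```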

ci/README.md

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+# CI
+
+In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions), `llama.cpp` uses a custom CI framework:
+
+https://github.com/ggml-org/ci
+
+It monitors the `master` branch for new commits and runs the
+[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
+to execute heavier workloads compared to just using Github Actions. Over time, the cloud instances will be scaled
+to cover various hardware architectures, including GPU and Apple Silicon instances.
+
+Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
+Only the branches of this repo are monitored for this keyword.
+
+It is good practice, before publishing changes, to execute the full CI locally on your machine:
+
+```bash
+mkdir tmp
+
+# CPU-only build
+bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
+# with CUDA support
+GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+```
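
The `ggml-ci` trigger described above is just a keyword the CI framework looks for in commit messages on this repo's branches; a hypothetical example:

```bash
# "ggml-ci" anywhere in the message asks the custom CI to pick up this commit
git commit -am "makefile : add test targets (ggml-ci)"
```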
