
Commit 0df7d63

Include server in releases + other build system cleanups (#1610)
- Set `LLAMA_BUILD_SERVER` in the workflow so the `server` example gets built. This currently only applies to Windows builds, because it seems like only Windows binary artifacts are included in releases.
- Add a `server` example target to the `Makefile` (still gated by the `LLAMA_BUILD_SERVER` define and not built by default).
- Fix an issue where the `vdot` binary wasn't removed when running `make clean`.
- Fix compile warnings in the `server` example.
- Add `.hpp` files to the workflow path filters so changes to them trigger builds (the server example has one).
1 parent 97c9b77 commit 0df7d63
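For building locally, the server example is opt-in and the workflow diff below shows the knob it uses. A minimal sketch of the CMake route, assuming a checkout of llama.cpp at this commit and a working CMake toolchain (any extra flags such as -DLLAMA_CUBLAS=ON are illustrative, not required):

    # configure with the server example enabled, mirroring the workflow's defines
    mkdir build
    cd build
    cmake .. -DLLAMA_BUILD_SERVER=ON
    # build everything, including the server binary, in Release mode
    cmake --build . --config Release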

3 files changed: +27 −18 lines

.github/workflows/build.yml

Lines changed: 8 additions & 8 deletions
@@ -10,10 +10,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
 
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -157,15 +157,15 @@ jobs:
       matrix:
         include:
           - build: 'avx2'
-            defines: ''
+            defines: '-DLLAMA_BUILD_SERVER=ON'
           - build: 'avx'
-            defines: '-DLLAMA_AVX2=OFF'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
           - build: 'avx512'
-            defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'clblast'
-            defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
           - build: 'openblas'
-            defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
 
     steps:
       - name: Clone
@@ -292,7 +292,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_CUBLAS=ON
+          cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
           cmake --build . --config Release
 
       - name: Get commit hash

Makefile

Lines changed: 11 additions & 2 deletions
@@ -1,5 +1,11 @@
 # Define the default target now so that it is always the first target
-default: main quantize quantize-stats perplexity embedding vdot
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot
+
+ifdef LLAMA_BUILD_SERVER
+	BUILD_TARGETS += server
+endif
+
+default: $(BUILD_TARGETS)
 
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@@ -210,7 +216,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h
+	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h
 
 #
 # Examples
@@ -237,6 +243,9 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
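The Makefile route follows the same opt-in pattern as the BUILD_TARGETS change above. A rough usage sketch, assuming GNU make on a Unix-like system (defining LLAMA_BUILD_SERVER in the environment or on the make command line makes the ifdef take effect; the explicit server target works either way):

    # include the server example in the default build
    LLAMA_BUILD_SERVER=1 make
    # or build just the server example directly
    make server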

examples/server/server.cpp

Lines changed: 8 additions & 8 deletions
@@ -61,7 +61,7 @@ struct llama_server_context
         std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
         // compare the evaluated prompt with the new prompt
         int new_prompt_len = 0;
-        for (int i = 0;i < prompt_tokens.size(); i++) {
+        for (size_t i = 0; i < prompt_tokens.size(); i++) {
             if (i < processed_tokens.size() &&
                 processed_tokens[i] == prompt_tokens[i])
             {
@@ -71,7 +71,7 @@ struct llama_server_context
             {
                 embd_inp.push_back(prompt_tokens[i]);
                 if(new_prompt_len == 0) {
-                    if(i - 1 < n_past) {
+                    if(int32_t(i) - 1 < n_past) {
                         processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end());
                     }
                     // Evaluate the new fragment prompt from the last token processed.
@@ -136,7 +136,7 @@ struct llama_server_context
         {
             // out of user input, sample next token
             const float temp = params.temp;
-            const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+            // const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
             const float top_p = params.top_p;
             const float tfs_z = params.tfs_z;
             const float typical_p = params.typical_p;
@@ -306,12 +306,12 @@ struct llama_server_context
         // Avoid add the no show words to the response
         for (std::vector<llama_token> word_tokens : no_show_words)
         {
-            int match_token = 1;
+            size_t match_token = 1;
             if (tokens_predicted.front() == word_tokens.front())
             {
                 bool execute_matching = true;
                 if (tokens_predicted.size() > 1) { // if previus tokens had been tested
-                    for (int i = 1; i < word_tokens.size(); i++)
+                    for (size_t i = 1; i < word_tokens.size(); i++)
                     {
                         if (i >= tokens_predicted.size()) {
                             match_token = i;
@@ -601,7 +601,7 @@ int main(int argc, char **argv)
 
     Server svr;
 
-    svr.Get("/", [](const Request &req, Response &res)
+    svr.Get("/", [](const Request &, Response &res)
             { res.set_content("<h1>llama.cpp server works</h1>", "text/html"); });
 
     svr.Post("/completion", [&llama](const Request &req, Response &res)
@@ -649,7 +649,7 @@ int main(int argc, char **argv)
                     {"tokens_predicted", llama.num_tokens_predicted}};
                 return res.set_content(data.dump(), "application/json");
             }
-            catch (json::exception e)
+            catch (const json::exception &e)
             {
                 // Some tokens have bad UTF-8 strings, the json parser is very sensitive
                 json data = {
@@ -701,7 +701,7 @@ int main(int argc, char **argv)
                     {"content", result },
                     {"stop", !llama.has_next_token }};
                 return res.set_content(data.dump(), "application/json");
-            } catch (json::exception e) {
+            } catch (const json::exception &e) {
                 // Some tokens have bad UTF-8 strings, the json parser is very sensitive
                 json data = {
                     {"content", "" },