Skip to content

Commit a057bff

Browse files
ggerganov and phymbert
authored and committed
server : refactor (ggml-org#5882)
* server : refactoring (wip) * server : remove llava/clip objects from build * server : fix empty prompt handling + all slots idle logic * server : normalize id vars * server : code style * server : simplify model chat template validation * server : code style * server : minor * llama : llama_chat_apply_template support null buf * server : do not process embedding requests when disabled * server : reorganize structs and enums + naming fixes * server : merge oai.hpp in utils.hpp * server : refactor system prompt update at start * server : disable cached prompts with self-extend * server : do not process more than n_batch tokens per iter * server: tests: embeddings use a real embeddings model (ggml-org#5908) * server, tests : bump batch to fit 1 embedding prompt * server: tests: embeddings fix build type Debug is randomly failing (ggml-org#5911) * server: tests: embeddings, use different KV Cache size * server: tests: embeddings, fixed prompt do not exceed n_batch, increase embedding timeout, reduce number of concurrent embeddings * server: tests: embeddings, no need to wait for server idle as it can timout * server: refactor: clean up http code (ggml-org#5912) * server : avoid n_available var ggml-ci * server: refactor: better http codes * server : simplify json parsing + add comment about t_last * server : rename server structs * server : allow to override FQDN in tests ggml-ci * server : add comments --------- Co-authored-by: Pierrick Hymbert <[email protected]>
1 parent 71917d8 commit a057bff

File tree

14 files changed

+2265
-2711
lines changed

14 files changed

+2265
-2711
lines changed

.github/workflows/server.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,8 @@ jobs:
5858
cmake \
5959
python3-pip \
6060
wget \
61-
psmisc
61+
psmisc \
62+
language-pack-en
6263
6364
- name: Build
6465
id: cmake_build

Makefile

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -724,10 +724,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
724724
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
725725
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
726726

727-
server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
727+
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
728728
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
729-
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
730-
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
729+
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
731730

732731
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
733732
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)

examples/server-embd.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ async def main():
1313
model_url = "http://127.0.0.1:6900"
1414
responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
1515
url= f"{model_url}/embedding",
16-
json= {"content": str(i)*1024}
16+
json= {"content": str(0)*1024}
1717
) for i in range(n)])
1818

1919
for response in responses:

examples/server/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
set(TARGET server)
22
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
33
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
4-
add_executable(${TARGET} server.cpp oai.hpp utils.hpp json.hpp httplib.h)
4+
add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
55
install(TARGETS ${TARGET} RUNTIME)
66
target_compile_definitions(${TARGET} PRIVATE
77
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
88
)
9-
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
9+
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
1010
if (WIN32)
1111
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
1212
endif()

examples/server/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -436,7 +436,7 @@ Notice that each `probs` is an array of length `n_probs`.
436436
"next_token": {
437437
"has_next_token": true,
438438
"n_remain": -1,
439-
"num_tokens_predicted": 0,
439+
"n_decoded": 0,
440440
"stopped_eos": false,
441441
"stopped_limit": false,
442442
"stopped_word": false,

examples/server/oai.hpp

Lines changed: 0 additions & 225 deletions
This file was deleted.

0 commit comments

Comments (0)