
Commit 1c56553

ngxson authored and hodlen committed
server : refactored the task processing logic (ggml-org#5065)
* server: add llama_server_queue struct
* server: add llama_server_response_event
* server: add comments
* server: move all mutexes away from server.cpp
* server: correct multitask response
* server: only add back deferred tasks when one slot is available
* server: fix a race condition cause by "request_completion"
1 parent b5c088a commit 1c56553
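
The commit message above describes splitting the server into a task queue plus a response collector, with all locking moved out of server.cpp and with tasks deferred until a slot is free. The sketch below is only an illustration of that pattern, not the code added by this commit; the type and member names (task_server, queue_tasks, queue_tasks_deferred, post, defer, start_loop) are assumptions chosen to mirror the commit message, not the real definitions in examples/server/utils.hpp.

    // Illustrative sketch only -- not the actual llama_server_queue from this commit.
    #include <condition_variable>
    #include <deque>
    #include <functional>
    #include <mutex>

    struct task_server {
        int id;
        // ... task payload (prompt, sampling parameters, ...) ...
    };

    struct llama_server_queue_sketch {
        std::mutex mutex_tasks;                  // all locking lives in the queue, not in server.cpp
        std::condition_variable condition_tasks;
        std::deque<task_server> queue_tasks;
        std::deque<task_server> queue_tasks_deferred;
        std::function<void(task_server &)> callback_new_task;

        // Called from HTTP handlers: hand a task to the inference thread.
        void post(task_server task) {
            std::unique_lock<std::mutex> lock(mutex_tasks);
            queue_tasks.push_back(std::move(task));
            condition_tasks.notify_one();
        }

        // No slot available yet: park the task instead of dropping it.
        void defer(task_server task) {
            std::unique_lock<std::mutex> lock(mutex_tasks);
            queue_tasks_deferred.push_back(std::move(task));
        }

        // Once a slot frees up, move deferred tasks back into the main queue.
        void notify_slot_changed() {
            std::unique_lock<std::mutex> lock(mutex_tasks);
            for (auto & task : queue_tasks_deferred) {
                queue_tasks.push_back(std::move(task));
            }
            queue_tasks_deferred.clear();
            condition_tasks.notify_one();
        }

        // Runs on the inference thread: pop tasks and process them one by one.
        void start_loop() {
            while (true) {
                task_server task;
                {
                    std::unique_lock<std::mutex> lock(mutex_tasks);
                    condition_tasks.wait(lock, [this] { return !queue_tasks.empty(); });
                    task = std::move(queue_tasks.front());
                    queue_tasks.pop_front();
                }
                callback_new_task(task);
            }
        }
    };

In this arrangement the HTTP handlers only post() or defer() tasks while a single loop on the inference thread consumes them, which is what allows the mutexes to move out of server.cpp and avoids the "request_completion" race mentioned above.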

File tree: 5 files changed (+869, −685 lines)


Makefile

Lines changed: 1 addition & 1 deletion

@@ -619,7 +619,7 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
 save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual

 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)

examples/server/CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 set(TARGET server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} server.cpp json.hpp httplib.h)
+add_executable(${TARGET} server.cpp oai.hpp utils.hpp json.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>

examples/server/oai.hpp

Lines changed: 208 additions & 0 deletions

@@ -0,0 +1,208 @@
#pragma once

#include <string>
#include <vector>
#include <set>
#include <mutex>
#include <condition_variable>
#include <unordered_map>

#include "json.hpp"
#include "utils.hpp"

#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"

using json = nlohmann::json;

inline static json oaicompat_completion_params_parse(
    const json &body /* openai api json semantics */)
{
    json llama_params;

    llama_params["__oaicompat"] = true;

    // Map OpenAI parameters to llama.cpp parameters
    //
    // For parameters that are defined by the OpenAI documentation (e.g.
    // temperature), we explicitly specify OpenAI's intended default; we
    // need to do that because sometimes OpenAI disagrees with llama.cpp
    //
    // https://platform.openai.com/docs/api-reference/chat/create
    llama_sampling_params default_sparams;
    llama_params["model"]             = json_value(body, "model", std::string("unknown"));
    llama_params["prompt"]            = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
    llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
    llama_params["temperature"]       = json_value(body, "temperature", 0.0);
    llama_params["top_k"]             = json_value(body, "top_k", default_sparams.top_k);
    llama_params["top_p"]             = json_value(body, "top_p", 1.0);
    llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
    llama_params["logit_bias"]        = json_value(body, "logit_bias", json::object());
    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
    llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
    llama_params["seed"]              = json_value(body, "seed", LLAMA_DEFAULT_SEED);
    llama_params["stream"]            = json_value(body, "stream", false);
    llama_params["mirostat"]          = json_value(body, "mirostat", default_sparams.mirostat);
    llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
    llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
    llama_params["penalize_nl"]       = json_value(body, "penalize_nl", default_sparams.penalize_nl);
    llama_params["typical_p"]         = json_value(body, "typical_p", default_sparams.typical_p);
    llama_params["repeat_last_n"]     = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
    llama_params["ignore_eos"]        = json_value(body, "ignore_eos", false);
    llama_params["tfs_z"]             = json_value(body, "tfs_z", default_sparams.tfs_z);

    if (body.count("grammar") != 0) {
        llama_params["grammar"] = json_value(body, "grammar", json::object());
    }

    // Handle 'stop' field
    if (body.contains("stop") && body["stop"].is_string()) {
        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
    } else {
        llama_params["stop"] = json_value(body, "stop", json::array());
    }

    // Ensure there is ChatML-specific end sequence among stop words
    llama_params["stop"].push_back("<|im_end|>");

    return llama_params;
}

inline static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
{
    json result = response.result_json;

    bool stopped_word        = result.count("stopped_word") != 0;
    bool stopped_eos         = json_value(result, "stopped_eos", false);
    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
    int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
    std::string content      = json_value(result, "content", std::string(""));

    std::string finish_reason = "length";
    if (stopped_word || stopped_eos) {
        finish_reason = "stop";
    }

    json choices =
        streaming ? json::array({json{{"finish_reason", finish_reason},
                                      {"index", 0},
                                      {"delta", json::object()}}})
                  : json::array({json{{"finish_reason", finish_reason},
                                      {"index", 0},
                                      {"message", json{{"content", content},
                                                       {"role", "assistant"}}}}});

    std::time_t t = std::time(0);

    json res =
        json{{"choices", choices},
             {"created", t},
             {"model",
              json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
             {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
             {"usage",
              json{{"completion_tokens", num_tokens_predicted},
                   {"prompt_tokens", num_prompt_tokens},
                   {"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
             {"id", gen_chatcmplid()}};

    if (server_verbose) {
        res["__verbose"] = result;
    }

    if (result.contains("completion_probabilities")) {
        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
    }

    return res;
}

// return value is vector as there is one case where we might need to generate two responses
inline static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
    json result = response.result_json;

    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
        return std::vector<json>({response.result_json});
    }

    bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));

    bool stopped_word   = json_value(result, "stopped_word", false);
    bool stopped_eos    = json_value(result, "stopped_eos", false);
    bool stopped_limit  = json_value(result, "stopped_limit", false);
    std::string content = json_value(result, "content", std::string(""));

    std::string finish_reason;
    if (stopped_word || stopped_eos) {
        finish_reason = "stop";
    }
    if (stopped_limit) {
        finish_reason = "length";
    }

    std::time_t t = std::time(0);

    json choices;

    if (!finish_reason.empty()) {
        choices = json::array({json{{"finish_reason", finish_reason},
                                    {"index", 0},
                                    {"delta", json::object()}}});
    } else {
        if (first) {
            if (content.empty()) {
                choices = json::array({json{{"finish_reason", nullptr},
                                            {"index", 0},
                                            {"delta", json{{"role", "assistant"}}}}});
            } else {
                // We have to send this as two updates to conform to openai behavior
                json initial_ret = json{{"choices", json::array({json{
                                            {"finish_reason", nullptr},
                                            {"index", 0},
                                            {"delta", json{
                                                {"role", "assistant"}
                                            }}}})},
                                        {"created", t},
                                        {"id", gen_chatcmplid()},
                                        {"model", modelname},
                                        {"object", "chat.completion.chunk"}};

                json second_ret = json{
                                        {"choices", json::array({json{{"finish_reason", nullptr},
                                                                      {"index", 0},
                                                                      {"delta", json{
                                                                          {"content", content}}}
                                                                      }})},
                                        {"created", t},
                                        {"id", gen_chatcmplid()},
                                        {"model", modelname},
                                        {"object", "chat.completion.chunk"}};

                return std::vector<json>({initial_ret, second_ret});
            }
        } else {
            // Some idiosyncrasy in task processing logic makes several trailing calls
            // with empty content, we ignore these at the calee site.
            if (content.empty()) {
                return std::vector<json>({json::object()});
            }

            choices = json::array({json{
                {"finish_reason", nullptr},
                {"index", 0},
                {"delta",
                 json{
                     {"content", content},
                 }},
            }});
        }
    }

    json ret = json{{"choices", choices},
                    {"created", t},
                    {"id", gen_chatcmplid()},
                    {"model", modelname},
                    {"object", "chat.completion.chunk"}};

    return std::vector<json>({ret});
}
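
As a usage note (not part of the commit): the parameter mapping performed by oaicompat_completion_params_parse() above can be exercised with a small driver like the one below. The driver is hypothetical; it assumes oai.hpp and its dependencies (utils.hpp, json.hpp, and the common sampling headers) are on the include path, and the request body shown is just an example.

    // Hypothetical driver, for illustration only.
    #include "oai.hpp"

    #include <iostream>

    int main() {
        // An OpenAI-style /v1/chat/completions request body.
        json body = {
            {"model",      "gpt-3.5-turbo"},
            {"messages",   json::array({ json{{"role", "user"}, {"content", "Hello"}} })},
            {"max_tokens", 64},
            {"stop",       "###"},   // a bare string is wrapped into an array by the parser
        };

        json llama_params = oaicompat_completion_params_parse(body);

        // "max_tokens" is mapped to "n_predict", "messages" is flattened into a
        // ChatML "prompt", and "<|im_end|>" is always appended to the stop list.
        std::cout << llama_params.dump(2) << std::endl;
        return 0;
    }

For streaming, note that format_partial_response_oaicompat() may return two chunks for the very first token (a role delta followed by a content delta, to match OpenAI behavior), which is why its return type is a vector.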
