Commit 160468e

adapt to upstream changes
1 parent ff8e910 commit 160468e

1 file changed: +8 -12 lines changed

backend/cpp/llama/grpc-server.cpp

Lines changed: 8 additions & 12 deletions
@@ -527,14 +527,6 @@ struct llama_server_context
         slot_params default_params;
         llama_sampling_params default_sparams;
 
-        if (data.count("__oaicompat") != 0) {
-            slot->oaicompat = true;
-            slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
-        } else {
-            slot->oaicompat = false;
-            slot->oaicompat_model = "";
-        }
-
         slot->params.stream = json_value(data, "stream", false);
         slot->params.cache_prompt = json_value(data, "cache_prompt", false);
         slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
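
The surviving context lines read each slot parameter through json_value(...). For orientation, a minimal sketch of such a helper on top of nlohmann::json follows; the helper actually bundled with the upstream server may treat missing or mistyped fields differently, so take this as an assumption rather than the shipped implementation.

#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Typed lookup with a default: returns body[key] converted to T when the key is
// present and non-null, otherwise the supplied default (sketch, not the upstream helper).
template <typename T>
static T json_value(const json &body, const std::string &key, const T &default_value) {
    if (body.contains(key) && !body.at(key).is_null()) {
        return body.value(key, default_value);
    }
    return default_value;
}

// Usage as in the hunk above:
//   slot->params.stream    = json_value(data, "stream", false);
//   slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
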
@@ -2032,9 +2024,9 @@ static void params_parse(const backend::ModelOptions* request,
         std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
         std::vector<std::string> split_arg{ it, {} };
 
-        GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+        GGML_ASSERT(split_arg.size() <= llama_max_devices());
 
-        for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) {
+        for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) {
             if (i_device < split_arg.size()) {
                 params.tensor_split[i_device] = std::stof(split_arg[i_device]);
             }
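
Upstream replaced the compile-time LLAMA_MAX_DEVICES constant with the runtime query llama_max_devices(), which is what this hunk adapts to. Below is a standalone sketch of the same pattern; parse_tensor_split and the delimiter regex are illustrative assumptions, since params_parse itself writes into params.tensor_split and defines its regex outside this hunk.

#include <cstddef>
#include <regex>
#include <string>
#include <vector>

#include "llama.h"  // llama_max_devices(); GGML_ASSERT comes in via ggml.h

// Split a string such as "0.5,0.5" into per-device fractions, capping the count
// at whatever the linked llama.cpp build reports at runtime.
static std::vector<float> parse_tensor_split(const std::string &arg_next) {
    const std::regex regex{ R"([,/]+)" };  // assumed delimiter set
    std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
    std::vector<std::string> split_arg{ it, {} };

    GGML_ASSERT(split_arg.size() <= llama_max_devices());

    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) {
        if (i_device < split_arg.size()) {
            tensor_split[i_device] = std::stof(split_arg[i_device]);
        }
    }
    return tensor_split;
}
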
@@ -2116,7 +2108,9 @@ class BackendServiceImpl final : public backend::Backend::Service {
     }
     grpc::Status PredictStream(grpc::ServerContext* context, const backend::PredictOptions* request, grpc::ServerWriter<backend::Reply>* writer) override {
         json data = parse_options(true, request, llama);
-        const int task_id = llama.request_completion(data, false, false, -1);
+        const int task_id = llama.queue_tasks.get_new_id();
+        llama.queue_results.add_waiting_task_id(task_id);
+        llama.request_completion(task_id, data, false, false, -1);
         while (true)
         {
             task_result result = llama.next_result(task_id);
@@ -2152,7 +2146,9 @@ class BackendServiceImpl final : public backend::Backend::Service {
 
     grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) {
         json data = parse_options(false, request, llama);
-        const int task_id = llama.request_completion(data, false, false, -1);
+        const int task_id = llama.queue_tasks.get_new_id();
+        llama.queue_results.add_waiting_task_id(task_id);
+        llama.request_completion(task_id, data, false, false, -1);
         std::string completion_text;
         task_result result = llama.next_result(task_id);
         if (!result.error && result.stop) {
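
Both Predict and PredictStream now follow the same three-step submission: reserve a task id, register as a waiter for it, then hand the request to the queue under that id. The condensed sketch below factors that pattern out; PredictOnce is a hypothetical name, the reply handling is elided, and the remove_waiting_task_id() cleanup call is an assumption that does not appear in this diff.

// Hypothetical condensed form of the pattern above; `llama`, parse_options and
// task_result are the ones already used in grpc-server.cpp.
grpc::Status PredictOnce(grpc::ServerContext* /*context*/,
                         const backend::PredictOptions* request,
                         backend::Reply* reply) {
    json data = parse_options(false, request, llama);

    const int task_id = llama.queue_tasks.get_new_id();  // 1. reserve an id up front
    llama.queue_results.add_waiting_task_id(task_id);    // 2. register interest in its result
    llama.request_completion(task_id, data, false, false, -1);  // 3. submit under that id

    task_result result = llama.next_result(task_id);     // blocks until the slot finishes
    if (!result.error && result.stop) {
        // ... copy the generated text from result into *reply, as Predict does ...
    }

    llama.queue_results.remove_waiting_task_id(task_id); // assumed cleanup, not in this diff
    return grpc::Status::OK;
}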
