#pragma once

#include <string>
#include <vector>
#include <set>
#include <mutex>
#include <condition_variable>
#include <unordered_map>
#include <ctime>

#include "json.hpp"
#include "utils.hpp"

#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"

using json = nlohmann::json;

inline static json oaicompat_completion_params_parse(
    const json &body /* openai api json semantics */)
{
    json llama_params;

    llama_params["__oaicompat"] = true;

    // Map OpenAI parameters to llama.cpp parameters
    //
    // For parameters that are defined by the OpenAI documentation (e.g.
    // temperature), we explicitly specify OpenAI's intended default; we
    // need to do that because sometimes OpenAI disagrees with llama.cpp
    //
    // https://platform.openai.com/docs/api-reference/chat/create
    llama_sampling_params default_sparams;
    llama_params["model"] = json_value(body, "model", std::string("unknown"));
    llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
    llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
    llama_params["temperature"] = json_value(body, "temperature", 1.0); // OpenAI's documented default is 1.0
    llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
    llama_params["top_p"] = json_value(body, "top_p", 1.0);
    llama_params["n_predict"] = json_value(body, "max_tokens", -1);
    llama_params["logit_bias"] = json_value(body, "logit_bias", json::object());
    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
    llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
    llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
    llama_params["stream"] = json_value(body, "stream", false);
    llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat);
    llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
    llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
    llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl);
    llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p);
    llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
    llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
    llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z);

    if (body.contains("grammar")) {
        llama_params["grammar"] = json_value(body, "grammar", json::object());
    }

    // Handle the 'stop' field: OpenAI allows either a single string or an array of strings
    if (body.contains("stop") && body["stop"].is_string()) {
        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
    } else {
        llama_params["stop"] = json_value(body, "stop", json::array());
    }

    // Ensure the ChatML-specific end sequence is among the stop words
    llama_params["stop"].push_back("<|im_end|>");

    return llama_params;
}
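
// Illustrative usage sketch (not part of this header): a chat-completion HTTP
// handler could feed the raw OpenAI-style request body through the helper above
// before queuing work. `queue_completion_task` is a hypothetical stand-in for
// whatever dispatch mechanism the surrounding server actually uses.
//
//     json body         = json::parse(raw_request_body);            // OpenAI-style request
//     json llama_params = oaicompat_completion_params_parse(body);
//     queue_completion_task(llama_params);                          // hypothetical dispatch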

inline static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
{
    json result = response.result_json;

    bool stopped_word = result.count("stopped_word") != 0;
    bool stopped_eos = json_value(result, "stopped_eos", false);
    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
    int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
    std::string content = json_value(result, "content", std::string(""));

    std::string finish_reason = "length";
    if (stopped_word || stopped_eos) {
        finish_reason = "stop";
    }

    json choices =
        streaming ? json::array({json{{"finish_reason", finish_reason},
                                      {"index", 0},
                                      {"delta", json::object()}}})
                  : json::array({json{{"finish_reason", finish_reason},
                                      {"index", 0},
                                      {"message", json{{"content", content},
                                                       {"role", "assistant"}}}}});

    std::time_t t = std::time(nullptr);

    json res =
        json{{"choices", choices},
             {"created", t},
             {"model",
              json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
             {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
             {"usage",
              json{{"completion_tokens", num_tokens_predicted},
                   {"prompt_tokens", num_prompt_tokens},
                   {"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
             {"id", gen_chatcmplid()}};

    if (server_verbose) {
        res["__verbose"] = result;
    }

    if (result.contains("completion_probabilities")) {
        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
    }

    return res;
}
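
// For reference, a non-streaming response assembled above has roughly this
// shape (the field values here are illustrative, not produced by this header):
//
//     {
//       "id": "chatcmpl-...",
//       "object": "chat.completion",
//       "created": 1700000000,
//       "model": "gpt-3.5-turbo-0613",
//       "choices": [{"index": 0,
//                    "finish_reason": "stop",
//                    "message": {"role": "assistant", "content": "..."}}],
//       "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30}
//     }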

// The return value is a vector because there is one case where we might need
// to generate two responses.
inline static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
    json result = response.result_json;

    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
        return std::vector<json>({response.result_json});
    }

    bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));

    bool stopped_word = json_value(result, "stopped_word", false);
    bool stopped_eos = json_value(result, "stopped_eos", false);
    bool stopped_limit = json_value(result, "stopped_limit", false);
    std::string content = json_value(result, "content", std::string(""));

    std::string finish_reason;
    if (stopped_word || stopped_eos) {
        finish_reason = "stop";
    }
    if (stopped_limit) {
        finish_reason = "length";
    }

    std::time_t t = std::time(nullptr);

    json choices;

    if (!finish_reason.empty()) {
        choices = json::array({json{{"finish_reason", finish_reason},
                                    {"index", 0},
                                    {"delta", json::object()}}});
    } else {
        if (first) {
            if (content.empty()) {
                choices = json::array({json{{"finish_reason", nullptr},
                                            {"index", 0},
                                            {"delta", json{{"role", "assistant"}}}}});
            } else {
                // We have to send this as two updates to conform to OpenAI behavior:
                // the first chunk carries the assistant role, the second carries the content
                json initial_ret = json{{"choices", json::array({json{
                                            {"finish_reason", nullptr},
                                            {"index", 0},
                                            {"delta", json{{"role", "assistant"}}}}})},
                                        {"created", t},
                                        {"id", gen_chatcmplid()},
                                        {"model", modelname},
                                        {"object", "chat.completion.chunk"}};

                json second_ret = json{{"choices", json::array({json{
                                            {"finish_reason", nullptr},
                                            {"index", 0},
                                            {"delta", json{{"content", content}}}}})},
                                       {"created", t},
                                       {"id", gen_chatcmplid()},
                                       {"model", modelname},
                                       {"object", "chat.completion.chunk"}};

                return std::vector<json>({initial_ret, second_ret});
            }
        } else {
            // An idiosyncrasy in the task processing logic produces several trailing
            // calls with empty content; we ignore these at the callee site.
            if (content.empty()) {
                return std::vector<json>({json::object()});
            }

            choices = json::array({json{
                {"finish_reason", nullptr},
                {"index", 0},
                {"delta", json{{"content", content}}},
            }});
        }
    }

    json ret = json{{"choices", choices},
                    {"created", t},
                    {"id", gen_chatcmplid()},
                    {"model", modelname},
                    {"object", "chat.completion.chunk"}};

    return std::vector<json>({ret});
}
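
// Sketch of the chunk sequence emitted via format_partial_response_oaicompat over
// the lifetime of one streamed completion (values are illustrative):
//
//   1. first token:  delta = {"role": "assistant"},  finish_reason = null
//                    (followed by a second chunk with delta = {"content": ...}
//                    when content is already available)
//   2. later tokens: delta = {"content": "..."},     finish_reason = null
//   3. final chunk:  delta = {},                     finish_reason = "stop" or "length"
//
// Every chunk is a "chat.completion.chunk" object that also carries "id",
// "created" and "model", matching what streaming OpenAI clients expect.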