
Commit 36dffac

merge

Signed-off-by: Ashish Tiwari <[email protected]>

2 parents: cd9ad54 + abd398c

24 files changed: +471 -339 lines

apisix/balancer/ewma.lua

Lines changed: 3 additions & 2 deletions

@@ -18,6 +18,7 @@ local next = next
 local error = error

 local DECAY_TIME = 10 -- this value is in seconds
+local SHM_TTL = 60
 local LOCK_KEY = ":ewma_key"

 local shm_ewma = ngx_shared["balancer-ewma"]
@@ -58,15 +59,15 @@ local function decay_ewma(ewma, last_touched_at, rtt, now)
 end

 local function store_stats(upstream, ewma, now)
-    local success, err, forcible = shm_last_touched_at:set(upstream, now)
+    local success, err, forcible = shm_last_touched_at:set(upstream, now, SHM_TTL)
     if not success then
         core.log.error("shm_last_touched_at:set failed: ", err)
     end
     if forcible then
         core.log.warn("shm_last_touched_at:set valid items forcibly overwritten")
     end

-    success, err, forcible = shm_ewma:set(upstream, ewma)
+    success, err, forcible = shm_ewma:set(upstream, ewma, SHM_TTL)
     if not success then
         core.log.error("shm_ewma:set failed: ", err)
     end
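
The only functional change here is the third argument to ngx.shared.DICT:set, which is the key's expiration time in seconds, so per-upstream EWMA state now ages out of the shared dict after 60 seconds instead of lingering until eviction. A minimal standalone sketch of that behaviour (the dict name follows the file above; the upstream key and RTT value are made up):

    -- minimal sketch: ngx.shared.DICT:set(key, value, exptime) expires the key
    -- automatically; without exptime it lives until the dict evicts it.
    -- Assumes a `lua_shared_dict balancer-ewma` zone is declared in nginx.conf.
    local SHM_TTL = 60
    local shm_ewma = ngx.shared["balancer-ewma"]

    local ok, err, forcible = shm_ewma:set("127.0.0.1:1980", 0.25, SHM_TTL)
    if not ok then
        ngx.log(ngx.ERR, "shm_ewma:set failed: ", err)
    end
    if forcible then
        ngx.log(ngx.WARN, "valid items were forcibly overwritten to make room")
    end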

apisix/cli/ngx_tpl.lua

Lines changed: 1 addition & 0 deletions

@@ -809,6 +809,7 @@ http {
     set $llm_content_risk_level '';
     set $apisix_upstream_response_time $upstream_response_time;
     set $request_type 'traditional_http';
+    set $request_llm_model '';

     set $llm_time_to_first_token '0';
     set $llm_model '';

apisix/core/ctx.lua

Lines changed: 1 addition & 0 deletions

@@ -238,6 +238,7 @@ do
     request_type = true,
     apisix_upstream_response_time = true,
     llm_time_to_first_token = true,
+    request_llm_model = true,
     llm_model = true,
     llm_prompt_tokens = true,
     llm_completion_tokens = true,
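
The new `$request_llm_model` variable has to exist in two places before plugins can use it: the generated nginx.conf declares it with an empty default (the ngx_tpl.lua change above), and core.ctx adds it to the set of variables that `ctx.var` may write. A rough sketch of how plugin code then touches it, assuming it runs inside an APISIX plugin phase that receives `ctx` (the model name is a placeholder):

    -- rough sketch; runs inside a plugin phase handler that receives `ctx`
    local core = require("apisix.core")

    local function record_requested_model(ctx, body)
        if body.model then
            -- writable only because ngx_tpl.lua declares the variable and
            -- apisix/core/ctx.lua whitelists it
            ctx.var.request_llm_model = body.model
        end
        core.log.info("client requested model: ", ctx.var.request_llm_model)
    end
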
Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
+--
+-- Licensed to the Apache Software Foundation (ASF) under one or more
+-- contributor license agreements. See the NOTICE file distributed with
+-- this work for additional information regarding copyright ownership.
+-- The ASF licenses this file to You under the Apache License, Version 2.0
+-- (the "License"); you may not use this file except in compliance with
+-- the License. You may obtain a copy of the License at
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+
+return require("apisix.plugins.ai-drivers.openai-base").new(
+    {
+        path = "/completions",
+        port = 443,
+        options = {
+            remove_model = true
+        }
+    }
+)
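
This new driver file (its path is not shown in this view) simply returns a pre-configured openai-base instance: requests go to `/completions` on port 443 and, because `remove_model` is set, the `model` field is stripped from the forwarded body. Drivers are resolved by provider name at runtime, as in the `check_schema` excerpt further down; a hedged sketch of that lookup, assuming the provider name is the `azure-openai` value added to the schema:

    -- sketch of how ai-proxy-multi resolves a driver module by provider name;
    -- "azure-openai" is assumed from the enum added in schema.lua below
    local core = require("apisix.core")

    local provider = "azure-openai"
    local ok, driver = pcall(require, "apisix.plugins.ai-drivers." .. provider)
    if not ok then
        core.log.warn("fail to require ai provider: ", provider, ", err", driver)
    end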

apisix/plugins/ai-drivers/openai-base.lua

Lines changed: 4 additions & 1 deletion

@@ -47,6 +47,7 @@ function _M.new(opts)
         host = opts.host,
         port = opts.port,
         path = opts.path,
+        remove_model = opts.options and opts.options.remove_model
     }
     return setmetatable(self, mt)
 end
@@ -255,7 +256,9 @@ function _M.request(self, ctx, conf, request_table, extra_opts)
             request_table[opt] = val
         end
     end
-
+    if self.remove_model then
+        request_table.model = nil
+    end
     local req_json, err = core.json.encode(request_table)
     if not req_json then
         return nil, err
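
`remove_model` is carried from the driver's constructor options into the instance and, when set, the `model` field is dropped from the request table right before it is JSON-encoded. This suits providers where the upstream endpoint already fixes the model (Azure OpenAI selects it via the deployment in the URL), so forwarding the client's `model` would be redundant. A self-contained sketch of the effect (input values are hypothetical; assumes lua-cjson, which OpenResty bundles):

    -- self-contained sketch of the remove_model behaviour
    local cjson = require("cjson.safe")

    local function encode_request(request_table, remove_model)
        if remove_model then
            request_table.model = nil  -- the endpoint decides the model
        end
        return cjson.encode(request_table)
    end

    print(encode_request({ model = "gpt-4o", stream = false }, true))
    -- prints: {"stream":false}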

apisix/plugins/ai-proxy-multi.lua

Lines changed: 9 additions & 1 deletion

@@ -30,6 +30,7 @@ local ipairs = ipairs
 local type = type

 local priority_balancer = require("apisix.balancer.priority")
+local endpoint_regex = "^(https?)://([^:/]+):?(%d*)/?.*$"

 local pickers = {}
 local lrucache_server_picker = core.lrucache.new({
@@ -73,6 +74,13 @@ function _M.check_schema(conf)
     end

     for _, instance in ipairs(conf.instances) do
+        local endpoint = instance and instance.override and instance.override.endpoint
+        if endpoint then
+            local scheme, host, _ = endpoint:match(endpoint_regex)
+            if not scheme or not host then
+                return false, "invalid endpoint"
+            end
+        end
         local ai_driver, err = pcall(require, "apisix.plugins.ai-drivers." .. instance.provider)
         if not ai_driver then
             core.log.warn("fail to require ai provider: ", instance.provider, ", err", err)
@@ -143,7 +151,7 @@ end

 local function resolve_endpoint(instance_conf)
     local endpoint = core.table.try_read_attr(instance_conf, "override", "endpoint")
-    local scheme, host, port, _ = endpoint:match("^(https?)://([^:/]+):?(%d*)(/?.*)$")
+    local scheme, host, port, _ = endpoint:match(endpoint_regex)
     if port == "" then
         port = (scheme == "https") and "443" or "80"
     end
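
Hoisting the URL pattern into `endpoint_regex` lets `check_schema` and `resolve_endpoint` share it, so a malformed `override.endpoint` is rejected when the configuration is submitted rather than failing once a request is balanced to that instance. A standalone illustration of what the Lua pattern extracts (the URLs are made-up examples):

    -- standalone illustration of the shared endpoint pattern
    local endpoint_regex = "^(https?)://([^:/]+):?(%d*)/?.*$"

    local scheme, host, port = ("https://api.example.com/v1/chat/completions"):match(endpoint_regex)
    -- scheme = "https", host = "api.example.com", port = "" (empty)
    if port == "" then
        port = (scheme == "https") and "443" or "80"  -- same default as resolve_endpoint
    end

    -- no scheme means no match, which check_schema now reports as "invalid endpoint"
    assert(("api.example.com/v1"):match(endpoint_regex) == nil)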

apisix/plugins/ai-proxy/base.lua

Lines changed: 4 additions & 0 deletions

@@ -27,6 +27,7 @@ local _M = {}
 function _M.set_logging(ctx, summaries, payloads)
     if summaries then
         ctx.llm_summary = {
+            request_model = ctx.var.request_llm_model,
             model = ctx.var.llm_model,
             duration = ctx.var.llm_time_to_first_token,
             prompt_tokens = ctx.var.llm_prompt_tokens,
@@ -70,6 +71,9 @@ function _M.before_proxy(conf, ctx)
     else
         ctx.var.request_type = "ai_chat"
     end
+    if request_body.model then
+        ctx.var.request_llm_model = request_body.model
+    end
     local model = ai_instance.options and ai_instance.options.model or request_body.model
     if model then
         ctx.var.llm_model = model
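
This is where the two model variables diverge: `request_llm_model` records whatever the client put in the request body, while `llm_model` (set just below it) records the model actually used, which an instance's `options.model` may override. The llm_summary entry added above exposes both. A hypothetical outcome, for illustration only:

    -- hypothetical outcome when the client asks for one model but the
    -- selected instance pins another via options.model (values made up)
    --   client body:            { "model": "gpt-3.5-turbo", ... }
    --   instance options.model: "gpt-4o"
    local llm_summary = {
        request_model = "gpt-3.5-turbo",  -- ctx.var.request_llm_model
        model         = "gpt-4o",         -- ctx.var.llm_model
    }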

apisix/plugins/ai-proxy/schema.lua

Lines changed: 2 additions & 0 deletions

@@ -68,6 +68,7 @@ local ai_instance_schema = {
                 "deepseek",
                 "aimlapi",
                 "openai-compatible",
+                "azure-openai"
             }, -- add more providers later
         },
         priority = {
@@ -129,6 +130,7 @@ _M.ai_proxy_schema = {
             "deepseek",
             "aimlapi",
             "openai-compatible",
+            "azure-openai"
         }, -- add more providers later

         },
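
With `azure-openai` accepted by both the `ai-proxy-multi` instance schema and the `ai-proxy` schema, an instance can name the new provider directly. A rough configuration sketch as a Lua table, based on the attribute table in the docs change below; the auth header name, endpoint, and key are assumptions, not values taken from this commit:

    -- rough sketch of an ai-proxy-multi instance using the new provider
    -- (auth header, endpoint, and key are placeholders/assumptions)
    local ai_proxy_multi_conf = {
        instances = {
            {
                name     = "azure-gpt",
                provider = "azure-openai",
                weight   = 1,
                auth     = { header = { ["api-key"] = "<azure-api-key>" } },
                override = { endpoint = "https://<your-resource>.openai.azure.com/<deployment-path>" },
            },
        },
    }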

apisix/plugins/prometheus/exporter.lua

Lines changed: 17 additions & 17 deletions

@@ -211,7 +211,7 @@ function _M.http_init(prometheus_enabled_in_stream)
     metrics.status = prometheus:counter("http_status",
             "HTTP status codes per service in APISIX",
             {"code", "route", "matched_uri", "matched_host", "service", "consumer", "node",
-            "request_type", "llm_model",
+            "request_type", "request_llm_model", "llm_model",
             unpack(extra_labels("http_status"))},
             status_metrics_exptime)

@@ -223,14 +223,14 @@ function _M.http_init(prometheus_enabled_in_stream)
     metrics.latency = prometheus:histogram("http_latency",
             "HTTP request latency in milliseconds per service in APISIX",
             {"type", "route", "service", "consumer", "node",
-            "request_type", "llm_model",
+            "request_type", "request_llm_model", "llm_model",
             unpack(extra_labels("http_latency"))},
             buckets, latency_metrics_exptime)

     metrics.bandwidth = prometheus:counter("bandwidth",
             "Total bandwidth in bytes consumed per service in APISIX",
             {"type", "route", "service", "consumer", "node",
-            "request_type", "llm_model",
+            "request_type", "request_llm_model", "llm_model",
            unpack(extra_labels("bandwidth"))},
             bandwidth_metrics_exptime)

@@ -241,30 +241,30 @@ function _M.http_init(prometheus_enabled_in_stream)
     metrics.llm_latency = prometheus:histogram("llm_latency",
             "LLM request latency in milliseconds",
             {"route_id", "service_id", "consumer", "node",
-            "request_type", "llm_model",
+            "request_type", "request_llm_model", "llm_model",
             unpack(extra_labels("llm_latency"))},
             llm_latency_buckets,
             llm_latency_exptime)

     metrics.llm_prompt_tokens = prometheus:counter("llm_prompt_tokens",
             "LLM service consumed prompt tokens",
             {"route_id", "service_id", "consumer", "node",
-            "request_type", "llm_model",
+            "request_type", "request_llm_model", "llm_model",
             unpack(extra_labels("llm_prompt_tokens"))},
             llm_prompt_tokens_exptime)

     metrics.llm_completion_tokens = prometheus:counter("llm_completion_tokens",
             "LLM service consumed completion tokens",
             {"route_id", "service_id", "consumer", "node",
-            "request_type", "llm_model",
+            "request_type", "request_llm_model", "llm_model",
             unpack(extra_labels("llm_completion_tokens"))},
             llm_completion_tokens_exptime)

     metrics.llm_active_connections = prometheus:gauge("llm_active_connections",
             "Number of active connections to LLM service",
             {"route", "route_id", "matched_uri", "matched_host",
             "service", "service_id", "consumer", "node",
-            "request_type", "llm_model",
+            "request_type", "request_llm_model", "llm_model",
             unpack(extra_labels("llm_active_connections"))},
             llm_active_connections_exptime)

@@ -338,58 +338,58 @@ function _M.http_log(conf, ctx)
     metrics.status:inc(1,
             gen_arr(vars.status, route_id, matched_uri, matched_host,
                     service_id, consumer_name, balancer_ip,
-                    vars.request_type, vars.llm_model,
+                    vars.request_type, vars.request_llm_model, vars.llm_model,
                     unpack(extra_labels("http_status", ctx))))

     local latency, upstream_latency, apisix_latency = latency_details(ctx)
     local latency_extra_label_values = extra_labels("http_latency", ctx)

     metrics.latency:observe(latency,
             gen_arr("request", route_id, service_id, consumer_name, balancer_ip,
-                    vars.request_type, vars.llm_model,
+                    vars.request_type, vars.request_llm_model, vars.llm_model,
                     unpack(latency_extra_label_values)))

     if upstream_latency then
         metrics.latency:observe(upstream_latency,
                 gen_arr("upstream", route_id, service_id, consumer_name, balancer_ip,
-                        vars.request_type, vars.llm_model,
+                        vars.request_type, vars.request_llm_model, vars.llm_model,
                         unpack(latency_extra_label_values)))
     end

     metrics.latency:observe(apisix_latency,
             gen_arr("apisix", route_id, service_id, consumer_name, balancer_ip,
-                    vars.request_type, vars.llm_model,
+                    vars.request_type, vars.request_llm_model, vars.llm_model,
                     unpack(latency_extra_label_values)))

     local bandwidth_extra_label_values = extra_labels("bandwidth", ctx)

     metrics.bandwidth:inc(vars.request_length,
             gen_arr("ingress", route_id, service_id, consumer_name, balancer_ip,
-                    vars.request_type, vars.llm_model,
+                    vars.request_type, vars.request_llm_model, vars.llm_model,
                     unpack(bandwidth_extra_label_values)))

     metrics.bandwidth:inc(vars.bytes_sent,
             gen_arr("egress", route_id, service_id, consumer_name, balancer_ip,
-                    vars.request_type, vars.llm_model,
+                    vars.request_type, vars.request_llm_model, vars.llm_model,
                     unpack(bandwidth_extra_label_values)))

     local llm_time_to_first_token = vars.llm_time_to_first_token
     if llm_time_to_first_token ~= "" then
         metrics.llm_latency:observe(tonumber(llm_time_to_first_token),
                 gen_arr(route_id, service_id, consumer_name, balancer_ip,
-                        vars.request_type, vars.llm_model,
+                        vars.request_type, vars.request_llm_model, vars.llm_model,
                         unpack(extra_labels("llm_latency", ctx))))
     end
     if vars.llm_prompt_tokens ~= "" then
         metrics.llm_prompt_tokens:inc(tonumber(vars.llm_prompt_tokens),
                 gen_arr(route_id, service_id, consumer_name, balancer_ip,
-                        vars.request_type, vars.llm_model,
+                        vars.request_type, vars.request_llm_model, vars.llm_model,
                         unpack(extra_labels("llm_prompt_tokens", ctx))))
     end
     if vars.llm_completion_tokens ~= "" then
         metrics.llm_completion_tokens:inc(tonumber(vars.llm_completion_tokens),
                 gen_arr(route_id, service_id, consumer_name, balancer_ip,
-                        vars.request_type, vars.llm_model,
+                        vars.request_type, vars.request_llm_model, vars.llm_model,
                         unpack(extra_labels("llm_completion_tokens", ctx))))
     end
 end
@@ -787,7 +787,7 @@ local function inc_llm_active_connections(ctx, value)
         value,
         gen_arr(route_name, route_id, matched_uri,
                 matched_host, service_name, service_id, consumer_name, balancer_ip,
-                vars.request_type, vars.llm_model,
+                vars.request_type, vars.request_llm_model, vars.llm_model,
                 unpack(extra_labels("llm_active_connections", ctx)))
     )
 end
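
The exporter change is mechanical but order-sensitive: every metric that declares the new `request_llm_model` label also passes `vars.request_llm_model` at the same position in the matching `gen_arr(...)` call, because the Prometheus client pairs label values with label names strictly by position. A small self-contained sketch of that invariant (the values are placeholders):

    -- self-contained sketch: label names and values are paired by position,
    -- so request_llm_model must sit between request_type and llm_model in
    -- both the metric declaration and every inc()/observe() call.
    local label_names  = { "route_id", "service_id", "consumer", "node",
                           "request_type", "request_llm_model", "llm_model" }
    local label_values = { "1", "2", "alice", "10.0.0.1",
                           "ai_chat", "gpt-3.5-turbo", "gpt-4o" }

    for i, name in ipairs(label_names) do
        print(name .. " = " .. label_values[i])
    end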

docs/en/latest/plugins/ai-proxy-multi.md

Lines changed: 3 additions & 3 deletions

@@ -7,7 +7,7 @@ keywords:
   - ai-proxy-multi
   - AI
   - LLM
-description: The ai-proxy-multi Plugin extends the capabilities of ai-proxy with load balancing, retries, fallbacks, and health chekcs, simplifying the integration with OpenAI, DeepSeek, AIMLAPI, and other OpenAI-compatible APIs.
+description: The ai-proxy-multi Plugin extends the capabilities of ai-proxy with load balancing, retries, fallbacks, and health chekcs, simplifying the integration with OpenAI, DeepSeek, Azure, AIMLAPI, and other OpenAI-compatible APIs.
 ---

 <!--
@@ -35,7 +35,7 @@ description: The ai-proxy-multi Plugin extends the capabilities of ai-proxy with

 ## Description

-The `ai-proxy-multi` Plugin simplifies access to LLM and embedding models by transforming Plugin configurations into the designated request format for OpenAI, DeepSeek, AIMLAPI, and other OpenAI-compatible APIs. It extends the capabilities of [`ai-proxy-multi`](./ai-proxy.md) with load balancing, retries, fallbacks, and health checks.
+The `ai-proxy-multi` Plugin simplifies access to LLM and embedding models by transforming Plugin configurations into the designated request format for OpenAI, DeepSeek, Azure, AIMLAPI, and other OpenAI-compatible APIs. It extends the capabilities of [`ai-proxy-multi`](./ai-proxy.md) with load balancing, retries, fallbacks, and health checks.

 In addition, the Plugin also supports logging LLM request information in the access log, such as token usage, model, time to the first response, and more.

@@ -58,7 +58,7 @@ In addition, the Plugin also supports logging LLM request information in the acc
 | balancer.key | string | False | | | Used when `type` is `chash`. When `hash_on` is set to `header` or `cookie`, `key` is required. When `hash_on` is set to `consumer`, `key` is not required as the consumer name will be used as the key automatically. |
 | instances | array[object] | True | | | LLM instance configurations. |
 | instances.name | string | True | | | Name of the LLM service instance. |
-| instances.provider | string | True | | [openai, deepseek, aimlapi, openai-compatible] | LLM service provider. When set to `openai`, the Plugin will proxy the request to `api.openai.com`. When set to `deepseek`, the Plugin will proxy the request to `api.deepseek.com`. When set to `aimlapi`, the Plugin uses the OpenAI-compatible driver and proxies the request to `api.aimlapi.com` by default. When set to `openai-compatible`, the Plugin will proxy the request to the custom endpoint configured in `override`. |
+| instances.provider | string | True | | [openai, deepseek, azure-openai, aimlapi, openai-compatible] | LLM service provider. When set to `openai`, the Plugin will proxy the request to `api.openai.com`. When set to `deepseek`, the Plugin will proxy the request to `api.deepseek.com`. When set to `aimlapi`, the Plugin uses the OpenAI-compatible driver and proxies the request to `api.aimlapi.com` by default. When set to `openai-compatible`, the Plugin will proxy the request to the custom endpoint configured in `override`. |
 | instances.priority | integer | False | 0 | | Priority of the LLM instance in load balancing. `priority` takes precedence over `weight`. |
 | instances.weight | string | True | 0 | greater or equal to 0 | Weight of the LLM instance in load balancing. |
 | instances.auth | object | True | | | Authentication configurations. |
