
Commit 36dffac

merge

Signed-off-by: Ashish Tiwari <[email protected]>

2 parents: cd9ad54 + abd398c

24 files changed: +471 -339 lines

apisix/balancer/ewma.lua

Lines changed: 3 additions & 2 deletions

@@ -18,6 +18,7 @@ local next = next
 local error = error

 local DECAY_TIME = 10 -- this value is in seconds
+local SHM_TTL = 60
 local LOCK_KEY = ":ewma_key"

 local shm_ewma = ngx_shared["balancer-ewma"]
@@ -58,15 +59,15 @@ local function decay_ewma(ewma, last_touched_at, rtt, now)
 end

 local function store_stats(upstream, ewma, now)
-    local success, err, forcible = shm_last_touched_at:set(upstream, now)
+    local success, err, forcible = shm_last_touched_at:set(upstream, now, SHM_TTL)
     if not success then
         core.log.error("shm_last_touched_at:set failed: ", err)
     end
     if forcible then
         core.log.warn("shm_last_touched_at:set valid items forcibly overwritten")
     end

-    success, err, forcible = shm_ewma:set(upstream, ewma)
+    success, err, forcible = shm_ewma:set(upstream, ewma, SHM_TTL)
     if not success then
         core.log.error("shm_ewma:set failed: ", err)
     end
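
The only functional change here is the third argument to ngx.shared.DICT:set, which is the key's expiration time in seconds, so per-upstream EWMA state now ages out of the shared dict after 60 seconds instead of lingering until eviction. A minimal standalone sketch of that behaviour (the dict name follows the file above; the upstream key and RTT value are made up):

    -- minimal sketch: ngx.shared.DICT:set(key, value, exptime) expires the key
    -- automatically; without exptime it lives until the dict evicts it.
    -- Assumes a `lua_shared_dict balancer-ewma` zone is declared in nginx.conf.
    local SHM_TTL = 60
    local shm_ewma = ngx.shared["balancer-ewma"]

    local ok, err, forcible = shm_ewma:set("127.0.0.1:1980", 0.25, SHM_TTL)
    if not ok then
        ngx.log(ngx.ERR, "shm_ewma:set failed: ", err)
    end
    if forcible then
        ngx.log(ngx.WARN, "valid items were forcibly overwritten to make room")
    end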

apisix/cli/ngx_tpl.lua

Lines changed: 1 addition & 0 deletions

@@ -809,6 +809,7 @@ http {
     set $llm_content_risk_level '';
     set $apisix_upstream_response_time $upstream_response_time;
     set $request_type 'traditional_http';
+    set $request_llm_model '';

     set $llm_time_to_first_token '0';
     set $llm_model '';

apisix/core/ctx.lua

Lines changed: 1 addition & 0 deletions

@@ -238,6 +238,7 @@ do
     request_type = true,
     apisix_upstream_response_time = true,
     llm_time_to_first_token = true,
+    request_llm_model = true,
     llm_model = true,
     llm_prompt_tokens = true,
     llm_completion_tokens = true,
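
The new `$request_llm_model` variable has to exist in two places before plugins can use it: the generated nginx.conf declares it with an empty default (the ngx_tpl.lua change above), and core.ctx adds it to the set of variables that `ctx.var` may write. A rough sketch of how plugin code then touches it, assuming it runs inside an APISIX plugin phase that receives `ctx` (the model name is a placeholder):

    -- rough sketch; runs inside a plugin phase handler that receives `ctx`
    local core = require("apisix.core")

    local function record_requested_model(ctx, body)
        if body.model then
            -- writable only because ngx_tpl.lua declares the variable and
            -- apisix/core/ctx.lua whitelists it
            ctx.var.request_llm_model = body.model
        end
        core.log.info("client requested model: ", ctx.var.request_llm_model)
    end
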
Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
+--
+-- Licensed to the Apache Software Foundation (ASF) under one or more
+-- contributor license agreements. See the NOTICE file distributed with
+-- this work for additional information regarding copyright ownership.
+-- The ASF licenses this file to You under the Apache License, Version 2.0
+-- (the "License"); you may not use this file except in compliance with
+-- the License. You may obtain a copy of the License at
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+
+return require("apisix.plugins.ai-drivers.openai-base").new(
+    {
+        path = "/completions",
+        port = 443,
+        options = {
+            remove_model = true
+        }
+    }
+)
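
This new driver file (its path is not shown in this view) simply returns a pre-configured openai-base instance: requests go to `/completions` on port 443 and, because `remove_model` is set, the `model` field is stripped from the forwarded body. Drivers are resolved by provider name at runtime, as in the `check_schema` excerpt further down; a hedged sketch of that lookup, assuming the provider name is the `azure-openai` value added to the schema:

    -- sketch of how ai-proxy-multi resolves a driver module by provider name;
    -- "azure-openai" is assumed from the enum added in schema.lua below
    local core = require("apisix.core")

    local provider = "azure-openai"
    local ok, driver = pcall(require, "apisix.plugins.ai-drivers." .. provider)
    if not ok then
        core.log.warn("fail to require ai provider: ", provider, ", err", driver)
    end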

apisix/plugins/ai-drivers/openai-base.lua

Lines changed: 4 additions & 1 deletion

@@ -47,6 +47,7 @@ function _M.new(opts)
         host = opts.host,
         port = opts.port,
         path = opts.path,
+        remove_model = opts.options and opts.options.remove_model
     }
     return setmetatable(self, mt)
 end
@@ -255,7 +256,9 @@ function _M.request(self, ctx, conf, request_table, extra_opts)
             request_table[opt] = val
         end
     end
-
+    if self.remove_model then
+        request_table.model = nil
+    end
     local req_json, err = core.json.encode(request_table)
     if not req_json then
         return nil, err
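
`remove_model` is carried from the driver's constructor options into the instance and, when set, the `model` field is dropped from the request table right before it is JSON-encoded. This suits providers where the upstream endpoint already fixes the model (Azure OpenAI selects it via the deployment in the URL), so forwarding the client's `model` would be redundant. A self-contained sketch of the effect (input values are hypothetical; assumes lua-cjson, which OpenResty bundles):

    -- self-contained sketch of the remove_model behaviour
    local cjson = require("cjson.safe")

    local function encode_request(request_table, remove_model)
        if remove_model then
            request_table.model = nil  -- the endpoint decides the model
        end
        return cjson.encode(request_table)
    end

    print(encode_request({ model = "gpt-4o", stream = false }, true))
    -- prints: {"stream":false}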

apisix/plugins/ai-proxy-multi.lua

Lines changed: 9 additions & 1 deletion

@@ -30,6 +30,7 @@ local ipairs = ipairs
 local type = type

 local priority_balancer = require("apisix.balancer.priority")
+local endpoint_regex = "^(https?)://([^:/]+):?(%d*)/?.*$"

 local pickers = {}
 local lrucache_server_picker = core.lrucache.new({
@@ -73,6 +74,13 @@ function _M.check_schema(conf)
     end

     for _, instance in ipairs(conf.instances) do
+        local endpoint = instance and instance.override and instance.override.endpoint
+        if endpoint then
+            local scheme, host, _ = endpoint:match(endpoint_regex)
+            if not scheme or not host then
+                return false, "invalid endpoint"
+            end
+        end
         local ai_driver, err = pcall(require, "apisix.plugins.ai-drivers." .. instance.provider)
         if not ai_driver then
             core.log.warn("fail to require ai provider: ", instance.provider, ", err", err)
@@ -143,7 +151,7 @@ end

 local function resolve_endpoint(instance_conf)
     local endpoint = core.table.try_read_attr(instance_conf, "override", "endpoint")
-    local scheme, host, port, _ = endpoint:match("^(https?)://([^:/]+):?(%d*)(/?.*)$")
+    local scheme, host, port, _ = endpoint:match(endpoint_regex)
     if port == "" then
         port = (scheme == "https") and "443" or "80"
     end
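
Hoisting the URL pattern into `endpoint_regex` lets `check_schema` and `resolve_endpoint` share it, so a malformed `override.endpoint` is rejected when the configuration is submitted rather than failing once a request is balanced to that instance. A standalone illustration of what the Lua pattern extracts (the URLs are made-up examples):

    -- standalone illustration of the shared endpoint pattern
    local endpoint_regex = "^(https?)://([^:/]+):?(%d*)/?.*$"

    local scheme, host, port = ("https://api.example.com/v1/chat/completions"):match(endpoint_regex)
    -- scheme = "https", host = "api.example.com", port = "" (empty)
    if port == "" then
        port = (scheme == "https") and "443" or "80"  -- same default as resolve_endpoint
    end

    -- no scheme means no match, which check_schema now reports as "invalid endpoint"
    assert(("api.example.com/v1"):match(endpoint_regex) == nil)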

apisix/plugins/ai-proxy/base.lua

Lines changed: 4 additions & 0 deletions

@@ -27,6 +27,7 @@ local _M = {}
 function _M.set_logging(ctx, summaries, payloads)
     if summaries then
         ctx.llm_summary = {
+            request_model = ctx.var.request_llm_model,
             model = ctx.var.llm_model,
             duration = ctx.var.llm_time_to_first_token,
             prompt_tokens = ctx.var.llm_prompt_tokens,
@@ -70,6 +71,9 @@ function _M.before_proxy(conf, ctx)
     else
         ctx.var.request_type = "ai_chat"
     end
+    if request_body.model then
+        ctx.var.request_llm_model = request_body.model
+    end
     local model = ai_instance.options and ai_instance.options.model or request_body.model
     if model then
         ctx.var.llm_model = model
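
This is where the two model variables diverge: `request_llm_model` records whatever the client put in the request body, while `llm_model` (set just below it) records the model actually used, which an instance's `options.model` may override. The llm_summary entry added above exposes both. A hypothetical outcome, for illustration only:

    -- hypothetical outcome when the client asks for one model but the
    -- selected instance pins another via options.model (values made up)
    --   client body:            { "model": "gpt-3.5-turbo", ... }
    --   instance options.model: "gpt-4o"
    local llm_summary = {
        request_model = "gpt-3.5-turbo",  -- ctx.var.request_llm_model
        model         = "gpt-4o",         -- ctx.var.llm_model
    }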

apisix/plugins/ai-proxy/schema.lua

Lines changed: 2 additions & 0 deletions

@@ -68,6 +68,7 @@ local ai_instance_schema = {
                 "deepseek",
                 "aimlapi",
                 "openai-compatible",
+                "azure-openai"
             }, -- add more providers later
         },
         priority = {
@@ -129,6 +130,7 @@ _M.ai_proxy_schema = {
             "deepseek",
             "aimlapi",
             "openai-compatible",
+            "azure-openai"
         }, -- add more providers later

         },
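
With `azure-openai` accepted by both the `ai-proxy-multi` instance schema and the `ai-proxy` schema, an instance can name the new provider directly. A rough configuration sketch as a Lua table, based on the attribute table in the docs change below; the auth header name, endpoint, and key are assumptions, not values taken from this commit:

    -- rough sketch of an ai-proxy-multi instance using the new provider
    -- (auth header, endpoint, and key are placeholders/assumptions)
    local ai_proxy_multi_conf = {
        instances = {
            {
                name     = "azure-gpt",
                provider = "azure-openai",
                weight   = 1,
                auth     = { header = { ["api-key"] = "<azure-api-key>" } },
                override = { endpoint = "https://<your-resource>.openai.azure.com/<deployment-path>" },
            },
        },
    }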

apisix/plugins/prometheus/exporter.lua

Lines changed: 17 additions & 17 deletions

@@ -211,7 +211,7 @@ function _M.http_init(prometheus_enabled_in_stream)
     metrics.status = prometheus:counter("http_status",
             "HTTP status codes per service in APISIX",
             {"code", "route", "matched_uri", "matched_host", "service", "consumer", "node",
-            "request_type", "llm_model",
+            "request_type", "request_llm_model", "llm_model",
             unpack(extra_labels("http_status"))},
             status_metrics_exptime)

@@ -223,14 +223,14 @@ function _M.http_init(prometheus_enabled_in_stream)
     metrics.latency = prometheus:histogram("http_latency",
             "HTTP request latency in milliseconds per service in APISIX",
             {"type", "route", "service", "consumer", "node",
-            "request_type", "llm_model",
+            "request_type", "request_llm_model", "llm_model",
             unpack(extra_labels("http_latency"))},
             buckets, latency_metrics_exptime)

     metrics.bandwidth = prometheus:counter("bandwidth",
             "Total bandwidth in bytes consumed per service in APISIX",
             {"type", "route", "service", "consumer", "node",
-            "request_type", "llm_model",
+            "request_type", "request_llm_model", "llm_model",
            unpack(extra_labels("bandwidth"))},
             bandwidth_metrics_exptime)

@@ -241,30 +241,30 @@ function _M.http_init(prometheus_enabled_in_stream)
     metrics.llm_latency = prometheus:histogram("llm_latency",
             "LLM request latency in milliseconds",
             {"route_id", "service_id", "consumer", "node",
-            "request_type", "llm_model",
+            "request_type", "request_llm_model", "llm_model",
             unpack(extra_labels("llm_latency"))},
             llm_latency_buckets,
             llm_latency_exptime)

     metrics.llm_prompt_tokens = prometheus:counter("llm_prompt_tokens",
             "LLM service consumed prompt tokens",
             {"route_id", "service_id", "consumer", "node",
-            "request_type", "llm_model",
+            "request_type", "request_llm_model", "llm_model",
             unpack(extra_labels("llm_prompt_tokens"))},
             llm_prompt_tokens_exptime)

     metrics.llm_completion_tokens = prometheus:counter("llm_completion_tokens",
             "LLM service consumed completion tokens",
             {"route_id", "service_id", "consumer", "node",
-            "request_type", "llm_model",
+            "request_type", "request_llm_model", "llm_model",
             unpack(extra_labels("llm_completion_tokens"))},
             llm_completion_tokens_exptime)

     metrics.llm_active_connections = prometheus:gauge("llm_active_connections",
             "Number of active connections to LLM service",
             {"route", "route_id", "matched_uri", "matched_host",
             "service", "service_id", "consumer", "node",
-            "request_type", "llm_model",
+            "request_type", "request_llm_model", "llm_model",
             unpack(extra_labels("llm_active_connections"))},
             llm_active_connections_exptime)

@@ -338,58 +338,58 @@ function _M.http_log(conf, ctx)
     metrics.status:inc(1,
             gen_arr(vars.status, route_id, matched_uri, matched_host,
                     service_id, consumer_name, balancer_ip,
-                    vars.request_type, vars.llm_model,
+                    vars.request_type, vars.request_llm_model, vars.llm_model,
                     unpack(extra_labels("http_status", ctx))))

     local latency, upstream_latency, apisix_latency = latency_details(ctx)
     local latency_extra_label_values = extra_labels("http_latency", ctx)

     metrics.latency:observe(latency,
             gen_arr("request", route_id, service_id, consumer_name, balancer_ip,
-                    vars.request_type, vars.llm_model,
+                    vars.request_type, vars.request_llm_model, vars.llm_model,
                     unpack(latency_extra_label_values)))

     if upstream_latency then
         metrics.latency:observe(upstream_latency,
                 gen_arr("upstream", route_id, service_id, consumer_name, balancer_ip,
-                        vars.request_type, vars.llm_model,
+                        vars.request_type, vars.request_llm_model, vars.llm_model,
                         unpack(latency_extra_label_values)))
     end

     metrics.latency:observe(apisix_latency,
             gen_arr("apisix", route_id, service_id, consumer_name, balancer_ip,
-                    vars.request_type, vars.llm_model,
+                    vars.request_type, vars.request_llm_model, vars.llm_model,
                     unpack(latency_extra_label_values)))

     local bandwidth_extra_label_values = extra_labels("bandwidth", ctx)

     metrics.bandwidth:inc(vars.request_length,
             gen_arr("ingress", route_id, service_id, consumer_name, balancer_ip,
-                    vars.request_type, vars.llm_model,
+                    vars.request_type, vars.request_llm_model, vars.llm_model,
                     unpack(bandwidth_extra_label_values)))

     metrics.bandwidth:inc(vars.bytes_sent,
             gen_arr("egress", route_id, service_id, consumer_name, balancer_ip,
-                    vars.request_type, vars.llm_model,
+                    vars.request_type, vars.request_llm_model, vars.llm_model,
                     unpack(bandwidth_extra_label_values)))

     local llm_time_to_first_token = vars.llm_time_to_first_token
     if llm_time_to_first_token ~= "" then
         metrics.llm_latency:observe(tonumber(llm_time_to_first_token),
                 gen_arr(route_id, service_id, consumer_name, balancer_ip,
-                        vars.request_type, vars.llm_model,
+                        vars.request_type, vars.request_llm_model, vars.llm_model,
                         unpack(extra_labels("llm_latency", ctx))))
     end
     if vars.llm_prompt_tokens ~= "" then
         metrics.llm_prompt_tokens:inc(tonumber(vars.llm_prompt_tokens),
                 gen_arr(route_id, service_id, consumer_name, balancer_ip,
-                        vars.request_type, vars.llm_model,
+                        vars.request_type, vars.request_llm_model, vars.llm_model,
                         unpack(extra_labels("llm_prompt_tokens", ctx))))
     end
     if vars.llm_completion_tokens ~= "" then
         metrics.llm_completion_tokens:inc(tonumber(vars.llm_completion_tokens),
                 gen_arr(route_id, service_id, consumer_name, balancer_ip,
-                        vars.request_type, vars.llm_model,
+                        vars.request_type, vars.request_llm_model, vars.llm_model,
                         unpack(extra_labels("llm_completion_tokens", ctx))))
     end
 end
@@ -787,7 +787,7 @@ local function inc_llm_active_connections(ctx, value)
         value,
         gen_arr(route_name, route_id, matched_uri,
                 matched_host, service_name, service_id, consumer_name, balancer_ip,
-                vars.request_type, vars.llm_model,
+                vars.request_type, vars.request_llm_model, vars.llm_model,
                 unpack(extra_labels("llm_active_connections", ctx)))
     )
 end
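
The exporter change is mechanical but order-sensitive: every metric that declares the new `request_llm_model` label also passes `vars.request_llm_model` at the same position in the matching `gen_arr(...)` call, because the Prometheus client pairs label values with label names strictly by position. A small self-contained sketch of that invariant (the values are placeholders):

    -- self-contained sketch: label names and values are paired by position,
    -- so request_llm_model must sit between request_type and llm_model in
    -- both the metric declaration and every inc()/observe() call.
    local label_names  = { "route_id", "service_id", "consumer", "node",
                           "request_type", "request_llm_model", "llm_model" }
    local label_values = { "1", "2", "alice", "10.0.0.1",
                           "ai_chat", "gpt-3.5-turbo", "gpt-4o" }

    for i, name in ipairs(label_names) do
        print(name .. " = " .. label_values[i])
    end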

docs/en/latest/plugins/ai-proxy-multi.md

Lines changed: 3 additions & 3 deletions

@@ -7,7 +7,7 @@ keywords:
   - ai-proxy-multi
   - AI
   - LLM
-description: The ai-proxy-multi Plugin extends the capabilities of ai-proxy with load balancing, retries, fallbacks, and health chekcs, simplifying the integration with OpenAI, DeepSeek, AIMLAPI, and other OpenAI-compatible APIs.
+description: The ai-proxy-multi Plugin extends the capabilities of ai-proxy with load balancing, retries, fallbacks, and health chekcs, simplifying the integration with OpenAI, DeepSeek, Azure, AIMLAPI, and other OpenAI-compatible APIs.
 ---

 <!--
@@ -35,7 +35,7 @@ description: The ai-proxy-multi Plugin extends the capabilities of ai-proxy with

 ## Description

-The `ai-proxy-multi` Plugin simplifies access to LLM and embedding models by transforming Plugin configurations into the designated request format for OpenAI, DeepSeek, AIMLAPI, and other OpenAI-compatible APIs. It extends the capabilities of [`ai-proxy-multi`](./ai-proxy.md) with load balancing, retries, fallbacks, and health checks.
+The `ai-proxy-multi` Plugin simplifies access to LLM and embedding models by transforming Plugin configurations into the designated request format for OpenAI, DeepSeek, Azure, AIMLAPI, and other OpenAI-compatible APIs. It extends the capabilities of [`ai-proxy-multi`](./ai-proxy.md) with load balancing, retries, fallbacks, and health checks.

 In addition, the Plugin also supports logging LLM request information in the access log, such as token usage, model, time to the first response, and more.

@@ -58,7 +58,7 @@ In addition, the Plugin also supports logging LLM request information in the acc
 | balancer.key | string | False | | | Used when `type` is `chash`. When `hash_on` is set to `header` or `cookie`, `key` is required. When `hash_on` is set to `consumer`, `key` is not required as the consumer name will be used as the key automatically. |
 | instances | array[object] | True | | | LLM instance configurations. |
 | instances.name | string | True | | | Name of the LLM service instance. |
-| instances.provider | string | True | | [openai, deepseek, aimlapi, openai-compatible] | LLM service provider. When set to `openai`, the Plugin will proxy the request to `api.openai.com`. When set to `deepseek`, the Plugin will proxy the request to `api.deepseek.com`. When set to `aimlapi`, the Plugin uses the OpenAI-compatible driver and proxies the request to `api.aimlapi.com` by default. When set to `openai-compatible`, the Plugin will proxy the request to the custom endpoint configured in `override`. |
+| instances.provider | string | True | | [openai, deepseek, azure-openai, aimlapi, openai-compatible] | LLM service provider. When set to `openai`, the Plugin will proxy the request to `api.openai.com`. When set to `deepseek`, the Plugin will proxy the request to `api.deepseek.com`. When set to `aimlapi`, the Plugin uses the OpenAI-compatible driver and proxies the request to `api.aimlapi.com` by default. When set to `openai-compatible`, the Plugin will proxy the request to the custom endpoint configured in `override`. |
 | instances.priority | integer | False | 0 | | Priority of the LLM instance in load balancing. `priority` takes precedence over `weight`. |
 | instances.weight | string | True | 0 | greater or equal to 0 | Weight of the LLM instance in load balancing. |
 | instances.auth | object | True | | | Authentication configurations. |
