diff --git a/pkg/ext-proc/handlers/response.go b/pkg/ext-proc/handlers/response.go
index 85b459742..6f8293e38 100644
--- a/pkg/ext-proc/handlers/response.go
+++ b/pkg/ext-proc/handlers/response.go
@@ -73,6 +73,7 @@ func (s *Server) HandleResponseBody(reqCtx *RequestContext, req *extProcPb.Proce
return nil, fmt.Errorf("unmarshaling response body: %v", err)
}
reqCtx.Response = res
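+	// Size of the response body in bytes; in the non-streaming case this is the complete, buffered response body.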
+ reqCtx.ResponseSize = len(body.ResponseBody.Body)
// ResponseComplete is to indicate the response is complete. In non-streaming
// case, it will be set to be true once the response is processed; in
// streaming case, it will be set to be true once the last chunk is processed.
diff --git a/pkg/ext-proc/handlers/server.go b/pkg/ext-proc/handlers/server.go
index abc7ebe08..d152e12f4 100644
--- a/pkg/ext-proc/handlers/server.go
+++ b/pkg/ext-proc/handlers/server.go
@@ -95,6 +95,9 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
if err == nil && reqCtx.ResponseComplete {
reqCtx.ResponseCompleteTimestamp = time.Now()
metrics.RecordRequestLatencies(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
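+				// Record response body size and token usage now that the complete response has been processed.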
+ metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
+ metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Response.Usage.PromptTokens)
+ metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Response.Usage.CompletionTokens)
}
klog.V(3).Infof("Request context after HandleResponseBody: %+v", reqCtx)
default:
@@ -138,5 +141,6 @@ type RequestContext struct {
ResponseCompleteTimestamp time.Time
RequestSize int
Response Response
+ ResponseSize int
ResponseComplete bool
}
diff --git a/pkg/ext-proc/metrics/README.md b/pkg/ext-proc/metrics/README.md
index fdc24eacc..1094bc23d 100644
--- a/pkg/ext-proc/metrics/README.md
+++ b/pkg/ext-proc/metrics/README.md
@@ -6,13 +6,46 @@ This documentation is the current state of exposed metrics.
* [Exposed Metrics](#exposed-metrics)
* [Scrape Metrics](#scrape-metrics)
+## Requirements
+
+Response metrics are only supported in non-streaming mode; streaming support will be addressed in a follow-up [issue](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/178).
+
+Currently there are two options:
+- If requests don't use response streaming, you can enable `Buffered` mode for the response in the `EnvoyExtensionPolicy` (see the first example below). This buffers the response body at the proxy and forwards it to the endpoint picker, which allows the endpoint picker to report response metrics.
+
+- If requests use response streaming, enabling `Buffered` mode is not recommended; leave the response body processing mode empty in the `EnvoyExtensionPolicy` (the default, shown in the second example below). In this case, response bodies are not forwarded to the endpoint picker, and therefore response metrics are not reported.
+
+```
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: EnvoyExtensionPolicy
+metadata:
+  name: ext-proc-policy
+  namespace: default
+spec:
+  extProc:
+    - backendRefs:
+        - group: ""
+          kind: Service
+          name: inference-gateway-ext-proc
+          port: 9002
+      processingMode:
+        request:
+          body: Buffered
+        response:
+          body: Buffered
+```
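+
+For the streaming case, a minimal sketch of the same policy with the response body processing mode left unset (reusing the service name and port from the example above):
+
+```
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: EnvoyExtensionPolicy
+metadata:
+  name: ext-proc-policy
+  namespace: default
+spec:
+  extProc:
+    - backendRefs:
+        - group: ""
+          kind: Service
+          name: inference-gateway-ext-proc
+          port: 9002
+      processingMode:
+        request:
+          body: Buffered
+```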
+
## Exposed metrics
| Metric name | Metric Type | Description | Labels | Status |
| ------------|--------------| ----------- | ------ | ------ |
-| inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name> `target_model_name`=<target-model-name> ` | ALPHA |
-| inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name> `target_model_name`=<target-model-name> ` | ALPHA |
-| inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name> `target_model_name`=<target-model-name> ` | ALPHA |
+| inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=<model-name> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=<model-name> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_input_tokens | Distribution | Distribution of input token count. | `model_name`=<model-name> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_output_tokens | Distribution | Distribution of output token count. | `model_name`=<model-name> `target_model_name`=<target-model-name> | ALPHA |
## Scrape Metrics
diff --git a/pkg/ext-proc/metrics/metrics.go b/pkg/ext-proc/metrics/metrics.go
index 407144a4a..8cb7bd274 100644
--- a/pkg/ext-proc/metrics/metrics.go
+++ b/pkg/ext-proc/metrics/metrics.go
@@ -51,6 +51,43 @@ var (
},
[]string{"model_name", "target_model_name"},
)
+
+ responseSizes = compbasemetrics.NewHistogramVec(
+ &compbasemetrics.HistogramOpts{
+ Subsystem: InferenceModelComponent,
+ Name: "response_sizes",
+ Help: "Inference model responses size distribution in bytes for each model and target model.",
+ // Most models have a response token < 8192 tokens. Each token, in average, has 4 characters.
+ // 8192 * 4 = 32768.
+ Buckets: []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32778, 65536},
+ StabilityLevel: compbasemetrics.ALPHA,
+ },
+ []string{"model_name", "target_model_name"},
+ )
+
+ inputTokens = compbasemetrics.NewHistogramVec(
+ &compbasemetrics.HistogramOpts{
+ Subsystem: InferenceModelComponent,
+ Name: "input_tokens",
+ Help: "Inference model input token count distribution for requests in each model.",
+ // Most models have an input context window of less than 1 million tokens.
+ Buckets: []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32778, 65536, 131072, 262144, 524288, 1048576},
+ StabilityLevel: compbasemetrics.ALPHA,
+ },
+ []string{"model_name", "target_model_name"},
+ )
+
+ outputTokens = compbasemetrics.NewHistogramVec(
+ &compbasemetrics.HistogramOpts{
+ Subsystem: InferenceModelComponent,
+ Name: "output_tokens",
+ Help: "Inference model output token count distribution for requests in each model.",
+ // Most models generate fewer than 8192 output tokens.
+ Buckets: []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192},
+ StabilityLevel: compbasemetrics.ALPHA,
+ },
+ []string{"model_name", "target_model_name"},
+ )
)
var registerMetrics sync.Once
@@ -61,6 +98,9 @@ func Register() {
legacyregistry.MustRegister(requestCounter)
legacyregistry.MustRegister(requestLatencies)
legacyregistry.MustRegister(requestSizes)
+ legacyregistry.MustRegister(responseSizes)
+ legacyregistry.MustRegister(inputTokens)
+ legacyregistry.MustRegister(outputTokens)
})
}
@@ -84,3 +124,22 @@ func RecordRequestLatencies(modelName, targetModelName string, received time.Tim
requestLatencies.WithLabelValues(modelName, targetModelName).Observe(elapsedSeconds)
return true
}
+
+// RecordResponseSizes records the response sizes.
+func RecordResponseSizes(modelName, targetModelName string, size int) {
+ responseSizes.WithLabelValues(modelName, targetModelName).Observe(float64(size))
+}
+
+// RecordInputTokens records the input token count.
+func RecordInputTokens(modelName, targetModelName string, size int) {
+ if size > 0 {
+ inputTokens.WithLabelValues(modelName, targetModelName).Observe(float64(size))
+ }
+}
+
+// RecordOutputTokens records the output token count.
+func RecordOutputTokens(modelName, targetModelName string, size int) {
+ if size > 0 {
+ outputTokens.WithLabelValues(modelName, targetModelName).Observe(float64(size))
+ }
+}
diff --git a/pkg/ext-proc/metrics/metrics_test.go b/pkg/ext-proc/metrics/metrics_test.go
index 80241baa9..57774b11a 100644
--- a/pkg/ext-proc/metrics/metrics_test.go
+++ b/pkg/ext-proc/metrics/metrics_test.go
@@ -12,6 +12,9 @@ import (
const RequestTotalMetric = InferenceModelComponent + "_request_total"
const RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds"
const RequestSizesMetric = InferenceModelComponent + "_request_sizes"
+const ResponseSizesMetric = InferenceModelComponent + "_response_sizes"
+const InputTokensMetric = InferenceModelComponent + "_input_tokens"
+const OutputTokensMetric = InferenceModelComponent + "_output_tokens"
func TestRecordRequestCounterandSizes(t *testing.T) {
type requests struct {
@@ -160,3 +163,97 @@ func TestRecordRequestLatencies(t *testing.T) {
})
}
}
+
+func TestRecordResponseMetrics(t *testing.T) {
+ type responses struct {
+ modelName string
+ targetModelName string
+ inputToken int
+ outputToken int
+ respSize int
+ }
+ scenarios := []struct {
+ name string
+ resp []responses
+ }{{
+ name: "multiple requests",
+ resp: []responses{
+ {
+ modelName: "m10",
+ targetModelName: "t10",
+ respSize: 1200,
+ inputToken: 10,
+ outputToken: 100,
+ },
+ {
+ modelName: "m10",
+ targetModelName: "t10",
+ respSize: 500,
+ inputToken: 20,
+ outputToken: 200,
+ },
+ {
+ modelName: "m10",
+ targetModelName: "t11",
+ respSize: 2480,
+ inputToken: 30,
+ outputToken: 300,
+ },
+ {
+ modelName: "m20",
+ targetModelName: "t20",
+ respSize: 80,
+ inputToken: 40,
+ outputToken: 400,
+ },
+ },
+ }}
+ Register()
+ for _, scenario := range scenarios {
+ t.Run(scenario.name, func(t *testing.T) {
+ for _, resp := range scenario.resp {
+ RecordInputTokens(resp.modelName, resp.targetModelName, resp.inputToken)
+ RecordOutputTokens(resp.modelName, resp.targetModelName, resp.outputToken)
+ RecordResponseSizes(resp.modelName, resp.targetModelName, resp.respSize)
+ }
+ wantResponseSize, err := os.Open("testdata/response_sizes_metric")
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer func() {
+ if err := wantResponseSize.Close(); err != nil {
+ t.Error(err)
+ }
+ }()
+ if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantResponseSize, ResponseSizesMetric); err != nil {
+ t.Error(err)
+ }
+
+ wantInputToken, err := os.Open("testdata/input_tokens_metric")
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer func() {
+ if err := wantInputToken.Close(); err != nil {
+ t.Error(err)
+ }
+ }()
+ if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantInputToken, InputTokensMetric); err != nil {
+ t.Error(err)
+ }
+
+ wantOutputToken, err := os.Open("testdata/output_tokens_metric")
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer func() {
+ if err := wantOutputToken.Close(); err != nil {
+ t.Error(err)
+ }
+ }()
+ if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantOutputToken, OutputTokensMetric); err != nil {
+ t.Error(err)
+ }
+ })
+ }
+}
diff --git a/pkg/ext-proc/metrics/testdata/input_tokens_metric b/pkg/ext-proc/metrics/testdata/input_tokens_metric
new file mode 100644
index 000000000..245c7dfa7
--- /dev/null
+++ b/pkg/ext-proc/metrics/testdata/input_tokens_metric
@@ -0,0 +1,68 @@
+# HELP inference_model_input_tokens [ALPHA] Inference model input token count distribution for requests in each model.
+# TYPE inference_model_input_tokens histogram
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="1"} 0
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="8"} 0
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="16"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="32"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="64"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="128"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="256"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="512"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="1024"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="2048"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="4096"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="8192"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="16384"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="32778"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="65536"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="131072"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="262144"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="524288"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="1.048576e+06"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2
+inference_model_input_tokens_sum{model_name="m10",target_model_name="t10"} 30
+inference_model_input_tokens_count{model_name="m10",target_model_name="t10"} 2
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="1"} 0
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="8"} 0
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="16"} 0
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="32"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="64"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="128"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="256"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="512"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="1024"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="2048"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="4096"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="8192"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="16384"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="32778"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="65536"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="131072"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="262144"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="524288"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="1.048576e+06"} 1
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1
+inference_model_input_tokens_sum{model_name="m10",target_model_name="t11"} 30
+inference_model_input_tokens_count{model_name="m10",target_model_name="t11"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="1"} 0
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="8"} 0
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="16"} 0
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="32"} 0
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="64"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="128"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="256"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="512"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="1024"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="2048"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="4096"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="8192"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="16384"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="32778"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="65536"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="131072"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="262144"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="524288"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="1.048576e+06"} 1
+inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1
+inference_model_input_tokens_sum{model_name="m20",target_model_name="t20"} 40
+inference_model_input_tokens_count{model_name="m20",target_model_name="t20"} 1
diff --git a/pkg/ext-proc/metrics/testdata/output_tokens_metric b/pkg/ext-proc/metrics/testdata/output_tokens_metric
new file mode 100644
index 000000000..40bbe3272
--- /dev/null
+++ b/pkg/ext-proc/metrics/testdata/output_tokens_metric
@@ -0,0 +1,47 @@
+# HELP inference_model_output_tokens [ALPHA] Inference model output token count distribution for requests in each model.
+# TYPE inference_model_output_tokens histogram
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="1"} 0
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="8"} 0
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="16"} 0
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="32"} 0
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="64"} 0
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="128"} 1
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="256"} 2
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="512"} 2
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="1024"} 2
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="2048"} 2
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="4096"} 2
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="8192"} 2
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2
+inference_model_output_tokens_sum{model_name="m10",target_model_name="t10"} 300
+inference_model_output_tokens_count{model_name="m10",target_model_name="t10"} 2
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="1"} 0
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="8"} 0
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="16"} 0
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="32"} 0
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="64"} 0
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="128"} 0
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="256"} 0
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="512"} 1
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="1024"} 1
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="2048"} 1
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="4096"} 1
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="8192"} 1
+inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1
+inference_model_output_tokens_sum{model_name="m10",target_model_name="t11"} 300
+inference_model_output_tokens_count{model_name="m10",target_model_name="t11"} 1
+inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="1"} 0
+inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="8"} 0
+inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="16"} 0
+inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="32"} 0
+inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="64"} 0
+inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="128"} 0
+inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="256"} 0
+inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="512"} 1
+inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="1024"} 1
+inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="2048"} 1
+inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="4096"} 1
+inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="8192"} 1
+inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1
+inference_model_output_tokens_sum{model_name="m20",target_model_name="t20"} 400
+inference_model_output_tokens_count{model_name="m20",target_model_name="t20"} 1
diff --git a/pkg/ext-proc/metrics/testdata/response_sizes_metric b/pkg/ext-proc/metrics/testdata/response_sizes_metric
new file mode 100644
index 000000000..7f981090c
--- /dev/null
+++ b/pkg/ext-proc/metrics/testdata/response_sizes_metric
@@ -0,0 +1,56 @@
+# HELP inference_model_response_sizes [ALPHA] Inference model response size distribution in bytes for each model and target model.
+# TYPE inference_model_response_sizes histogram
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="1"} 0
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="8"} 0
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="16"} 0
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="32"} 0
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="64"} 0
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="128"} 0
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="256"} 0
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="512"} 1
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="1024"} 1
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="2048"} 2
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="4096"} 2
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="8192"} 2
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="16384"} 2
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="32778"} 2
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="65536"} 2
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2
+inference_model_response_sizes_sum{model_name="m10",target_model_name="t10"} 1700
+inference_model_response_sizes_count{model_name="m10",target_model_name="t10"} 2
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="1"} 0
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="8"} 0
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="16"} 0
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="32"} 0
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="64"} 0
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="128"} 0
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="256"} 0
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="512"} 0
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="1024"} 0
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="2048"} 0
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="4096"} 1
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="8192"} 1
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="16384"} 1
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="32778"} 1
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="65536"} 1
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1
+inference_model_response_sizes_sum{model_name="m10",target_model_name="t11"} 2480
+inference_model_response_sizes_count{model_name="m10",target_model_name="t11"} 1
+inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="1"} 0
+inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="8"} 0
+inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="16"} 0
+inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="32"} 0
+inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="64"} 0
+inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="128"} 1
+inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="256"} 1
+inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="512"} 1
+inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="1024"} 1
+inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="2048"} 1
+inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="4096"} 1
+inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="8192"} 1
+inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="16384"} 1
+inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="32778"} 1
+inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="65536"} 1
+inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1
+inference_model_response_sizes_sum{model_name="m20",target_model_name="t20"} 80
+inference_model_response_sizes_count{model_name="m20",target_model_name="t20"} 1