diff --git a/pkg/ext-proc/handlers/response.go b/pkg/ext-proc/handlers/response.go
index 85b459742..6f8293e38 100644
--- a/pkg/ext-proc/handlers/response.go
+++ b/pkg/ext-proc/handlers/response.go
@@ -73,6 +73,7 @@ func (s *Server) HandleResponseBody(reqCtx *RequestContext, req *extProcPb.Proce
 		return nil, fmt.Errorf("unmarshaling response body: %v", err)
 	}
 	reqCtx.Response = res
+	reqCtx.ResponseSize = len(body.ResponseBody.Body)
 	// ResponseComplete is to indicate the response is complete. In non-streaming
 	// case, it will be set to be true once the response is processed; in
 	// streaming case, it will be set to be true once the last chunk is processed.
diff --git a/pkg/ext-proc/handlers/server.go b/pkg/ext-proc/handlers/server.go
index abc7ebe08..d152e12f4 100644
--- a/pkg/ext-proc/handlers/server.go
+++ b/pkg/ext-proc/handlers/server.go
@@ -95,6 +95,9 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
 			if err == nil && reqCtx.ResponseComplete {
 				reqCtx.ResponseCompleteTimestamp = time.Now()
 				metrics.RecordRequestLatencies(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
+				metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
+				metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Response.Usage.PromptTokens)
+				metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Response.Usage.CompletionTokens)
 			}
 			klog.V(3).Infof("Request context after HandleResponseBody: %+v", reqCtx)
 		default:
@@ -138,5 +141,6 @@ type RequestContext struct {
 	ResponseCompleteTimestamp time.Time
 	RequestSize               int
 	Response                  Response
+	ResponseSize              int
 	ResponseComplete          bool
 }
diff --git a/pkg/ext-proc/metrics/README.md b/pkg/ext-proc/metrics/README.md
index fdc24eacc..1094bc23d 100644
--- a/pkg/ext-proc/metrics/README.md
+++ b/pkg/ext-proc/metrics/README.md
@@ -6,13 +6,46 @@ This documentation is the current state of exposed metrics.
 * [Exposed Metrics](#exposed-metrics)
 * [Scrape Metrics](#scrape-metrics)
 
+## Requirements
+
+Response metrics are only supported in non-streaming mode; streaming support is tracked in a follow-up [issue](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/178).
+
+Currently there are two options:
+- If requests don't use response streaming, you can enable `Buffered` mode for the response in the `EnvoyExtensionPolicy`. This buffers the response body at the proxy and forwards it to the endpoint picker, which allows the endpoint picker to report response metrics.
+
+- If requests use response streaming, enabling `Buffered` mode is not recommended; leave the response body processing mode empty in the `EnvoyExtensionPolicy` (the default). In this case response bodies are not forwarded to the endpoint picker, so response metrics are not reported.
+
+
+```
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: EnvoyExtensionPolicy
+metadata:
+  name: ext-proc-policy
+  namespace: default
+spec:
+  extProc:
+    - backendRefs:
+      - group: ""
+        kind: Service
+        name: inference-gateway-ext-proc
+        port: 9002
+      processingMode:
+        request:
+          body: Buffered
+        response:
+          body: Buffered
+```
+
 ## Exposed metrics
 
 | Metric name | Metric Type | Description | Labels | Status |
 | ------------|--------------| ----------- | ------ | ------ |
-| inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name> <br /> `target_model_name`=<target-model-name> ` | ALPHA |
-| inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name> <br /> `target_model_name`=<target-model-name> ` | ALPHA |
-| inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name> <br /> `target_model_name`=<target-model-name> ` | ALPHA |
+| inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name> <br /> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name> <br /> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=<model-name> <br /> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=<model-name> <br /> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_input_tokens | Distribution | Distribution of input token count. | `model_name`=<model-name> <br /> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_output_tokens | Distribution | Distribution of output token count. | `model_name`=<model-name> <br /> `target_model_name`=<target-model-name> | ALPHA |
 
 ## Scrape Metrics
 
diff --git a/pkg/ext-proc/metrics/metrics.go b/pkg/ext-proc/metrics/metrics.go
index 407144a4a..8cb7bd274 100644
--- a/pkg/ext-proc/metrics/metrics.go
+++ b/pkg/ext-proc/metrics/metrics.go
@@ -51,6 +51,43 @@ var (
 		},
 		[]string{"model_name", "target_model_name"},
 	)
+
+	responseSizes = compbasemetrics.NewHistogramVec(
+		&compbasemetrics.HistogramOpts{
+			Subsystem: InferenceModelComponent,
+			Name:      "response_sizes",
+			Help:      "Inference model responses size distribution in bytes for each model and target model.",
+			// Most models have a response of fewer than 8192 tokens, and each token is roughly 4 characters on average.
+			// 8192 * 4 = 32768.
+			Buckets:        []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32778, 65536},
+			StabilityLevel: compbasemetrics.ALPHA,
+		},
+		[]string{"model_name", "target_model_name"},
+	)
+
+	inputTokens = compbasemetrics.NewHistogramVec(
+		&compbasemetrics.HistogramOpts{
+			Subsystem: InferenceModelComponent,
+			Name:      "input_tokens",
+			Help:      "Inference model input token count distribution for requests in each model.",
+			// Most models have an input context window of less than 1 million tokens.
+			Buckets:        []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32778, 65536, 131072, 262144, 524288, 1048576},
+			StabilityLevel: compbasemetrics.ALPHA,
+		},
+		[]string{"model_name", "target_model_name"},
+	)
+
+	outputTokens = compbasemetrics.NewHistogramVec(
+		&compbasemetrics.HistogramOpts{
+			Subsystem: InferenceModelComponent,
+			Name:      "output_tokens",
+			Help:      "Inference model output token count distribution for requests in each model.",
+			// Most models generate fewer than 8192 output tokens.
+			Buckets:        []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192},
+			StabilityLevel: compbasemetrics.ALPHA,
+		},
+		[]string{"model_name", "target_model_name"},
+	)
 )
 
 var registerMetrics sync.Once
@@ -61,6 +98,9 @@ func Register() {
 		legacyregistry.MustRegister(requestCounter)
 		legacyregistry.MustRegister(requestLatencies)
 		legacyregistry.MustRegister(requestSizes)
+		legacyregistry.MustRegister(responseSizes)
+		legacyregistry.MustRegister(inputTokens)
+		legacyregistry.MustRegister(outputTokens)
 	})
 }
 
@@ -84,3 +124,22 @@ func RecordRequestLatencies(modelName, targetModelName string, received time.Tim
 	requestLatencies.WithLabelValues(modelName, targetModelName).Observe(elapsedSeconds)
 	return true
 }
+
+// RecordResponseSizes records the response sizes.
+func RecordResponseSizes(modelName, targetModelName string, size int) {
+	responseSizes.WithLabelValues(modelName, targetModelName).Observe(float64(size))
+}
+
+// RecordInputTokens records the input token count.
+func RecordInputTokens(modelName, targetModelName string, size int) {
+	if size > 0 {
+		inputTokens.WithLabelValues(modelName, targetModelName).Observe(float64(size))
+	}
+}
+
+// RecordOutputTokens records the output token count.
+func RecordOutputTokens(modelName, targetModelName string, size int) { + if size > 0 { + outputTokens.WithLabelValues(modelName, targetModelName).Observe(float64(size)) + } +} diff --git a/pkg/ext-proc/metrics/metrics_test.go b/pkg/ext-proc/metrics/metrics_test.go index 80241baa9..57774b11a 100644 --- a/pkg/ext-proc/metrics/metrics_test.go +++ b/pkg/ext-proc/metrics/metrics_test.go @@ -12,6 +12,9 @@ import ( const RequestTotalMetric = InferenceModelComponent + "_request_total" const RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds" const RequestSizesMetric = InferenceModelComponent + "_request_sizes" +const ResponseSizesMetric = InferenceModelComponent + "_response_sizes" +const InputTokensMetric = InferenceModelComponent + "_input_tokens" +const OutputTokensMetric = InferenceModelComponent + "_output_tokens" func TestRecordRequestCounterandSizes(t *testing.T) { type requests struct { @@ -160,3 +163,97 @@ func TestRecordRequestLatencies(t *testing.T) { }) } } + +func TestRecordResponseMetrics(t *testing.T) { + type responses struct { + modelName string + targetModelName string + inputToken int + outputToken int + respSize int + } + scenarios := []struct { + name string + resp []responses + }{{ + name: "multiple requests", + resp: []responses{ + { + modelName: "m10", + targetModelName: "t10", + respSize: 1200, + inputToken: 10, + outputToken: 100, + }, + { + modelName: "m10", + targetModelName: "t10", + respSize: 500, + inputToken: 20, + outputToken: 200, + }, + { + modelName: "m10", + targetModelName: "t11", + respSize: 2480, + inputToken: 30, + outputToken: 300, + }, + { + modelName: "m20", + targetModelName: "t20", + respSize: 80, + inputToken: 40, + outputToken: 400, + }, + }, + }} + Register() + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + for _, resp := range scenario.resp { + RecordInputTokens(resp.modelName, resp.targetModelName, resp.inputToken) + RecordOutputTokens(resp.modelName, resp.targetModelName, resp.outputToken) + RecordResponseSizes(resp.modelName, resp.targetModelName, resp.respSize) + } + wantResponseSize, err := os.Open("testdata/response_sizes_metric") + defer func() { + if err := wantResponseSize.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantResponseSize, ResponseSizesMetric); err != nil { + t.Error(err) + } + + wantInputToken, err := os.Open("testdata/input_tokens_metric") + defer func() { + if err := wantInputToken.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantInputToken, InputTokensMetric); err != nil { + t.Error(err) + } + + wantOutputToken, err := os.Open("testdata/output_tokens_metric") + defer func() { + if err := wantOutputToken.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantOutputToken, OutputTokensMetric); err != nil { + t.Error(err) + } + }) + } +} diff --git a/pkg/ext-proc/metrics/testdata/input_tokens_metric b/pkg/ext-proc/metrics/testdata/input_tokens_metric new file mode 100644 index 000000000..245c7dfa7 --- /dev/null +++ b/pkg/ext-proc/metrics/testdata/input_tokens_metric @@ -0,0 +1,68 @@ +# HELP inference_model_input_tokens [ALPHA] Inference model input token count distribution for requests in each model. 
+# TYPE inference_model_input_tokens histogram +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="1"} 0 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="8"} 0 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="16"} 1 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="32"} 2 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="64"} 2 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="128"} 2 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="256"} 2 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="512"} 2 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="1024"} 2 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="2048"} 2 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="4096"} 2 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="8192"} 2 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="16384"} 2 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="32778"} 2 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="65536"} 2 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="131072"} 2 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="262144"} 2 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="524288"} 2 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="1.048576e+06"} 2 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2 +inference_model_input_tokens_sum{model_name="m10",target_model_name="t10"} 30 +inference_model_input_tokens_count{model_name="m10",target_model_name="t10"} 2 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="1"} 0 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="8"} 0 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="16"} 0 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="32"} 1 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="64"} 1 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="128"} 1 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="256"} 1 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="512"} 1 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="1024"} 1 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="2048"} 1 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="4096"} 1 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="8192"} 1 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="16384"} 1 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="32778"} 1 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="65536"} 1 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="131072"} 1 
+inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="262144"} 1 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="524288"} 1 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="1.048576e+06"} 1 +inference_model_input_tokens_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 +inference_model_input_tokens_sum{model_name="m10",target_model_name="t11"} 30 +inference_model_input_tokens_count{model_name="m10",target_model_name="t11"} 1 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="1"} 0 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="8"} 0 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="16"} 0 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="32"} 0 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="64"} 1 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="128"} 1 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="256"} 1 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="512"} 1 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="1024"} 1 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="2048"} 1 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="4096"} 1 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="8192"} 1 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="16384"} 1 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="32778"} 1 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="65536"} 1 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="131072"} 1 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="262144"} 1 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="524288"} 1 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="1.048576e+06"} 1 +inference_model_input_tokens_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 +inference_model_input_tokens_sum{model_name="m20",target_model_name="t20"} 40 +inference_model_input_tokens_count{model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/ext-proc/metrics/testdata/output_tokens_metric b/pkg/ext-proc/metrics/testdata/output_tokens_metric new file mode 100644 index 000000000..40bbe3272 --- /dev/null +++ b/pkg/ext-proc/metrics/testdata/output_tokens_metric @@ -0,0 +1,47 @@ +# HELP inference_model_output_tokens [ALPHA] Inference model output token count distribution for requests in each model. 
+# TYPE inference_model_output_tokens histogram +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="1"} 0 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="8"} 0 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="16"} 0 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="32"} 0 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="64"} 0 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="128"} 1 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="256"} 2 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="512"} 2 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="1024"} 2 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="2048"} 2 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="4096"} 2 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="8192"} 2 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2 +inference_model_output_tokens_sum{model_name="m10",target_model_name="t10"} 300 +inference_model_output_tokens_count{model_name="m10",target_model_name="t10"} 2 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="1"} 0 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="8"} 0 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="16"} 0 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="32"} 0 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="64"} 0 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="128"} 0 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="256"} 0 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="512"} 1 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="1024"} 1 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="2048"} 1 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="4096"} 1 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="8192"} 1 +inference_model_output_tokens_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 +inference_model_output_tokens_sum{model_name="m10",target_model_name="t11"} 300 +inference_model_output_tokens_count{model_name="m10",target_model_name="t11"} 1 +inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="1"} 0 +inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="8"} 0 +inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="16"} 0 +inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="32"} 0 +inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="64"} 0 +inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="128"} 0 +inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="256"} 0 +inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="512"} 1 +inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="1024"} 1 
+inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="2048"} 1 +inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="4096"} 1 +inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="8192"} 1 +inference_model_output_tokens_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 +inference_model_output_tokens_sum{model_name="m20",target_model_name="t20"} 400 +inference_model_output_tokens_count{model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/ext-proc/metrics/testdata/response_sizes_metric b/pkg/ext-proc/metrics/testdata/response_sizes_metric new file mode 100644 index 000000000..7f981090c --- /dev/null +++ b/pkg/ext-proc/metrics/testdata/response_sizes_metric @@ -0,0 +1,56 @@ +# HELP inference_model_response_sizes [ALPHA] Inference model responses size distribution in bytes for each model and target model. +# TYPE inference_model_response_sizes histogram +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="1"} 0 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="8"} 0 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="16"} 0 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="32"} 0 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="64"} 0 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="128"} 0 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="256"} 0 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="512"} 1 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="1024"} 1 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="2048"} 2 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="4096"} 2 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="8192"} 2 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="16384"} 2 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="32778"} 2 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="65536"} 2 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t10",le="+Inf"} 2 +inference_model_response_sizes_sum{model_name="m10",target_model_name="t10"} 1700 +inference_model_response_sizes_count{model_name="m10",target_model_name="t10"} 2 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="1"} 0 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="8"} 0 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="16"} 0 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="32"} 0 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="64"} 0 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="128"} 0 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="256"} 0 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="512"} 0 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="1024"} 0 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="2048"} 0 
+inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="4096"} 1 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="8192"} 1 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="16384"} 1 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="32778"} 1 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="65536"} 1 +inference_model_response_sizes_bucket{model_name="m10",target_model_name="t11",le="+Inf"} 1 +inference_model_response_sizes_sum{model_name="m10",target_model_name="t11"} 2480 +inference_model_response_sizes_count{model_name="m10",target_model_name="t11"} 1 +inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="1"} 0 +inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="8"} 0 +inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="16"} 0 +inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="32"} 0 +inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="64"} 0 +inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="128"} 1 +inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="256"} 1 +inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="512"} 1 +inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="1024"} 1 +inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="2048"} 1 +inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="4096"} 1 +inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="8192"} 1 +inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="16384"} 1 +inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="32778"} 1 +inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="65536"} 1 +inference_model_response_sizes_bucket{model_name="m20",target_model_name="t20",le="+Inf"} 1 +inference_model_response_sizes_sum{model_name="m20",target_model_name="t20"} 80 +inference_model_response_sizes_count{model_name="m20",target_model_name="t20"} 1
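
Reviewer note: the sketch below is not part of this diff; it only illustrates how a buffered, non-streaming response body's `usage` block ends up in the new recorders. The `Response`/`Usage` shapes and JSON field names are assumptions for illustration (the real types live in the ext-proc handlers package and are not shown here).

```
// Minimal, self-contained sketch, assuming an OpenAI-style completion
// response with a "usage" block; the struct shapes below are illustrative only.
package main

import (
	"encoding/json"
	"fmt"
)

// Usage mirrors the usage block of a non-streaming completion response.
type Usage struct {
	PromptTokens     int `json:"prompt_tokens"`
	CompletionTokens int `json:"completion_tokens"`
	TotalTokens      int `json:"total_tokens"`
}

// Response keeps only the field the metrics path needs.
type Response struct {
	Usage Usage `json:"usage"`
}

func main() {
	// A trimmed response body, as forwarded to the endpoint picker when the
	// response processing mode is Buffered.
	body := []byte(`{"usage":{"prompt_tokens":10,"completion_tokens":100,"total_tokens":110}}`)

	var res Response
	if err := json.Unmarshal(body, &res); err != nil {
		panic(err)
	}

	// These are the values the server hands to the new recorders:
	//   RecordResponseSizes(model, target, len(body))                 -> inference_model_response_sizes
	//   RecordInputTokens(model, target, res.Usage.PromptTokens)      -> inference_model_input_tokens
	//   RecordOutputTokens(model, target, res.Usage.CompletionTokens) -> inference_model_output_tokens
	fmt.Println(len(body), res.Usage.PromptTokens, res.Usage.CompletionTokens)
}
```

Since `RecordInputTokens` and `RecordOutputTokens` only observe values greater than zero, a response without a populated `usage` block simply contributes no samples to the token histograms.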