diff --git a/api/v1alpha1/inferencemodel_types.go b/api/v1alpha1/inferencemodel_types.go
index dea51ba45..3661820d8 100644
--- a/api/v1alpha1/inferencemodel_types.go
+++ b/api/v1alpha1/inferencemodel_types.go
@@ -58,23 +58,29 @@ type InferenceModelList struct {
 // creation timestamp, will be selected to remain valid. In the event of a race
 // condition, one will be selected at random.
 type InferenceModelSpec struct {
-	// ModelName is the name of the model as the users set in the "model" parameter in the requests.
-	// The name should be unique among the workloads that reference the same backend pool.
-	// This is the parameter that will be used to match the request with. In the future, we may
-	// allow to match on other request parameters. The other approach to support matching
-	// on other request parameters is to use a different ModelName per HTTPFilter.
-	// Names can be reserved without implementing an actual model in the pool.
+	// ModelName is the name of the model as it will be set in the "model" parameter for an incoming request.
+	// ModelNames must be unique for a referencing InferencePool
+	// (names can be reused for a different pool in the same cluster).
+	// If multiple InferenceModels specify the same modelName, the one with the oldest
+	// creation timestamp is retained, and the incoming InferenceModel's Ready status
+	// is set to false with a corresponding reason.
+	// In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
+	// Names can be reserved without an underlying model configured in the pool.
 	// This can be done by specifying a target model and setting the weight to zero,
 	// an error will be returned specifying that no valid target model is found.
 	//
-	// +kubebuilder:validation:MaxLength=253
+	// +kubebuilder:validation:MaxLength=256
 	// +kubebuilder:validation:Required
 	ModelName string `json:"modelName"`
 
 	// Criticality defines how important it is to serve the model compared to other models referencing the same pool.
+	// Criticality impacts how traffic is handled in resource-constrained situations. It handles this by
+	// queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will
+	// fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
+	// and the proportionality of fairness will be configurable.
 	//
+	// Default values for this field will not be set, to allow for future additions of new fields that may 'one of' with this field.
+	// Any implementations that may consume this field may treat an unset value as the 'Standard' band.
 	// +optional
-	// +kubebuilder:default="Default"
 	Criticality *Criticality `json:"criticality,omitempty"`
 
 	// TargetModels allow multiple versions of a model for traffic splitting.
@@ -83,6 +89,7 @@ type InferenceModelSpec struct {
 	//
 	// +optional
 	// +kubebuilder:validation:MaxItems=10
+	// +kubebuilder:validation:XValidation:message="Weights should be set for all models, or none of the models.",rule="self.all(model, has(model.weight)) || self.all(model, !has(model.weight))"
 	TargetModels []TargetModel `json:"targetModels,omitempty"`
 
 	// PoolRef is a reference to the inference pool, the pool must exist in the same namespace.
@@ -120,16 +127,19 @@ type PoolObjectReference struct {
 }
 
 // Criticality defines how important it is to serve the model compared to other models.
-// +kubebuilder:validation:Enum=Critical;Default;Sheddable
+// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional (use a pointer), and set no default.
+// This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior.
+// +kubebuilder:validation:Enum=Critical;Standard;Sheddable
 type Criticality string
 
 const (
 	// Critical defines the highest level of criticality. Requests to this band will be shed last.
 	Critical Criticality = "Critical"
 
-	// Default defines the default criticality level and is more important than Sheddable but less
-	// important than Critical. Requests in this band will be shed before critical traffic.
-	Default Criticality = "Default"
+	// Standard defines the base criticality level and is more important than Sheddable but less
+	// important than Critical. Requests in this band will be shed before critical traffic.
+	// Most models are expected to fall within this band.
+	Standard Criticality = "Standard"
 
 	// Sheddable defines the lowest level of criticality. Requests to this band will be shed before
 	// all other bands.
@@ -160,16 +170,13 @@ type TargetModel struct {
 	// implementation supports. Weight is not a percentage and the sum of
 	// weights does not need to equal 100.
 	//
-	// If only one model is specified and it has a weight greater than 0, 100%
-	// of the traffic is forwarded to that model. If weight is set to 0, no
-	// traffic should be forwarded for this model. If unspecified, weight
-	// defaults to 1.
+	// If a weight is set for any targetModel, it must be set for all targetModels.
+	// Conversely, weights are optional so long as no targetModel specifies a weight.
 	//
 	// +optional
-	// +kubebuilder:default=1
 	// +kubebuilder:validation:Minimum=0
 	// +kubebuilder:validation:Maximum=1000000
-	Weight int32 `json:"weight,omitempty"`
+	Weight *int32 `json:"weight,omitempty"`
 }
 
 // InferenceModelStatus defines the observed state of InferenceModel
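Since `Criticality` is now an un-defaulted pointer, every consumer has to decide what nil means. A minimal sketch of the "treat an unset value as the 'Standard' band" reading suggested above — the local types and helper name are illustrative only, not part of this PR:

```golang
package main

import "fmt"

// Criticality mirrors the API enum; an unset field is a nil pointer.
type Criticality string

const (
	Critical  Criticality = "Critical"
	Standard  Criticality = "Standard"
	Sheddable Criticality = "Sheddable"
)

// effectiveCriticality is a hypothetical consumer-side helper: per the field
// comment, implementations may treat a nil (unset) value as Standard.
func effectiveCriticality(c *Criticality) Criticality {
	if c == nil {
		return Standard
	}
	return *c
}

func main() {
	var unset *Criticality
	crit := Critical
	fmt.Println(effectiveCriticality(unset)) // Standard
	fmt.Println(effectiveCriticality(&crit)) // Critical
}
```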
diff --git a/api/v1alpha1/inferencepool_types.go b/api/v1alpha1/inferencepool_types.go
index a9e15d311..d89b8df53 100644
--- a/api/v1alpha1/inferencepool_types.go
+++ b/api/v1alpha1/inferencepool_types.go
@@ -44,11 +44,9 @@ type InferencePoolList struct {
 
 // InferencePoolSpec defines the desired state of InferencePool
 type InferencePoolSpec struct {
-	// Selector defines a map of label to watch model server pods
-	// that should be included in the InferencePool. ModelServers should not
-	// be with any other Service or InferencePool, that behavior is not supported
-	// and will result in sub-optimal utilization.
-	// In some cases, implementations may translate this to a Service selector, so this matches the simple
+	// Selector defines a map of labels to watch model server pods
+	// that should be included in the InferencePool.
+	// In some cases, implementations may translate this field to a Service selector, so this matches the simple
 	// map used for Service selectors instead of the full Kubernetes LabelSelector type.
 	//
 	// +kubebuilder:validation:Required
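For context on the "translate this field to a Service selector" remark: because the field keeps the simple map form, an implementation can feed it straight into the standard label-matching helpers. A sketch, assuming the selector surfaces as a plain `map[string]string` (the real field may use typed key/value aliases):

```golang
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/labels"
)

func main() {
	// The InferencePool selector, in the simple map form used by Services.
	poolSelector := map[string]string{"app": "vllm", "pool": "base-model"}

	// Labels on a candidate model server pod.
	podLabels := map[string]string{"app": "vllm", "pool": "base-model", "zone": "a"}

	// SelectorFromSet builds the same matcher a Service selector would use.
	sel := labels.SelectorFromSet(labels.Set(poolSelector))
	fmt.Println(sel.Matches(labels.Set(podLabels))) // true
}
```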
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
index 861f79015..27fe7579d 100644
--- a/api/v1alpha1/zz_generated.deepcopy.go
+++ b/api/v1alpha1/zz_generated.deepcopy.go
@@ -95,7 +95,9 @@ func (in *InferenceModelSpec) DeepCopyInto(out *InferenceModelSpec) {
 	if in.TargetModels != nil {
 		in, out := &in.TargetModels, &out.TargetModels
 		*out = make([]TargetModel, len(*in))
-		copy(*out, *in)
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
 	}
 	out.PoolRef = in.PoolRef
 }
@@ -253,6 +255,11 @@ func (in *PoolObjectReference) DeepCopy() *PoolObjectReference {
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *TargetModel) DeepCopyInto(out *TargetModel) {
 	*out = *in
+	if in.Weight != nil {
+		in, out := &in.Weight, &out.Weight
+		*out = new(int32)
+		**out = **in
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TargetModel.
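The regenerated switch from `copy(*out, *in)` to an element-wise `DeepCopyInto` is load-bearing: once `Weight` is a pointer, a shallow slice copy would alias the same int32 between the original and the copy. A self-contained sketch of the failure mode (simplified local types, not the generated code):

```golang
package main

import "fmt"

type TargetModel struct {
	Name   string
	Weight *int32
}

func main() {
	w := int32(50)
	in := []TargetModel{{Name: "canary", Weight: &w}}

	// Shallow copy: the structs are copied, but both slices still point
	// at the same int32 behind Weight.
	out := make([]TargetModel, len(in))
	copy(out, in)

	*in[0].Weight = 100
	fmt.Println(*out[0].Weight) // 100 — the "copy" was mutated too

	// Deep copy: allocate a fresh int32 per element, as the regenerated
	// DeepCopyInto now does.
	deep := make([]TargetModel, len(in))
	for i := range in {
		deep[i] = in[i]
		if in[i].Weight != nil {
			v := *in[i].Weight
			deep[i].Weight = &v
		}
	}
	*in[0].Weight = 1
	fmt.Println(*deep[0].Weight) // 100 — unaffected
}
```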
diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml
index ffdceddb1..bca196059 100644
--- a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml
+++ b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml
@@ -55,25 +55,32 @@ spec:
             condition, one will be selected at random.
           properties:
             criticality:
-              default: Default
-              description: Criticality defines how important it is to serve the
-                model compared to other models referencing the same pool.
+              description: |-
+                Criticality defines how important it is to serve the model compared to other models referencing the same pool.
+                Criticality impacts how traffic is handled in resource-constrained situations. It handles this by
+                queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will
+                fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
+                and the proportionality of fairness will be configurable.
+
+                Default values for this field will not be set, to allow for future additions of new fields that may 'one of' with this field.
+                Any implementations that may consume this field may treat an unset value as the 'Standard' band.
               enum:
               - Critical
-              - Default
+              - Standard
               - Sheddable
               type: string
             modelName:
               description: |-
-                ModelName is the name of the model as the users set in the "model" parameter in the requests.
-                The name should be unique among the workloads that reference the same backend pool.
-                This is the parameter that will be used to match the request with. In the future, we may
-                allow to match on other request parameters. The other approach to support matching
-                on other request parameters is to use a different ModelName per HTTPFilter.
-                Names can be reserved without implementing an actual model in the pool.
+                ModelName is the name of the model as it will be set in the "model" parameter for an incoming request.
+                ModelNames must be unique for a referencing InferencePool
+                (names can be reused for a different pool in the same cluster).
+                If multiple InferenceModels specify the same modelName, the one with the oldest
+                creation timestamp is retained, and the incoming InferenceModel's Ready status
+                is set to false with a corresponding reason.
+                In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
+                Names can be reserved without an underlying model configured in the pool.
                 This can be done by specifying a target model and setting the weight to zero,
                 an error will be returned specifying that no valid target model is found.
-              maxLength: 253
+              maxLength: 256
               type: string
             poolRef:
               description: PoolRef is a reference to the inference pool, the pool
@@ -121,7 +128,6 @@ spec:
                     maxLength: 253
                     type: string
                   weight:
-                    default: 1
                     description: |-
                       Weight is used to determine the proportion of traffic that should be
                       sent to this model when multiple target models are specified.
@@ -133,10 +139,8 @@ spec:
                       implementation supports. Weight is not a percentage and the sum of
                       weights does not need to equal 100.
 
-                      If only one model is specified and it has a weight greater than 0, 100%
-                      of the traffic is forwarded to that model. If weight is set to 0, no
-                      traffic should be forwarded for this model. If unspecified, weight
-                      defaults to 1.
+                      If a weight is set for any targetModel, it must be set for all targetModels.
+                      Conversely, weights are optional so long as no targetModel specifies a weight.
                     format: int32
                     maximum: 1000000
                     minimum: 0
@@ -146,6 +150,9 @@ spec:
                 type: object
               maxItems: 10
               type: array
+              x-kubernetes-validations:
+              - message: Weights should be set for all models, or none of the models.
+                rule: self.all(model, has(model.weight)) || self.all(model, !has(model.weight))
             required:
             - modelName
             - poolRef
diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml
index de5f40bc4..8e0ff54d2 100644
--- a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml
+++ b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml
@@ -58,11 +58,9 @@ spec:
                 pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$
                 type: string
               description: |-
-                Selector defines a map of label to watch model server pods
-                that should be included in the InferencePool. ModelServers should not
-                be with any other Service or InferencePool, that behavior is not supported
-                and will result in sub-optimal utilization.
-                In some cases, implementations may translate this to a Service selector, so this matches the simple
+                Selector defines a map of labels to watch model server pods
+                that should be included in the InferencePool.
+                In some cases, implementations may translate this field to a Service selector, so this matches the simple
                 map used for Service selectors instead of the full Kubernetes LabelSelector type.
               type: object
             targetPortNumber:
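The CEL rule added to both the Go types and the generated CRD is the all-or-none weight constraint. As a reading aid, here is the same predicate expressed in Go — a sketch for illustration, not code from this PR:

```golang
package main

import "fmt"

type TargetModel struct {
	Name   string
	Weight *int32
}

// weightsAllOrNone mirrors the CEL rule:
//   self.all(model, has(model.weight)) || self.all(model, !has(model.weight))
func weightsAllOrNone(models []TargetModel) bool {
	allSet, noneSet := true, true
	for _, m := range models {
		if m.Weight == nil {
			allSet = false
		} else {
			noneSet = false
		}
	}
	return allSet || noneSet
}

func main() {
	w := int32(50)
	fmt.Println(weightsAllOrNone([]TargetModel{{Name: "canary", Weight: &w}, {Name: "v1", Weight: &w}})) // true
	fmt.Println(weightsAllOrNone([]TargetModel{{Name: "canary", Weight: &w}, {Name: "v1"}}))             // false
	fmt.Println(weightsAllOrNone([]TargetModel{{Name: "canary"}, {Name: "v1"}}))                         // true
}
```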
diff --git a/docs/proposals/002-api-proposal/proposal.md b/docs/proposals/002-api-proposal/proposal.md
index 8f406d867..f6d0c9e70 100644
--- a/docs/proposals/002-api-proposal/proposal.md
+++ b/docs/proposals/002-api-proposal/proposal.md
@@ -78,6 +78,7 @@ The API design is based on these axioms:
 - This solution should be composable with other Gateway solutions and flexible to fit customer needs
 - The MVP will heavily assume requests are done using the OpenAI spec, but open to extension in the future
 - The Gateway should route in a way that does not generate a queue of requests at the model server level
+- Model serving differs from web serving in critical ways. One of these is the existence of multiple models for the same service, which can materially impact behavior depending on the model served, as opposed to a web service, which has mechanisms to render implementation changes invisible to an end user
 
 The [PoC](https://youtu.be/NUBZg_uqqXk?si=v681EeYdGUGEVqQQ&t=1458) was focused on lower-level scheduling.
 And the API follows that similar logic, which lead to the proposal of the **InferencePool**.
@@ -126,9 +127,7 @@ type InferencePool struct {
 
 type InferencePoolSpec struct {
 	// ModelServerSelector uses label selection to watch model server pods
-	// that should be included in the InferencePool. ModelServers should not
-	// be with any other Service or InferencePool, that behavior is not supported
-	// and will result in sub-optimal utilization.
+	// that should be included in the InferencePool.
 	ModelServerSelector map[string]string `json:"modelServerSelector,omitempty"`
 }
 ```
@@ -136,17 +135,13 @@ type InferencePoolSpec struct {
 **InferenceModel**
 ```golang
 // InferenceModel represents a set of Models/Adapters that are multiplexed onto one
-// or more Inferencepools. This resource is managed by the "Inference Workload Owner"
+// or more InferencePools. This resource is managed by the "Inference Workload Owner"
 // persona. The Inference Workload Owner persona is: a team that trains, verifies, and
 // leverages a large language model from a model frontend, drives the lifecycle
 // and rollout of new versions of those models, and defines the specific
 // performance and latency goals for the model. These workloads are
-// expected to operate within an InferencePool sharing compute capacity with other
-// InferenceModels, defined by the Inference Platform Admin. We allow a user who
-// has multiple InferenceModels across multiple pools (with the same config) to
-// specify the configuration exactly once, and deploy to many pools
-// simultaneously. Enabling a simpler config and single source of truth
-// for a given user. InferenceModel ModelNames are unique for a given InferencePool,
+// expected to coexist within an InferencePool, sharing compute capacity with other
+// InferenceModels, with sharing limitations defined by the Inference Platform Admin.
 type InferenceModel struct {
 	metav1.ObjectMeta
 	metav1.TypeMeta
@@ -155,21 +150,26 @@ type InferenceModel struct {
 }
 
 type InferenceModelSpec struct {
-	// The name of the model as the users set in the "model" parameter in the requests.
-	// The name should be unique among the workloads that reference the same backend pool.
-	// This is the parameter that will be used to match the request with. In the future, we may
-	// allow to match on other request parameters. The other approach to support matching on
-	// on other request parameters is to use a different ModelName per HTTPFilter.
-	// Names can be reserved without implementing an actual model in the pool.
+	// The name of the model as it will be set in the "model" parameter for an incoming request.
+	// ModelNames are expected to be unique for a specific InferencePool
+	// (names can be reused for a different pool in the same cluster).
+	// If multiple InferenceModels specify the same modelName, the one with the oldest
+	// creation timestamp is retained, and the incoming InferenceModel's Ready status
+	// is set to false with a corresponding reason.
+	// In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
+	// Names can be reserved without an underlying model configured in the pool.
 	// This can be done by specifying a target model and setting the weight to zero,
 	// an error will be returned specifying that no valid target model is found.
 	ModelName string
 	// Optional
 	// Defines how important it is to serve the model compared to other models referencing the same pool.
+	// Criticality impacts how traffic is handled in resource-constrained situations. It handles this by
+	// queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will
+	// fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
+	// and the proportionality of fairness will be configurable.
 	Criticality *Criticality
 	// Optional.
-	// Allow multiple versions of a model for traffic splitting.
-	// If not specified, the target model name is defaulted to the ModelName parameter.
+	// Allow multiple versions of a model for traffic splitting.
+	// If not specified, the target model name is defaulted to the ModelName parameter.
 	// ModelName is often in reference to a LoRA adapter.
 	TargetModels []TargetModel
 	// Reference to the InferencePool that the model registers to. It must exist in the same namespace.
@@ -177,6 +177,8 @@ type InferenceModelSpec struct {
 }
 
 // Defines how important it is to serve the model compared to other models.
+// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field should ALWAYS be optional (use a pointer), and set no default.
+// This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior.
 type Criticality string
 const (
 	// Most important. Requests to this band will be shed last.
@@ -200,7 +202,7 @@ type TargetModel struct {
 	Name string
 	// Weight is used to determine the percentage of traffic that should be
 	// sent to this target model when multiple versions of the model are specified.
-	Weight int
+	Weight *int
 }
 
 // LocalObjectReference identifies an API object within the namespace of the
diff --git a/pkg/ext-proc/backend/datastore.go b/pkg/ext-proc/backend/datastore.go
index 70f000b87..569fb1bdc 100644
--- a/pkg/ext-proc/backend/datastore.go
+++ b/pkg/ext-proc/backend/datastore.go
@@ -91,15 +91,15 @@ func RandomWeightedDraw(model *v1alpha1.InferenceModel, seed int64) string {
 	}
 	r := rand.New(source)
 	for _, model := range model.Spec.TargetModels {
-		weights += model.Weight
+		weights += *model.Weight
 	}
 	klog.V(3).Infof("Weights for Model(%v) total to: %v", model.Name, weights)
 	randomVal := r.Int31n(weights)
 	for _, model := range model.Spec.TargetModels {
-		if randomVal < model.Weight {
+		if randomVal < *model.Weight {
 			return model.Name
 		}
-		randomVal -= model.Weight
+		randomVal -= *model.Weight
 	}
 	return ""
 }
diff --git a/pkg/ext-proc/backend/datastore_test.go b/pkg/ext-proc/backend/datastore_test.go
index d4ad48e17..323b3bb00 100644
--- a/pkg/ext-proc/backend/datastore_test.go
+++ b/pkg/ext-proc/backend/datastore_test.go
@@ -58,11 +58,11 @@ func TestRandomWeightedDraw(t *testing.T) {
 			TargetModels: []v1alpha1.TargetModel{
 				{
 					Name:   "canary",
-					Weight: 50,
+					Weight: pointer(50),
 				},
 				{
 					Name:   "v1",
-					Weight: 50,
+					Weight: pointer(50),
 				},
 			},
 		},
@@ -76,15 +76,15 @@ func TestRandomWeightedDraw(t *testing.T) {
 			TargetModels: []v1alpha1.TargetModel{
 				{
 					Name:   "canary",
-					Weight: 25,
+					Weight: pointer(25),
 				},
 				{
 					Name:   "v1.1",
-					Weight: 55,
+					Weight: pointer(55),
 				},
 				{
 					Name:   "v1",
-					Weight: 50,
+					Weight: pointer(50),
 				},
 			},
 		},
@@ -98,15 +98,15 @@ func TestRandomWeightedDraw(t *testing.T) {
 			TargetModels: []v1alpha1.TargetModel{
 				{
 					Name:   "canary",
-					Weight: 20,
+					Weight: pointer(20),
 				},
 				{
 					Name:   "v1.1",
-					Weight: 20,
+					Weight: pointer(20),
 				},
 				{
 					Name:   "v1",
-					Weight: 10,
+					Weight: pointer(10),
 				},
 			},
 		},
@@ -127,3 +127,7 @@ func TestRandomWeightedDraw(t *testing.T) {
 		})
 	}
 }
+
+func pointer(v int32) *int32 {
+	return &v
+}
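One observation on `RandomWeightedDraw` above: the new CEL rule permits the all-unset case, in which every `model.Weight` is nil and the unconditional dereference would panic. If unweighted target-model lists can reach this code path, a guard along these lines might be needed — a sketch only, where the even-split fallback is an assumed convention, not behavior defined by this PR:

```golang
package main

import (
	"fmt"
	"math/rand"
)

type TargetModel struct {
	Name   string
	Weight *int32
}

// randomDrawNilSafe is a hypothetical nil-safe variant of RandomWeightedDraw.
// When no target model carries a weight (the all-unset case the CEL rule
// allows), it falls back to an even split — an assumed convention here.
func randomDrawNilSafe(models []TargetModel, r *rand.Rand) string {
	var total int32
	for _, m := range models {
		if m.Weight == nil {
			// All-or-none is enforced by validation, so one nil means all
			// are nil: pick uniformly at random.
			return models[r.Intn(len(models))].Name
		}
		total += *m.Weight
	}
	randomVal := r.Int31n(total)
	for _, m := range models {
		if randomVal < *m.Weight {
			return m.Name
		}
		randomVal -= *m.Weight
	}
	return ""
}

func main() {
	r := rand.New(rand.NewSource(420))
	unweighted := []TargetModel{{Name: "canary"}, {Name: "v1"}}
	fmt.Println(randomDrawNilSafe(unweighted, r))
}
```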
diff --git a/pkg/ext-proc/test/hermetic_test.go b/pkg/ext-proc/test/hermetic_test.go
index d98031ee7..acbd74a94 100644
--- a/pkg/ext-proc/test/hermetic_test.go
+++ b/pkg/ext-proc/test/hermetic_test.go
@@ -41,7 +41,7 @@ func TestHandleRequestBody(t *testing.T) {
 				TargetModels: []v1alpha1.TargetModel{
 					{
 						Name:   "my-model-v1",
-						Weight: 100,
+						Weight: pointer(100),
 					},
 				},
 			},
@@ -172,3 +172,7 @@ func sendRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient,
 	t.Logf("Received request %+v", res)
 	return res, err
 }
+
+func pointer(v int32) *int32 {
+	return &v
+}
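Minor test-hygiene note: `pointer(v int32) *int32` is now duplicated in two test packages. If more pointer-typed API fields follow, a single generic helper (or an existing one such as `ptr.To` from k8s.io/utils, if that dependency is acceptable) could replace both copies — a sketch, with the package name being hypothetical:

```golang
// Package testutil is a hypothetical shared home for small test helpers.
package testutil

// Ptr returns a pointer to v; it covers int32 weights today and any
// future pointer-typed API fields.
func Ptr[T any](v T) *T {
	return &v
}
```

Call sites would then read `Weight: testutil.Ptr(int32(100))`, or `testutil.Ptr[int32](100)` with an explicit type argument.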