From 4d0a1e78a7b6c8d2bbae20c7a730220348496cbd Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Mon, 3 Jul 2023 14:59:18 +0200 Subject: [PATCH 01/10] api: Generate CRDs from APIs --- Makefile | 29 +- config/crd/bases/ibm.com_quotasubtree-v1.yaml | 62 ---- config/crd/bases/ibm.com_quotasubtrees.yaml | 122 +++++++ .../crd/bases/mcad.ibm.com_appwrappers.yaml | 304 ++++++++++-------- config/crd/bases/mcad.ibm.com_queuejobs.yaml | 184 +++++++---- .../bases/mcad.ibm.com_schedulingspecs.yaml | 156 ++++++--- .../crds/ibm.com_quotasubtree-v1.yaml | 62 ---- .../crds/ibm.com_quotasubtrees.yaml | 122 +++++++ .../crds/mcad.ibm.com_appwrappers.yaml | 302 +++++++++-------- .../crds/mcad.ibm.com_queuejobs.yaml | 184 +++++++---- .../crds/mcad.ibm.com_schedulingspecs.yaml | 156 ++++++--- pkg/apis/controller/v1beta1/appwrapper.go | 56 ++-- pkg/apis/controller/v1beta1/doc.go | 19 +- pkg/apis/quotaplugins/quotasubtree/v1/doc.go | 19 ++ 14 files changed, 1116 insertions(+), 661 deletions(-) delete mode 100644 config/crd/bases/ibm.com_quotasubtree-v1.yaml create mode 100644 config/crd/bases/ibm.com_quotasubtrees.yaml delete mode 100644 deployment/mcad-controller/crds/ibm.com_quotasubtree-v1.yaml create mode 100644 deployment/mcad-controller/crds/ibm.com_quotasubtrees.yaml diff --git a/Makefile b/Makefile index 68cd689cb..5b6f84d94 100644 --- a/Makefile +++ b/Makefile @@ -7,6 +7,17 @@ GIT_BRANCH:=$(shell git symbolic-ref --short HEAD 2>&1 | grep -v fatal) #define the GO_BUILD_ARGS if you need to pass additional arguments to the go build GO_BUILD_ARGS?= +## Location to install dependencies to +LOCALBIN ?= $(shell pwd)/bin +$(LOCALBIN): + mkdir -p $(LOCALBIN) + +## Tool Versions +CONTROLLER_TOOLS_VERSION ?= v0.9.2 + +## Tool Binaries +CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen + # Reset branch name if this a Travis CI environment ifneq ($(strip $(TRAVIS_BRANCH)),) GIT_BRANCH:=${TRAVIS_BRANCH} @@ -35,7 +46,7 @@ endif .PHONY: print-global-variables -# Build the controler executable 
for use in docker image build +# Build the controller executable for use in docker image build mcad-controller: init generate-code ifeq ($(strip $(GO_BUILD_ARGS)),) $(info Compiling controller) @@ -67,6 +78,15 @@ verify-tag-name: print-global-variables # Check for invalid tag name t=${TAG} && [ $${#t} -le 128 ] || { echo "Target name $$t has 128 or more chars"; false; } +.PHONY: controller-gen +controller-gen: $(CONTROLLER_GEN) ## Download controller-gen locally if necessary. +$(CONTROLLER_GEN): $(LOCALBIN) + test -s $(LOCALBIN)/controller-gen || GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_TOOLS_VERSION) + +.PHONY: manifests +manifests: controller-gen ## Generate CustomResourceDefinition objects. + $(CONTROLLER_GEN) crd:allowDangerousTypes=true paths="./pkg/apis/..." output:crd:artifacts:config=config/crd/bases + generate-code: pkg/apis/controller/v1beta1/zz_generated.deepcopy.go pkg/apis/controller/v1beta1/zz_generated.deepcopy.go: ${BIN_DIR}/deepcopy-gen @@ -141,18 +161,17 @@ clean: #CRD file maintenance rules DEPLOYMENT_CRD_DIR=deployment/mcad-controller/crds CRD_BASE_DIR=config/crd/bases -MCAD_CRDS= ${DEPLOYMENT_CRD_DIR}/ibm.com_quotasubtree-v1.yaml \ +MCAD_CRDS= ${DEPLOYMENT_CRD_DIR}/ibm.com_quotasubtrees.yaml \ ${DEPLOYMENT_CRD_DIR}/mcad.ibm.com_appwrappers.yaml \ ${DEPLOYMENT_CRD_DIR}/mcad.ibm.com_queuejobs.yaml \ ${DEPLOYMENT_CRD_DIR}/mcad.ibm.com_schedulingspecs.yaml update-deployment-crds: ${MCAD_CRDS} -${DEPLOYMENT_CRD_DIR}/mcad.ibm.com_schedulingspecs.yaml : ${CRD_BASE_DIR}/mcad.ibm.com_schedulingspecs.yaml +${DEPLOYMENT_CRD_DIR}/ibm.com_quotasubtrees.yaml : ${CRD_BASE_DIR}/ibm.com_quotasubtrees.yaml ${DEPLOYMENT_CRD_DIR}/mcad.ibm.com_appwrappers.yaml : ${CRD_BASE_DIR}/mcad.ibm.com_appwrappers.yaml ${DEPLOYMENT_CRD_DIR}/mcad.ibm.com_queuejobs.yaml : ${CRD_BASE_DIR}/mcad.ibm.com_queuejobs.yaml ${DEPLOYMENT_CRD_DIR}/mcad.ibm.com_schedulingspecs.yaml : ${CRD_BASE_DIR}/mcad.ibm.com_schedulingspecs.yaml - 
$(DEPLOYMENT_CRD_DIR)/%: ${CRD_BASE_DIR}/% - cp $< $@ \ No newline at end of file + cp $< $@ diff --git a/config/crd/bases/ibm.com_quotasubtree-v1.yaml b/config/crd/bases/ibm.com_quotasubtree-v1.yaml deleted file mode 100644 index 38bd14197..000000000 --- a/config/crd/bases/ibm.com_quotasubtree-v1.yaml +++ /dev/null @@ -1,62 +0,0 @@ -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: quotasubtrees.ibm.com - finalizers: [] -spec: - group: ibm.com - scope: Namespaced - names: - kind: QuotaSubtree - singular: quotasubtree - plural: quotasubtrees - shortNames: - - qst - versions: - - name: v1 - served: true - storage: true - subresources: - status: {} - schema: - openAPIV3Schema: - type: object - properties: - status: - type: object - x-kubernetes-preserve-unknown-fields: true - spec: - type: object - properties: - parent: - type: string - parentNamespace: - type: string - children: - type: array - items: - type: object - properties: - name: - type: string - namespace: - type: string - quotas: - type: object - properties: - disabled: - type: boolean - hardLimit: - type: boolean - requests: - type: object - properties: - cpu: - x-kubernetes-int-or-string: true - pattern: '^[0-9]*(m)?$' - memory: - x-kubernetes-int-or-string: true - pattern: '^[0-9]*(Ei|Pi|Ti|Gi|Mi|Ki|E|P|T|G|M|K)?$' - nvidia.com/gpu: - x-kubernetes-int-or-string: true - pattern: '^[0-9]?$' diff --git a/config/crd/bases/ibm.com_quotasubtrees.yaml b/config/crd/bases/ibm.com_quotasubtrees.yaml new file mode 100644 index 000000000..a410ab505 --- /dev/null +++ b/config/crd/bases/ibm.com_quotasubtrees.yaml @@ -0,0 +1,122 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.9.2 + creationTimestamp: null + name: quotasubtrees.ibm.com +spec: + group: ibm.com + names: + kind: QuotaSubtree + listKind: QuotaSubtreeList + plural: quotasubtrees + singular: quotasubtree + scope: Namespaced 
+ versions: + - name: v1 + schema: + openAPIV3Schema: + description: QuotaSubtree is a specification for a quota subtree resource + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: QuotaSubtreeSpec is the spec for a resource plan + properties: + children: + items: + description: Child is the spec for a QuotaSubtree resource + properties: + name: + type: string + namespace: + type: string + path: + type: string + quotas: + description: Quota is the spec for a QuotaSubtree resource + properties: + disabled: + type: boolean + hardLimit: + type: boolean + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + type: object + type: object + type: array + parent: + type: string + parentNamespace: + type: string + type: object + status: + description: QuotaSubtreeStatus is the status for a QuotaSubtree resource + properties: + children: + items: + description: ResourceAllocation is the spec for the child status + properties: + allocated: + description: ResourceAllocationStatus is the spec for the child + resource usage + properties: + requests: + additionalProperties: + type: string + type: object + 
type: object + name: + type: string + namespace: + type: string + path: + type: string + type: object + type: array + totalAllocation: + description: ResourceAllocation is the spec for the child status + properties: + allocated: + description: ResourceAllocationStatus is the spec for the child + resource usage + properties: + requests: + additionalProperties: + type: string + type: object + type: object + name: + type: string + namespace: + type: string + path: + type: string + type: object + required: + - children + - totalAllocation + type: object + required: + - spec + type: object + served: true + storage: true diff --git a/config/crd/bases/mcad.ibm.com_appwrappers.yaml b/config/crd/bases/mcad.ibm.com_appwrappers.yaml index 0f7598754..4cae0914f 100644 --- a/config/crd/bases/mcad.ibm.com_appwrappers.yaml +++ b/config/crd/bases/mcad.ibm.com_appwrappers.yaml @@ -1,10 +1,10 @@ - --- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.4.1 + controller-gen.kubebuilder.io/version: v0.9.2 + creationTimestamp: null name: appwrappers.mcad.ibm.com spec: group: mcad.ibm.com @@ -53,22 +53,10 @@ spec: resource type format: int32 type: integer - apiVersion: - description: 'APIVersion defines the versioned schema of - this representation of an object. Servers should convert - recognized schemas to the latest internal value, and may - reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' - type: string completionstatus: - description: Optional field that drives completion status of appwrapper. - This field within an item of an appwrapper determines the full state of appwrapper. - The completionstatus field contains a list of conditions that make the associate item considered - completed, for instance :- completion conditions could be "Complete" or "Failed". 
- The associated item's level .status.conditions[].type field is monitored for any one of these conditions. Once all items with this - option is set and the conditionstatus is met the entire appwrapper state will be changed to one of the valid appwrapper completion state. Note :- this is an AND - operation for all items where this option is set. See the list of appwrapper states for a list of valid complete states. + description: Optional field for users to determine completion + status of item type: string - custompodresources: description: Optional section that specifies resource requirements for non-standard k8s resources, follows same format as @@ -109,14 +97,6 @@ spec: type: object x-kubernetes-embedded-resource: true x-kubernetes-preserve-unknown-fields: true - kind: - description: 'Kind is a string value representing the REST - resource this object represents. Servers may infer this - from the endpoint the client submits requests to. Cannot - be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - metadata: - type: object minavailable: description: The minimal available pods to run for this AppWrapper; the default value is nil @@ -124,8 +104,8 @@ spec: type: integer priority: description: The priority of this resource - format: int32 - type: integer + format: float + type: number priorityslope: description: The increasing rate of priority value for this resource @@ -140,36 +120,21 @@ spec: Items: items: description: 'AppWrapperResource is App Wrapper aggregation - resource todo: To be depricated' + resource TODO: To be deprecated' properties: allocatedreplicas: description: The number of allocated replicas from this resource type format: int32 type: integer - apiVersion: - description: 'APIVersion defines the versioned schema of - this representation of an object. 
Servers should convert - recognized schemas to the latest internal value, and may - reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' - type: string - kind: - description: 'Kind is a string value representing the REST - resource this object represents. Servers may infer this - from the endpoint the client submits requests to. Cannot - be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - metadata: - type: object minavailable: description: The minimal available pods to run for this AppWrapper; the default value is nil format: int32 type: integer priority: - description: The priority of this resource - format: int32 - type: integer + format: float + type: number priorityslope: description: The increasing rate of priority value for this resource @@ -190,6 +155,7 @@ spec: Pod, a ReplicaSet, a ... ?) type: string required: + - priorityslope - template type: object type: array @@ -252,67 +218,130 @@ spec: field is planned to be removed in 1.21 release." type: string type: object + required: + - metadata type: object schedulingSpec: - description: Field 'schedulingSpec' specifies the parameters used for scheduling generic items - wrapped inside AppWrappers. It defines the policy for requeuing jobs based on the number - of running pods. + description: SchedSpec specifies the parameters for scheduling. properties: - minAvailable: - description: Expected number of pods in running and/or completed state. Requeuing is triggered - when the number of running/completed pods is not equal to this value. When not specified, - requeuing is disabled and no check is performed. - type: integer - requeuing: - description: Specification of the requeuing strategy based on waiting time. Values in this field - control how often the pod check should happen and if requeuing has reached its maximum number of - times. 
+ clusterScheduling: properties: - initialTimeInSeconds: - description: Value to keep track of the initial wait time. Users cannot set this as it is - taken from 'timeInSeconds'. - type: integer - timeInSeconds: - description: Initial waiting time before requeuing conditions are checked. This value is - specified by the user, but it may grow as requeuing events happen. - type: integer - default: 300 - maxTimeInSeconds: - description: Maximum waiting time for requeuing checks - type: integer - default: 0 - growthType: - description: Growth strategy to increase the waiting time between requeuing checks. The values - available are 'exponential', 'linear', or 'none'. For example, 'exponential' growth would - double the 'timeInSeconds' value every time a requeuing event is triggered. If the string value - is misspelled or not one of the possible options, the growth behavior is defaulted to 'none'. - type: string - default: "exponential" - numRequeuings: - description: Field to keep track of how many times a requeuing event has been triggered - type: integer - default: 0 - maxNumRequeuings: - description: Maximum number of requeuing events allowed. Once this value is reached (e.g., - 'numRequeuings = maxNumRequeuings', no more requeuing checks are performed and the generic - items are stopped and removed from the cluster (AppWrapper remains deployed). - type: integer - default: 0 + clusterSelector: + description: A label selector is a label query over a set + of resources. The result of matchLabels and matchExpressions + are ANDed. An empty label selector matches all objects. + A null label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. 
+ properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, NotIn, + Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values array + must be non-empty. If the operator is Exists or + DoesNotExist, the values array must be empty. + This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field is + "key", the operator is "In", and the values array contains + only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + clusters: + items: + properties: + name: + type: string + required: + - name + type: object + type: array type: object dispatchDuration: - description: Wall clock duration time of appwrapper in seconds properties: expected: - format: int32 type: integer limit: - format: int32 type: integer + overrun: + type: boolean + type: object + dispatchingWindow: + properties: + end: + properties: + desiredTimestamp: + format: date-time + type: string + maxTimestamp: + format: date-time + type: string + minTimestamp: + format: date-time + type: string + type: object + start: + properties: + desiredTimestamp: + format: date-time + type: string + maxTimestamp: + format: date-time + type: string + minTimestamp: + format: date-time + type: string + type: object type: object + minAvailable: + type: integer nodeSelector: additionalProperties: type: string type: object + requeuing: + properties: + growthType: + type: string + initialTimeInSeconds: + type: integer + 
maxNumRequeuings: + type: integer + maxTimeInSeconds: + type: integer + numRequeuings: + type: integer + timeInSeconds: + type: integer + type: object type: object selector: description: A label selector is a label query over a set of resources. @@ -361,6 +390,7 @@ spec: are ANDed. type: object type: object + x-kubernetes-map-type: atomic service: description: AppWrapperService is App Wrapper service definition properties: @@ -730,49 +760,11 @@ spec: - type type: object type: array - pendingpodconditions: - description: Represent conditions of pod(s) that failed scheduling. - items: - description: Describes scheduling failed condition(s) of a pod - properties: - podname: - description: Name of the pod - type: string - conditions: - description: Failed condition(s) of a pod - items: - description: DeploymentCondition describes the state of a deployment - at a certain point. - properties: - LastProbeTime: - description: Last time the condition transitioned from one status - to another. - format: date-time - type: string - LastTransitionTime: - description: The last time this condition was updated. - format: date-time - type: string - message: - description: A human readable message indicating details about - the transition. - type: string - reason: - description: The reason for the condition's last transition. - type: string - status: - description: Status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of appwrapper condition. - type: string - required: - - status - - type - type: object - type: array - type: object - type: array + controllerfirstdispatchtimestamp: + description: Microsecond level timestamp when controller first sets + appwrapper in state Running + format: date-time + type: string controllerfirsttimestamp: description: Microsecond level timestamp when controller first sees QueueJob (by Informer) @@ -811,6 +803,50 @@ spec: description: The number of pending pods. 
format: int32 type: integer + pendingpodconditions: + description: Represents the latest available observations of pods + under appwrapper + items: + properties: + conditions: + items: + description: PodCondition contains details for the current + condition of this pod. + properties: + lastProbeTime: + description: Last time we probed the condition. + format: date-time + type: string + lastTransitionTime: + description: Last time the condition transitioned from + one status to another. + format: date-time + type: string + message: + description: Human-readable message indicating details + about last transition. + type: string + reason: + description: Unique, one-word, CamelCase reason for the + condition's last transition. + type: string + status: + description: 'Status is the status of the condition. Can + be True, False, Unknown. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#pod-conditions' + type: string + type: + description: 'Type is the type of the condition. More + info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#pod-conditions' + type: string + required: + - status + - type + type: object + type: array + podname: + type: string + type: object + type: array queuejobstate: description: State of QueueJob - Init, Queueing, HeadOfLine, Rejoining, ... 
@@ -834,9 +870,13 @@ spec: (is this different from the MinAvailable from JobStatus) format: int32 type: integer + required: + - pendingpodconditions type: object required: - spec type: object served: true storage: true + subresources: + status: {} diff --git a/config/crd/bases/mcad.ibm.com_queuejobs.yaml b/config/crd/bases/mcad.ibm.com_queuejobs.yaml index 6c60855de..22b7b435c 100644 --- a/config/crd/bases/mcad.ibm.com_queuejobs.yaml +++ b/config/crd/bases/mcad.ibm.com_queuejobs.yaml @@ -1,10 +1,10 @@ - --- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.4.1 + controller-gen.kubebuilder.io/version: v0.9.2 + creationTimestamp: null name: queuejobs.mcad.ibm.com spec: group: mcad.ibm.com @@ -33,70 +33,131 @@ spec: type: object spec: description: Specification of the desired behavior of a cron job, including - the minAvailable and the requeuing strategy + the minAvailable properties: schedulerName: type: string schedulingSpec: - description: Field 'schedulingSpec' specifies the parameters used for scheduling generic items - wrapped inside AppWrappers. It defines the policy for requeuing jobs based on the number - of running pods. + description: SchedSpec specifies the parameters for scheduling. properties: - minAvailable: - description: Expected number of pods in running and/or completed state. Requeuing is triggered - when the number of running/completed pods is not equal to this value. When not specified, - requeuing is disabled and no check is performed. - type: integer - requeuing: - description: Specification of the requeuing strategy based on waiting time. Values in this field - control how often the pod check should happen and if requeuing has reached its maximum number of - times. + clusterScheduling: properties: - initialTimeInSeconds: - description: Value to keep track of the initial wait time. Users cannot set this as it is - taken from 'timeInSeconds'. 
- type: integer - timeInSeconds: - description: Initial waiting time before requeuing conditions are checked. This value is - specified by the user, but it may grow as requeuing events happen. - type: integer - default: 300 - maxTimeInSeconds: - description: Maximum waiting time for requeuing checks - type: integer - default: 0 - growthType: - description: Growth strategy to increase the waiting time between requeuing checks. The values - available are 'exponential', 'linear', or 'none'. For example, 'exponential' growth would - double the 'timeInSeconds' value every time a requeuing event is triggered. If the string value - is misspelled or not one of the possible options, the growth behavior is defaulted to 'none'. - type: string - default: "exponential" - numRequeuings: - description: Field to keep track of how many times a requeuing event has been triggered - type: integer - default: 0 - maxNumRequeuings: - description: Maximum number of requeuing events allowed. Once this value is reached (e.g., - 'numRequeuings = maxNumRequeuings', no more requeuing checks are performed and the generic - items are stopped and removed from the cluster (AppWrapper remains deployed). - type: integer - default: 0 + clusterSelector: + description: A label selector is a label query over a set + of resources. The result of matchLabels and matchExpressions + are ANDed. An empty label selector matches all objects. + A null label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. 
Valid operators are In, NotIn, + Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values array + must be non-empty. If the operator is Exists or + DoesNotExist, the values array must be empty. + This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field is + "key", the operator is "In", and the values array contains + only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + clusters: + items: + properties: + name: + type: string + required: + - name + type: object + type: array type: object dispatchDuration: - description: Wall clock duration time of appwrapper in seconds properties: expected: - format: int32 type: integer limit: - format: int32 type: integer + overrun: + type: boolean type: object + dispatchingWindow: + properties: + end: + properties: + desiredTimestamp: + format: date-time + type: string + maxTimestamp: + format: date-time + type: string + minTimestamp: + format: date-time + type: string + type: object + start: + properties: + desiredTimestamp: + format: date-time + type: string + maxTimestamp: + format: date-time + type: string + minTimestamp: + format: date-time + type: string + type: object + type: object + minAvailable: + type: integer nodeSelector: additionalProperties: type: string type: object + requeuing: + properties: + growthType: + type: string + initialTimeInSeconds: + type: integer + maxNumRequeuings: + type: integer + maxTimeInSeconds: + type: integer + numRequeuings: + type: integer + timeInSeconds: + type: integer + type: object type: object taskSpecs: description: TaskSpecs 
specifies the task specification of QueueJob @@ -153,6 +214,7 @@ spec: only "value". The requirements are ANDed. type: object type: object + x-kubernetes-map-type: atomic template: description: Specifies the pod that will be created for this TaskSpec when executing a QueueJob @@ -497,6 +559,7 @@ spec: are ANDed. type: object type: object + x-kubernetes-map-type: atomic namespaces: description: namespaces specifies which namespaces the labelSelector @@ -612,6 +675,7 @@ spec: ANDed. type: object type: object + x-kubernetes-map-type: atomic namespaces: description: namespaces specifies which namespaces the labelSelector applies @@ -728,6 +792,7 @@ spec: are ANDed. type: object type: object + x-kubernetes-map-type: atomic namespaces: description: namespaces specifies which namespaces the labelSelector @@ -843,6 +908,7 @@ spec: ANDed. type: object type: object + x-kubernetes-map-type: atomic namespaces: description: namespaces specifies which namespaces the labelSelector applies @@ -5110,6 +5176,7 @@ spec: "value". The requirements are ANDed. type: object type: object + x-kubernetes-map-type: atomic maxSkew: description: 'MaxSkew describes the degree to which pods may be unevenly distributed. When @@ -5145,12 +5212,12 @@ spec: spread constraint. - DoNotSchedule (default) tells the scheduler not to schedule it. - ScheduleAnyway tells the scheduler to schedule the pod in any - location, but giving higher precedence to - topologies that would help reduce the skew. - A constraint is considered "Unsatisfiable" for - an incoming pod if and only if every possible - node assigment for that pod would violate "MaxSkew" - on some topology. For example, in a 3-zone cluster, + location, but giving higher precedence to topologies + that would help reduce the skew. A constraint + is considered "Unsatisfiable" for an incoming + pod if and only if every possible node assigment + for that pod would violate "MaxSkew" on some + topology. 
For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same labelSelector spread as 3/1/1: | zone1 | zone2 | zone3 | | P P P | P | P | If WhenUnsatisfiable @@ -5625,12 +5692,12 @@ spec: pod is removed. \n Use this if: a) the volume is only needed while the pod runs, b) features of normal volumes like restoring from snapshot - or capacity tracking are needed, c) the storage + or capacity tracking are needed, c) the storage driver is specified through a storage class, and d) the storage driver supports dynamic volume - provisioning through a PersistentVolumeClaim - (see EphemeralVolumeSource for more information - on the connection between this volume type and + provisioning through a PersistentVolumeClaim + (see EphemeralVolumeSource for more information + on the connection between this volume type and PersistentVolumeClaim). \n Use PersistentVolumeClaim or one of the vendor-specific APIs for volumes that persist for longer than the lifecycle of @@ -5825,6 +5892,7 @@ spec: are ANDed. type: object type: object + x-kubernetes-map-type: atomic storageClassName: description: 'Name of the StorageClass required by the claim. More info: diff --git a/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml b/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml index 3de2af124..9c907409d 100644 --- a/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml +++ b/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml @@ -1,10 +1,10 @@ - --- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.4.1 + controller-gen.kubebuilder.io/version: v0.9.2 + creationTimestamp: null name: schedulingspecs.mcad.ibm.com spec: group: mcad.ibm.com @@ -33,70 +33,126 @@ spec: type: object spec: properties: - minAvailable: - description: Expected number of pods in running and/or completed state. Requeuing is triggered - when the number of running/completed pods is not equal to this value. 
When not specified, - requeuing is disabled and no check is performed. - type: integer - requeuing: - description: Specification of the requeuing strategy based on waiting time. Values in this field - control how often the pod check should happen and if requeuing has reached its maximum number of - times. + clusterScheduling: properties: - initialTimeInSeconds: - description: Value to keep track of the initial wait time. Users cannot set this as it is - taken from 'timeInSeconds'. - type: integer - timeInSeconds: - description: Initial waiting time before requeuing conditions are checked. This value is - specified by the user, but it may grow as requeuing events happen. - type: integer - default: 300 - maxTimeInSeconds: - description: Maximum waiting time for requeuing checks - type: integer - default: 0 - growthType: - description: Growth strategy to increase the waiting time between requeuing checks. The values - available are 'exponential', 'linear', or 'none'. For example, 'exponential' growth would - double the 'timeInSeconds' value every time a requeuing event is triggered. If the string value - is misspelled or not one of the possible options, the growth behavior is defaulted to 'none'. - type: string - default: "exponential" - numRequeuings: - description: Field to keep track of how many times a requeuing event has been triggered - type: integer - default: 0 - maxNumRequeuings: - description: Maximum number of requeuing events allowed. Once this value is reached (e.g., - 'numRequeuings = maxNumRequeuings', no more requeuing checks are performed and the generic - items are stopped and removed from the cluster (AppWrapper remains deployed). - type: integer - default: 0 + clusterSelector: + description: A label selector is a label query over a set of resources. + The result of matchLabels and matchExpressions are ANDed. An + empty label selector matches all objects. A null label selector + matches no objects. 
+ properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that relates + the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, NotIn, + Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. If + the operator is In or NotIn, the values array must + be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced + during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. A + single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field is "key", + the operator is "In", and the values array contains only + "value". The requirements are ANDed. 
+ type: object + type: object + x-kubernetes-map-type: atomic + clusters: + items: + properties: + name: + type: string + required: + - name + type: object + type: array type: object dispatchDuration: - description: Wall clock duration time of appwrapper in seconds properties: expected: - format: int32 type: integer limit: - format: int32 type: integer + overrun: + type: boolean + type: object + dispatchingWindow: + properties: + end: + properties: + desiredTimestamp: + format: date-time + type: string + maxTimestamp: + format: date-time + type: string + minTimestamp: + format: date-time + type: string + type: object + start: + properties: + desiredTimestamp: + format: date-time + type: string + maxTimestamp: + format: date-time + type: string + minTimestamp: + format: date-time + type: string + type: object type: object + minAvailable: + type: integer nodeSelector: additionalProperties: type: string type: object + requeuing: + properties: + growthType: + type: string + initialTimeInSeconds: + type: integer + maxNumRequeuings: + type: integer + maxTimeInSeconds: + type: integer + numRequeuings: + type: integer + timeInSeconds: + type: integer + type: object type: object required: - metadata type: object served: true storage: true -status: - acceptedNames: - kind: "" - plural: "" - conditions: [] - storedVersions: [] diff --git a/deployment/mcad-controller/crds/ibm.com_quotasubtree-v1.yaml b/deployment/mcad-controller/crds/ibm.com_quotasubtree-v1.yaml deleted file mode 100644 index 38bd14197..000000000 --- a/deployment/mcad-controller/crds/ibm.com_quotasubtree-v1.yaml +++ /dev/null @@ -1,62 +0,0 @@ -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: quotasubtrees.ibm.com - finalizers: [] -spec: - group: ibm.com - scope: Namespaced - names: - kind: QuotaSubtree - singular: quotasubtree - plural: quotasubtrees - shortNames: - - qst - versions: - - name: v1 - served: true - storage: true - subresources: - status: {} - schema: - 
openAPIV3Schema: - type: object - properties: - status: - type: object - x-kubernetes-preserve-unknown-fields: true - spec: - type: object - properties: - parent: - type: string - parentNamespace: - type: string - children: - type: array - items: - type: object - properties: - name: - type: string - namespace: - type: string - quotas: - type: object - properties: - disabled: - type: boolean - hardLimit: - type: boolean - requests: - type: object - properties: - cpu: - x-kubernetes-int-or-string: true - pattern: '^[0-9]*(m)?$' - memory: - x-kubernetes-int-or-string: true - pattern: '^[0-9]*(Ei|Pi|Ti|Gi|Mi|Ki|E|P|T|G|M|K)?$' - nvidia.com/gpu: - x-kubernetes-int-or-string: true - pattern: '^[0-9]?$' diff --git a/deployment/mcad-controller/crds/ibm.com_quotasubtrees.yaml b/deployment/mcad-controller/crds/ibm.com_quotasubtrees.yaml new file mode 100644 index 000000000..a410ab505 --- /dev/null +++ b/deployment/mcad-controller/crds/ibm.com_quotasubtrees.yaml @@ -0,0 +1,122 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.9.2 + creationTimestamp: null + name: quotasubtrees.ibm.com +spec: + group: ibm.com + names: + kind: QuotaSubtree + listKind: QuotaSubtreeList + plural: quotasubtrees + singular: quotasubtree + scope: Namespaced + versions: + - name: v1 + schema: + openAPIV3Schema: + description: QuotaSubtree is a specification for a quota subtree resource + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. 
Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: QuotaSubtreeSpec is the spec for a resource plan + properties: + children: + items: + description: Child is the spec for a QuotaSubtree resource + properties: + name: + type: string + namespace: + type: string + path: + type: string + quotas: + description: Quota is the spec for a QuotaSubtree resource + properties: + disabled: + type: boolean + hardLimit: + type: boolean + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + type: object + type: object + type: array + parent: + type: string + parentNamespace: + type: string + type: object + status: + description: QuotaSubtreeStatus is the status for a QuotaSubtree resource + properties: + children: + items: + description: ResourceAllocation is the spec for the child status + properties: + allocated: + description: ResourceAllocationStatus is the spec for the child + resource usage + properties: + requests: + additionalProperties: + type: string + type: object + type: object + name: + type: string + namespace: + type: string + path: + type: string + type: object + type: array + totalAllocation: + description: ResourceAllocation is the spec for the child status + properties: + allocated: + description: ResourceAllocationStatus is the spec for the child + resource usage + properties: + requests: + additionalProperties: + type: string + type: object + type: object + name: + type: string + namespace: + type: string + path: + type: string + type: object + required: + - children + - totalAllocation + type: object + required: + - spec + type: 
object + served: true + storage: true diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml index 0f7598754..4b58618ec 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml @@ -1,10 +1,10 @@ - --- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.4.1 + controller-gen.kubebuilder.io/version: v0.9.2 + creationTimestamp: null name: appwrappers.mcad.ibm.com spec: group: mcad.ibm.com @@ -53,22 +53,10 @@ spec: resource type format: int32 type: integer - apiVersion: - description: 'APIVersion defines the versioned schema of - this representation of an object. Servers should convert - recognized schemas to the latest internal value, and may - reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' - type: string completionstatus: - description: Optional field that drives completion status of appwrapper. - This field within an item of an appwrapper determines the full state of appwrapper. - The completionstatus field contains a list of conditions that make the associate item considered - completed, for instance :- completion conditions could be "Complete" or "Failed". - The associated item's level .status.conditions[].type field is monitored for any one of these conditions. Once all items with this - option is set and the conditionstatus is met the entire appwrapper state will be changed to one of the valid appwrapper completion state. Note :- this is an AND - operation for all items where this option is set. See the list of appwrapper states for a list of valid complete states. 
+ description: Optional field for users to determine completion + status of item type: string - custompodresources: description: Optional section that specifies resource requirements for non-standard k8s resources, follows same format as @@ -109,14 +97,6 @@ spec: type: object x-kubernetes-embedded-resource: true x-kubernetes-preserve-unknown-fields: true - kind: - description: 'Kind is a string value representing the REST - resource this object represents. Servers may infer this - from the endpoint the client submits requests to. Cannot - be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - metadata: - type: object minavailable: description: The minimal available pods to run for this AppWrapper; the default value is nil @@ -124,8 +104,8 @@ spec: type: integer priority: description: The priority of this resource - format: int32 - type: integer + format: float + type: number priorityslope: description: The increasing rate of priority value for this resource @@ -147,29 +127,14 @@ spec: resource type format: int32 type: integer - apiVersion: - description: 'APIVersion defines the versioned schema of - this representation of an object. Servers should convert - recognized schemas to the latest internal value, and may - reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' - type: string - kind: - description: 'Kind is a string value representing the REST - resource this object represents. Servers may infer this - from the endpoint the client submits requests to. Cannot - be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - metadata: - type: object minavailable: description: The minimal available pods to run for this AppWrapper; the default value is nil format: int32 type: integer priority: - description: The priority of this resource - format: int32 - type: integer + format: float + type: number priorityslope: description: The increasing rate of priority value for this resource @@ -190,6 +155,7 @@ spec: Pod, a ReplicaSet, a ... ?) type: string required: + - priorityslope - template type: object type: array @@ -252,67 +218,130 @@ spec: field is planned to be removed in 1.21 release." type: string type: object + required: + - metadata type: object schedulingSpec: - description: Field 'schedulingSpec' specifies the parameters used for scheduling generic items - wrapped inside AppWrappers. It defines the policy for requeuing jobs based on the number - of running pods. + description: SchedSpec specifies the parameters for scheduling. properties: - minAvailable: - description: Expected number of pods in running and/or completed state. Requeuing is triggered - when the number of running/completed pods is not equal to this value. When not specified, - requeuing is disabled and no check is performed. - type: integer - requeuing: - description: Specification of the requeuing strategy based on waiting time. Values in this field - control how often the pod check should happen and if requeuing has reached its maximum number of - times. + clusterScheduling: properties: - initialTimeInSeconds: - description: Value to keep track of the initial wait time. Users cannot set this as it is - taken from 'timeInSeconds'. - type: integer - timeInSeconds: - description: Initial waiting time before requeuing conditions are checked. This value is - specified by the user, but it may grow as requeuing events happen. 
- type: integer - default: 300 - maxTimeInSeconds: - description: Maximum waiting time for requeuing checks - type: integer - default: 0 - growthType: - description: Growth strategy to increase the waiting time between requeuing checks. The values - available are 'exponential', 'linear', or 'none'. For example, 'exponential' growth would - double the 'timeInSeconds' value every time a requeuing event is triggered. If the string value - is misspelled or not one of the possible options, the growth behavior is defaulted to 'none'. - type: string - default: "exponential" - numRequeuings: - description: Field to keep track of how many times a requeuing event has been triggered - type: integer - default: 0 - maxNumRequeuings: - description: Maximum number of requeuing events allowed. Once this value is reached (e.g., - 'numRequeuings = maxNumRequeuings', no more requeuing checks are performed and the generic - items are stopped and removed from the cluster (AppWrapper remains deployed). - type: integer - default: 0 + clusterSelector: + description: A label selector is a label query over a set + of resources. The result of matchLabels and matchExpressions + are ANDed. An empty label selector matches all objects. + A null label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, NotIn, + Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values array + must be non-empty. 
If the operator is Exists or + DoesNotExist, the values array must be empty. + This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field is + "key", the operator is "In", and the values array contains + only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + clusters: + items: + properties: + name: + type: string + required: + - name + type: object + type: array type: object dispatchDuration: - description: Wall clock duration time of appwrapper in seconds properties: expected: - format: int32 type: integer limit: - format: int32 type: integer + overrun: + type: boolean + type: object + dispatchingWindow: + properties: + end: + properties: + desiredTimestamp: + format: date-time + type: string + maxTimestamp: + format: date-time + type: string + minTimestamp: + format: date-time + type: string + type: object + start: + properties: + desiredTimestamp: + format: date-time + type: string + maxTimestamp: + format: date-time + type: string + minTimestamp: + format: date-time + type: string + type: object type: object + minAvailable: + type: integer nodeSelector: additionalProperties: type: string type: object + requeuing: + properties: + growthType: + type: string + initialTimeInSeconds: + type: integer + maxNumRequeuings: + type: integer + maxTimeInSeconds: + type: integer + numRequeuings: + type: integer + timeInSeconds: + type: integer + type: object type: object selector: description: A label selector is a label query over a set of resources. @@ -361,6 +390,7 @@ spec: are ANDed. 
type: object type: object + x-kubernetes-map-type: atomic service: description: AppWrapperService is App Wrapper service definition properties: @@ -730,49 +760,11 @@ spec: - type type: object type: array - pendingpodconditions: - description: Represent conditions of pod(s) that failed scheduling. - items: - description: Describes scheduling failed condition(s) of a pod - properties: - podname: - description: Name of the pod - type: string - conditions: - description: Failed condition(s) of a pod - items: - description: DeploymentCondition describes the state of a deployment - at a certain point. - properties: - LastProbeTime: - description: Last time the condition transitioned from one status - to another. - format: date-time - type: string - LastTransitionTime: - description: The last time this condition was updated. - format: date-time - type: string - message: - description: A human readable message indicating details about - the transition. - type: string - reason: - description: The reason for the condition's last transition. - type: string - status: - description: Status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of appwrapper condition. - type: string - required: - - status - - type - type: object - type: array - type: object - type: array + controllerfirstdispatchtimestamp: + description: Microsecond level timestamp when controller first sets + appwrapper in state Running + format: date-time + type: string controllerfirsttimestamp: description: Microsecond level timestamp when controller first sees QueueJob (by Informer) @@ -811,6 +803,50 @@ spec: description: The number of pending pods. format: int32 type: integer + pendingpodconditions: + description: Represents the latest available observations of pods + under appwrapper + items: + properties: + conditions: + items: + description: PodCondition contains details for the current + condition of this pod. 
+ properties: + lastProbeTime: + description: Last time we probed the condition. + format: date-time + type: string + lastTransitionTime: + description: Last time the condition transitioned from + one status to another. + format: date-time + type: string + message: + description: Human-readable message indicating details + about last transition. + type: string + reason: + description: Unique, one-word, CamelCase reason for the + condition's last transition. + type: string + status: + description: 'Status is the status of the condition. Can + be True, False, Unknown. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#pod-conditions' + type: string + type: + description: 'Type is the type of the condition. More + info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#pod-conditions' + type: string + required: + - status + - type + type: object + type: array + podname: + type: string + type: object + type: array queuejobstate: description: State of QueueJob - Init, Queueing, HeadOfLine, Rejoining, ... 
@@ -834,9 +870,13 @@ spec: (is this different from the MinAvailable from JobStatus) format: int32 type: integer + required: + - pendingpodconditions type: object required: - spec type: object served: true storage: true + subresources: + status: {} diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml index 6c60855de..22b7b435c 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml @@ -1,10 +1,10 @@ - --- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.4.1 + controller-gen.kubebuilder.io/version: v0.9.2 + creationTimestamp: null name: queuejobs.mcad.ibm.com spec: group: mcad.ibm.com @@ -33,70 +33,131 @@ spec: type: object spec: description: Specification of the desired behavior of a cron job, including - the minAvailable and the requeuing strategy + the minAvailable properties: schedulerName: type: string schedulingSpec: - description: Field 'schedulingSpec' specifies the parameters used for scheduling generic items - wrapped inside AppWrappers. It defines the policy for requeuing jobs based on the number - of running pods. + description: SchedSpec specifies the parameters for scheduling. properties: - minAvailable: - description: Expected number of pods in running and/or completed state. Requeuing is triggered - when the number of running/completed pods is not equal to this value. When not specified, - requeuing is disabled and no check is performed. - type: integer - requeuing: - description: Specification of the requeuing strategy based on waiting time. Values in this field - control how often the pod check should happen and if requeuing has reached its maximum number of - times. + clusterScheduling: properties: - initialTimeInSeconds: - description: Value to keep track of the initial wait time. 
Users cannot set this as it is - taken from 'timeInSeconds'. - type: integer - timeInSeconds: - description: Initial waiting time before requeuing conditions are checked. This value is - specified by the user, but it may grow as requeuing events happen. - type: integer - default: 300 - maxTimeInSeconds: - description: Maximum waiting time for requeuing checks - type: integer - default: 0 - growthType: - description: Growth strategy to increase the waiting time between requeuing checks. The values - available are 'exponential', 'linear', or 'none'. For example, 'exponential' growth would - double the 'timeInSeconds' value every time a requeuing event is triggered. If the string value - is misspelled or not one of the possible options, the growth behavior is defaulted to 'none'. - type: string - default: "exponential" - numRequeuings: - description: Field to keep track of how many times a requeuing event has been triggered - type: integer - default: 0 - maxNumRequeuings: - description: Maximum number of requeuing events allowed. Once this value is reached (e.g., - 'numRequeuings = maxNumRequeuings', no more requeuing checks are performed and the generic - items are stopped and removed from the cluster (AppWrapper remains deployed). - type: integer - default: 0 + clusterSelector: + description: A label selector is a label query over a set + of resources. The result of matchLabels and matchExpressions + are ANDed. An empty label selector matches all objects. + A null label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. 
+ type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, NotIn, + Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values array + must be non-empty. If the operator is Exists or + DoesNotExist, the values array must be empty. + This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field is + "key", the operator is "In", and the values array contains + only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + clusters: + items: + properties: + name: + type: string + required: + - name + type: object + type: array type: object dispatchDuration: - description: Wall clock duration time of appwrapper in seconds properties: expected: - format: int32 type: integer limit: - format: int32 type: integer + overrun: + type: boolean type: object + dispatchingWindow: + properties: + end: + properties: + desiredTimestamp: + format: date-time + type: string + maxTimestamp: + format: date-time + type: string + minTimestamp: + format: date-time + type: string + type: object + start: + properties: + desiredTimestamp: + format: date-time + type: string + maxTimestamp: + format: date-time + type: string + minTimestamp: + format: date-time + type: string + type: object + type: object + minAvailable: + type: integer nodeSelector: additionalProperties: type: string type: object + requeuing: + properties: + growthType: + type: string + initialTimeInSeconds: + type: integer + maxNumRequeuings: + type: integer + maxTimeInSeconds: + type: integer + numRequeuings: + 
type: integer + timeInSeconds: + type: integer + type: object type: object taskSpecs: description: TaskSpecs specifies the task specification of QueueJob @@ -153,6 +214,7 @@ spec: only "value". The requirements are ANDed. type: object type: object + x-kubernetes-map-type: atomic template: description: Specifies the pod that will be created for this TaskSpec when executing a QueueJob @@ -497,6 +559,7 @@ spec: are ANDed. type: object type: object + x-kubernetes-map-type: atomic namespaces: description: namespaces specifies which namespaces the labelSelector @@ -612,6 +675,7 @@ spec: ANDed. type: object type: object + x-kubernetes-map-type: atomic namespaces: description: namespaces specifies which namespaces the labelSelector applies @@ -728,6 +792,7 @@ spec: are ANDed. type: object type: object + x-kubernetes-map-type: atomic namespaces: description: namespaces specifies which namespaces the labelSelector @@ -843,6 +908,7 @@ spec: ANDed. type: object type: object + x-kubernetes-map-type: atomic namespaces: description: namespaces specifies which namespaces the labelSelector applies @@ -5110,6 +5176,7 @@ spec: "value". The requirements are ANDed. type: object type: object + x-kubernetes-map-type: atomic maxSkew: description: 'MaxSkew describes the degree to which pods may be unevenly distributed. When @@ -5145,12 +5212,12 @@ spec: spread constraint. - DoNotSchedule (default) tells the scheduler not to schedule it. - ScheduleAnyway tells the scheduler to schedule the pod in any - location, but giving higher precedence to - topologies that would help reduce the skew. - A constraint is considered "Unsatisfiable" for - an incoming pod if and only if every possible - node assigment for that pod would violate "MaxSkew" - on some topology. For example, in a 3-zone cluster, + location, but giving higher precedence to topologies + that would help reduce the skew. 
A constraint + is considered "Unsatisfiable" for an incoming + pod if and only if every possible node assigment + for that pod would violate "MaxSkew" on some + topology. For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same labelSelector spread as 3/1/1: | zone1 | zone2 | zone3 | | P P P | P | P | If WhenUnsatisfiable @@ -5625,12 +5692,12 @@ spec: pod is removed. \n Use this if: a) the volume is only needed while the pod runs, b) features of normal volumes like restoring from snapshot - or capacity tracking are needed, c) the storage + or capacity tracking are needed, c) the storage driver is specified through a storage class, and d) the storage driver supports dynamic volume - provisioning through a PersistentVolumeClaim - (see EphemeralVolumeSource for more information - on the connection between this volume type and + provisioning through a PersistentVolumeClaim + (see EphemeralVolumeSource for more information + on the connection between this volume type and PersistentVolumeClaim). \n Use PersistentVolumeClaim or one of the vendor-specific APIs for volumes that persist for longer than the lifecycle of @@ -5825,6 +5892,7 @@ spec: are ANDed. type: object type: object + x-kubernetes-map-type: atomic storageClassName: description: 'Name of the StorageClass required by the claim. 
More info: diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml index 3de2af124..9c907409d 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml @@ -1,10 +1,10 @@ - --- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.4.1 + controller-gen.kubebuilder.io/version: v0.9.2 + creationTimestamp: null name: schedulingspecs.mcad.ibm.com spec: group: mcad.ibm.com @@ -33,70 +33,126 @@ spec: type: object spec: properties: - minAvailable: - description: Expected number of pods in running and/or completed state. Requeuing is triggered - when the number of running/completed pods is not equal to this value. When not specified, - requeuing is disabled and no check is performed. - type: integer - requeuing: - description: Specification of the requeuing strategy based on waiting time. Values in this field - control how often the pod check should happen and if requeuing has reached its maximum number of - times. + clusterScheduling: properties: - initialTimeInSeconds: - description: Value to keep track of the initial wait time. Users cannot set this as it is - taken from 'timeInSeconds'. - type: integer - timeInSeconds: - description: Initial waiting time before requeuing conditions are checked. This value is - specified by the user, but it may grow as requeuing events happen. - type: integer - default: 300 - maxTimeInSeconds: - description: Maximum waiting time for requeuing checks - type: integer - default: 0 - growthType: - description: Growth strategy to increase the waiting time between requeuing checks. The values - available are 'exponential', 'linear', or 'none'. For example, 'exponential' growth would - double the 'timeInSeconds' value every time a requeuing event is triggered. 
If the string value - is misspelled or not one of the possible options, the growth behavior is defaulted to 'none'. - type: string - default: "exponential" - numRequeuings: - description: Field to keep track of how many times a requeuing event has been triggered - type: integer - default: 0 - maxNumRequeuings: - description: Maximum number of requeuing events allowed. Once this value is reached (e.g., - 'numRequeuings = maxNumRequeuings', no more requeuing checks are performed and the generic - items are stopped and removed from the cluster (AppWrapper remains deployed). - type: integer - default: 0 + clusterSelector: + description: A label selector is a label query over a set of resources. + The result of matchLabels and matchExpressions are ANDed. An + empty label selector matches all objects. A null label selector + matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that relates + the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, NotIn, + Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. If + the operator is In or NotIn, the values array must + be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced + during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. 
A + single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field is "key", + the operator is "In", and the values array contains only + "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + clusters: + items: + properties: + name: + type: string + required: + - name + type: object + type: array type: object dispatchDuration: - description: Wall clock duration time of appwrapper in seconds properties: expected: - format: int32 type: integer limit: - format: int32 type: integer + overrun: + type: boolean + type: object + dispatchingWindow: + properties: + end: + properties: + desiredTimestamp: + format: date-time + type: string + maxTimestamp: + format: date-time + type: string + minTimestamp: + format: date-time + type: string + type: object + start: + properties: + desiredTimestamp: + format: date-time + type: string + maxTimestamp: + format: date-time + type: string + minTimestamp: + format: date-time + type: string + type: object type: object + minAvailable: + type: integer nodeSelector: additionalProperties: type: string type: object + requeuing: + properties: + growthType: + type: string + initialTimeInSeconds: + type: integer + maxNumRequeuings: + type: integer + maxTimeInSeconds: + type: integer + numRequeuings: + type: integer + timeInSeconds: + type: integer + type: object type: object required: - metadata type: object served: true storage: true -status: - acceptedNames: - kind: "" - plural: "" - conditions: [] - storedVersions: [] diff --git a/pkg/apis/controller/v1beta1/appwrapper.go b/pkg/apis/controller/v1beta1/appwrapper.go index 01bb87b32..66f25bdc5 100644 --- a/pkg/apis/controller/v1beta1/appwrapper.go +++ b/pkg/apis/controller/v1beta1/appwrapper.go @@ -1,19 +1,4 @@ /* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + package v1beta1 import ( @@ -42,11 +28,10 @@ const AppWrapperPlural string = "appwrappers" // which AppWrapper it belongs to. const AppWrapperAnnotationKey = "appwrapper.mcad.ibm.com/appwrapper-name" -//+kubebuilder:object:root=true -//+kubebuilder:subresource:status +// +kubebuilder:subresource:status +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object // Definition of AppWrapper class -// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object type AppWrapper struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"` @@ -54,10 +39,9 @@ type AppWrapper struct { Status AppWrapperStatus `json:"status,omitempty"` } -//+kubebuilder:object:root=true +// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object // AppWrapperList is a collection of AppWrappers. 
-// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object type AppWrapperList struct { metav1.TypeMeta `json:",inline"` metav1.ListMeta `json:"metadata"` @@ -101,10 +85,8 @@ type AppWrapperService struct { } // AppWrapperResource is App Wrapper aggregation resource -//todo: To be depricated +// TODO: To be deprecated type AppWrapperResource struct { - metav1.TypeMeta `json:",inline"` - metav1.ObjectMeta `json:"metadata"` // Replicas is the number of desired replicas Replicas int32 `json:"replicas,omitempty" protobuf:"bytes,2,opt,name=replicas"` @@ -125,20 +107,18 @@ type AppWrapperResource struct { // +kubebuilder:validation:Format=float PrioritySlope float64 `json:"priorityslope"` - //The type of the resource (is the resource a Pod, a ReplicaSet, a ... ?) + // The type of the resource (is the resource a Pod, a ReplicaSet, a ... ?) // +optional Type ResourceType `json:"type"` - //The template for the resource; it is now a raw text because we don't know for what resource - //it should be instantiated + // The template for the resource; it is now a raw text because we don't know for what resource + // it should be instantiated // +kubebuilder:pruning:PreserveUnknownFields Template runtime.RawExtension `json:"template"` } // AppWrapperResource is App Wrapper aggregation resource type AppWrapperGenericResource struct { - metav1.TypeMeta `json:",inline"` - metav1.ObjectMeta `json:"metadata"` // Replicas is the number of desired replicas DesiredAvailable int32 `json:"replicas,omitempty" protobuf:"bytes,2,opt,name=desiredavailable"` @@ -161,25 +141,25 @@ type AppWrapperGenericResource struct { // +kubebuilder:validation:Format=float PrioritySlope float64 `json:"priorityslope"` - //The template for the resource; it is now a raw text because we don't know for what resource - //it should be instantiated + // The template for the resource; it is now a raw text because we don't know for what resource + // it should be instantiated // +optional // 
+kubebuilder:pruning:PreserveUnknownFields // +kubebuilder:validation:EmbeddedResource GenericTemplate runtime.RawExtension `json:"generictemplate"` - //Optional section that specifies resource requirements for non-standard k8s resources, follows same format as that + // Optional section that specifies resource requirements for non-standard k8s resources, follows same format as that // of standard k8s resources CustomPodResources []CustomPodResourceTemplate `json:"custompodresources,omitempty"` - //Optional field for users to determine completion status of item + // Optional field for users to determine completion status of item CompletionStatus string `json:"completionstatus,omitempty"` } type CustomPodResourceTemplate struct { Replicas int `json:"replicas"` - //todo: replace with - //Containers []Container Contain v1.ResourceRequirements + // todo: replace with + // Containers []Container Contain v1.ResourceRequirements Requests v1.ResourceList `json:"requests"` // +optional @@ -224,13 +204,13 @@ type AppWrapperStatus struct { // +optional MinAvailable int32 `json:"template,omitempty" protobuf:"bytes,4,opt,name=template"` - //Can run? + // Can run? CanRun bool `json:"canrun,omitempty" protobuf:"bytes,1,opt,name=canrun"` - //Is Dispatched? + // Is Dispatched? 
IsDispatched bool `json:"isdispatched,omitempty" protobuf:"bytes,1,opt,name=isdispatched"` - //State - Pending, Running, Failed, Deleted + // State - Pending, Running, Failed, Deleted State AppWrapperState `json:"state,omitempty"` Message string `json:"message,omitempty"` @@ -275,7 +255,7 @@ type AppWrapperStatus struct { type AppWrapperState string -//enqueued, active, deleting, succeeded, failed +// enqueued, active, deleting, succeeded, failed const ( AppWrapperStateEnqueued AppWrapperState = "Pending" AppWrapperStateActive AppWrapperState = "Running" diff --git a/pkg/apis/controller/v1beta1/doc.go b/pkg/apis/controller/v1beta1/doc.go index 9e57c941f..e064d70e7 100644 --- a/pkg/apis/controller/v1beta1/doc.go +++ b/pkg/apis/controller/v1beta1/doc.go @@ -1,19 +1,4 @@ /* -Copyright 2017 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,5 +13,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ + // +k8s:deepcopy-gen=package +// +kubebuilder:object:generate=true +// +groupName=mcad.ibm.com + package v1beta1 diff --git a/pkg/apis/quotaplugins/quotasubtree/v1/doc.go b/pkg/apis/quotaplugins/quotasubtree/v1/doc.go index 632034959..a5f1a2300 100755 --- a/pkg/apis/quotaplugins/quotasubtree/v1/doc.go +++ b/pkg/apis/quotaplugins/quotasubtree/v1/doc.go @@ -1,2 +1,21 @@ +/* +Copyright 2023 The Multi-Cluster App Dispatcher Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + // +k8s:deepcopy-gen=package +// +kubebuilder:object:generate=true +// +groupName=ibm.com + package v1 From 5bec02f69e2ac0dc0c075d10fd64278f1da5bc02 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Mon, 3 Jul 2023 18:06:28 +0200 Subject: [PATCH 02/10] api: Update API documentation --- .../crd/bases/mcad.ibm.com_appwrappers.yaml | 104 ++++++++++++----- config/crd/bases/mcad.ibm.com_queuejobs.yaml | 34 ++++++ .../bases/mcad.ibm.com_schedulingspecs.yaml | 32 ++++++ .../crds/mcad.ibm.com_appwrappers.yaml | 106 +++++++++++++----- .../crds/mcad.ibm.com_queuejobs.yaml | 34 ++++++ .../crds/mcad.ibm.com_schedulingspecs.yaml | 32 ++++++ pkg/apis/controller/v1beta1/appwrapper.go | 52 +++++---- pkg/apis/controller/v1beta1/schedulingspec.go | 61 ++++++---- 8 files changed, 353 insertions(+), 102 deletions(-) diff --git a/config/crd/bases/mcad.ibm.com_appwrappers.yaml b/config/crd/bases/mcad.ibm.com_appwrappers.yaml index 4cae0914f..a6f1eeaec 100644 --- a/config/crd/bases/mcad.ibm.com_appwrappers.yaml +++ 
b/config/crd/bases/mcad.ibm.com_appwrappers.yaml @@ -54,13 +54,24 @@ spec: format: int32 type: integer completionstatus: - description: Optional field for users to determine completion - status of item + description: 'Optional field that drives completion status + of this AppWrapper. This field within an item of an AppWrapper + determines the full state of the AppWrapper. The completionstatus + field contains a list of conditions that make the associate + item considered completed, for instance: - completion + conditions could be "Complete" or "Failed". The associated + item''s level .status.conditions[].type field is monitored + for any one of these conditions. Once all items with this + option is set and the conditionstatus is met the entire + AppWrapper state will be changed to one of the valid AppWrapper + completion state. Note: - this is an AND operation for + all items where this option is set. See the list of AppWrapper + states for a list of valid complete states.' type: string custompodresources: description: Optional section that specifies resource requirements for non-standard k8s resources, follows same format as - that of standard k8s resources + that of standard k8s resources. items: properties: limits: @@ -104,8 +115,8 @@ spec: type: integer priority: description: The priority of this resource - format: float - type: number + format: int32 + type: integer priorityslope: description: The increasing rate of priority value for this resource @@ -133,8 +144,9 @@ spec: format: int32 type: integer priority: - format: float - type: number + description: The priority of this resource + format: int32 + type: integer priorityslope: description: The increasing rate of priority value for this resource @@ -155,7 +167,6 @@ spec: Pod, a ReplicaSet, a ... ?) type: string required: - - priorityslope - template type: object type: array @@ -222,7 +233,9 @@ spec: - metadata type: object schedulingSpec: - description: SchedSpec specifies the parameters for scheduling. 
+ description: SchedSpec specifies the parameters used for scheduling + generic items wrapped inside AppWrappers. It defines the policy + for requeuing jobs based on the number of running pods. properties: clusterScheduling: properties: @@ -286,6 +299,7 @@ spec: type: array type: object dispatchDuration: + description: Wall clock duration time of appwrapper in seconds. properties: expected: type: integer @@ -322,24 +336,57 @@ spec: type: object type: object minAvailable: + description: Expected number of pods in running and/or completed + state. Requeuing is triggered when the number of running/completed + pods is not equal to this value. When not specified, requeuing + is disabled and no check is performed. type: integer nodeSelector: additionalProperties: type: string type: object requeuing: + description: Specification of the requeuing strategy based on + waiting time. Values in this field control how often the pod + check should happen, and if requeuing has reached its maximum + number of times. properties: growthType: + default: exponential + description: Growth strategy to increase the waiting time + between requeuing checks. The values available are 'exponential', + 'linear', or 'none'. For example, 'exponential' growth would + double the 'timeInSeconds' value every time a requeuing + event is triggered. If the string value is misspelled or + not one of the possible options, the growth behavior is + defaulted to 'none'. type: string initialTimeInSeconds: + description: Value to keep track of the initial wait time. + Users cannot set this as it is taken from 'timeInSeconds'. type: integer maxNumRequeuings: + default: 0 + description: Maximum number of requeuing events allowed. Once + this value is reached (e.g., 'numRequeuings = maxNumRequeuings', + no more requeuing checks are performed and the generic items + are stopped and removed from the cluster (AppWrapper remains + deployed). 
type: integer maxTimeInSeconds: + default: 0 + description: Maximum waiting time for requeuing checks. type: integer numRequeuings: + default: 0 + description: Field to keep track of how many times a requeuing + event has been triggered. type: integer timeInSeconds: + default: 300 + description: Initial waiting time before requeuing conditions + are checked. This value is specified by the user, but it + may grow as requeuing events happen. type: integer type: object type: object @@ -727,10 +774,10 @@ spec: description: Can run? type: boolean conditions: - description: Represents the latest available observations of a appwrapper's + description: Represents the latest available observations of the AppWrapper's current condition. items: - description: DeploymentCondition describes the state of a deployment + description: AppWrapperCondition describes the state of an AppWrapper at a certain point. properties: lastTransitionMicroTime: @@ -743,7 +790,7 @@ spec: format: date-time type: string message: - description: A human readable message indicating details about + description: A human-readable message indicating details about the transition. type: string reason: @@ -762,7 +809,7 @@ spec: type: array controllerfirstdispatchtimestamp: description: Microsecond level timestamp when controller first sets - appwrapper in state Running + the AppWrapper in state Running format: date-time type: string controllerfirsttimestamp: @@ -787,25 +834,13 @@ spec: type: boolean message: type: string - totalcpu: - description: The number of cpu consumed by all pods belonging to an appwrapper. - format: int32 - type: integer - totalmemory: - description: The amount of memory consumed by all pods belonging to an appwrapper. - format: int32 - type: integer - totalgpu: - description: The total number of GPUs consumed by all pods belonging to an appwrapper. - format: int32 - type: integer pending: description: The number of pending pods. 
format: int32 type: integer pendingpodconditions: description: Represents the latest available observations of pods - under appwrapper + belonging to the AppWrapper. items: properties: conditions: @@ -870,8 +905,21 @@ spec: (is this different from the MinAvailable from JobStatus) format: int32 type: integer - required: - - pendingpodconditions + totalcpu: + description: The number of CPU consumed by all pods belonging to the + AppWrapper. + format: int32 + type: integer + totalgpu: + description: The total number of GPUs consumed by all pods belonging + to the AppWrapper. + format: int64 + type: integer + totalmemory: + description: The amount of memory consumed by all pods belonging to + the AppWrapper. + format: int32 + type: integer type: object required: - spec diff --git a/config/crd/bases/mcad.ibm.com_queuejobs.yaml b/config/crd/bases/mcad.ibm.com_queuejobs.yaml index 22b7b435c..2474d207d 100644 --- a/config/crd/bases/mcad.ibm.com_queuejobs.yaml +++ b/config/crd/bases/mcad.ibm.com_queuejobs.yaml @@ -102,6 +102,7 @@ spec: type: array type: object dispatchDuration: + description: Wall clock duration time of appwrapper in seconds. properties: expected: type: integer @@ -138,24 +139,57 @@ spec: type: object type: object minAvailable: + description: Expected number of pods in running and/or completed + state. Requeuing is triggered when the number of running/completed + pods is not equal to this value. When not specified, requeuing + is disabled and no check is performed. type: integer nodeSelector: additionalProperties: type: string type: object requeuing: + description: Specification of the requeuing strategy based on + waiting time. Values in this field control how often the pod + check should happen, and if requeuing has reached its maximum + number of times. properties: growthType: + default: exponential + description: Growth strategy to increase the waiting time + between requeuing checks. The values available are 'exponential', + 'linear', or 'none'. 
For example, 'exponential' growth would + double the 'timeInSeconds' value every time a requeuing + event is triggered. If the string value is misspelled or + not one of the possible options, the growth behavior is + defaulted to 'none'. type: string initialTimeInSeconds: + description: Value to keep track of the initial wait time. + Users cannot set this as it is taken from 'timeInSeconds'. type: integer maxNumRequeuings: + default: 0 + description: Maximum number of requeuing events allowed. Once + this value is reached (e.g., 'numRequeuings = maxNumRequeuings', + no more requeuing checks are performed and the generic items + are stopped and removed from the cluster (AppWrapper remains + deployed). type: integer maxTimeInSeconds: + default: 0 + description: Maximum waiting time for requeuing checks. type: integer numRequeuings: + default: 0 + description: Field to keep track of how many times a requeuing + event has been triggered. type: integer timeInSeconds: + default: 300 + description: Initial waiting time before requeuing conditions + are checked. This value is specified by the user, but it + may grow as requeuing events happen. type: integer type: object type: object diff --git a/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml b/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml index 9c907409d..b96fe7d70 100644 --- a/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml +++ b/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml @@ -94,6 +94,7 @@ spec: type: array type: object dispatchDuration: + description: Wall clock duration time of appwrapper in seconds. properties: expected: type: integer @@ -130,24 +131,55 @@ spec: type: object type: object minAvailable: + description: Expected number of pods in running and/or completed state. + Requeuing is triggered when the number of running/completed pods + is not equal to this value. When not specified, requeuing is disabled + and no check is performed. 
type: integer nodeSelector: additionalProperties: type: string type: object requeuing: + description: Specification of the requeuing strategy based on waiting + time. Values in this field control how often the pod check should + happen, and if requeuing has reached its maximum number of times. properties: growthType: + default: exponential + description: Growth strategy to increase the waiting time between + requeuing checks. The values available are 'exponential', 'linear', + or 'none'. For example, 'exponential' growth would double the + 'timeInSeconds' value every time a requeuing event is triggered. + If the string value is misspelled or not one of the possible + options, the growth behavior is defaulted to 'none'. type: string initialTimeInSeconds: + description: Value to keep track of the initial wait time. Users + cannot set this as it is taken from 'timeInSeconds'. type: integer maxNumRequeuings: + default: 0 + description: Maximum number of requeuing events allowed. Once + this value is reached (e.g., 'numRequeuings = maxNumRequeuings', + no more requeuing checks are performed and the generic items + are stopped and removed from the cluster (AppWrapper remains + deployed). type: integer maxTimeInSeconds: + default: 0 + description: Maximum waiting time for requeuing checks. type: integer numRequeuings: + default: 0 + description: Field to keep track of how many times a requeuing + event has been triggered. type: integer timeInSeconds: + default: 300 + description: Initial waiting time before requeuing conditions + are checked. This value is specified by the user, but it may + grow as requeuing events happen. 
type: integer type: object type: object diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml index 4b58618ec..a6f1eeaec 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml @@ -54,13 +54,24 @@ spec: format: int32 type: integer completionstatus: - description: Optional field for users to determine completion - status of item + description: 'Optional field that drives completion status + of this AppWrapper. This field within an item of an AppWrapper + determines the full state of the AppWrapper. The completionstatus + field contains a list of conditions that make the associate + item considered completed, for instance: - completion + conditions could be "Complete" or "Failed". The associated + item''s level .status.conditions[].type field is monitored + for any one of these conditions. Once all items with this + option is set and the conditionstatus is met the entire + AppWrapper state will be changed to one of the valid AppWrapper + completion state. Note: - this is an AND operation for + all items where this option is set. See the list of AppWrapper + states for a list of valid complete states.' type: string custompodresources: description: Optional section that specifies resource requirements for non-standard k8s resources, follows same format as - that of standard k8s resources + that of standard k8s resources. 
items: properties: limits: @@ -104,8 +115,8 @@ spec: type: integer priority: description: The priority of this resource - format: float - type: number + format: int32 + type: integer priorityslope: description: The increasing rate of priority value for this resource @@ -120,7 +131,7 @@ spec: Items: items: description: 'AppWrapperResource is App Wrapper aggregation - resource todo: To be depricated' + resource TODO: To be deprecated' properties: allocatedreplicas: description: The number of allocated replicas from this @@ -133,8 +144,9 @@ spec: format: int32 type: integer priority: - format: float - type: number + description: The priority of this resource + format: int32 + type: integer priorityslope: description: The increasing rate of priority value for this resource @@ -155,7 +167,6 @@ spec: Pod, a ReplicaSet, a ... ?) type: string required: - - priorityslope - template type: object type: array @@ -222,7 +233,9 @@ spec: - metadata type: object schedulingSpec: - description: SchedSpec specifies the parameters for scheduling. + description: SchedSpec specifies the parameters used for scheduling + generic items wrapped inside AppWrappers. It defines the policy + for requeuing jobs based on the number of running pods. properties: clusterScheduling: properties: @@ -286,6 +299,7 @@ spec: type: array type: object dispatchDuration: + description: Wall clock duration time of appwrapper in seconds. properties: expected: type: integer @@ -322,24 +336,57 @@ spec: type: object type: object minAvailable: + description: Expected number of pods in running and/or completed + state. Requeuing is triggered when the number of running/completed + pods is not equal to this value. When not specified, requeuing + is disabled and no check is performed. type: integer nodeSelector: additionalProperties: type: string type: object requeuing: + description: Specification of the requeuing strategy based on + waiting time. 
Values in this field control how often the pod + check should happen, and if requeuing has reached its maximum + number of times. properties: growthType: + default: exponential + description: Growth strategy to increase the waiting time + between requeuing checks. The values available are 'exponential', + 'linear', or 'none'. For example, 'exponential' growth would + double the 'timeInSeconds' value every time a requeuing + event is triggered. If the string value is misspelled or + not one of the possible options, the growth behavior is + defaulted to 'none'. type: string initialTimeInSeconds: + description: Value to keep track of the initial wait time. + Users cannot set this as it is taken from 'timeInSeconds'. type: integer maxNumRequeuings: + default: 0 + description: Maximum number of requeuing events allowed. Once + this value is reached (e.g., 'numRequeuings = maxNumRequeuings', + no more requeuing checks are performed and the generic items + are stopped and removed from the cluster (AppWrapper remains + deployed). type: integer maxTimeInSeconds: + default: 0 + description: Maximum waiting time for requeuing checks. type: integer numRequeuings: + default: 0 + description: Field to keep track of how many times a requeuing + event has been triggered. type: integer timeInSeconds: + default: 300 + description: Initial waiting time before requeuing conditions + are checked. This value is specified by the user, but it + may grow as requeuing events happen. type: integer type: object type: object @@ -727,10 +774,10 @@ spec: description: Can run? type: boolean conditions: - description: Represents the latest available observations of a appwrapper's + description: Represents the latest available observations of the AppWrapper's current condition. items: - description: DeploymentCondition describes the state of a deployment + description: AppWrapperCondition describes the state of an AppWrapper at a certain point. 
properties: lastTransitionMicroTime: @@ -743,7 +790,7 @@ spec: format: date-time type: string message: - description: A human readable message indicating details about + description: A human-readable message indicating details about the transition. type: string reason: @@ -762,7 +809,7 @@ spec: type: array controllerfirstdispatchtimestamp: description: Microsecond level timestamp when controller first sets - appwrapper in state Running + the AppWrapper in state Running format: date-time type: string controllerfirsttimestamp: @@ -787,25 +834,13 @@ spec: type: boolean message: type: string - totalcpu: - description: The number of cpu consumed by all pods belonging to an appwrapper. - format: int32 - type: integer - totalmemory: - description: The amount of memory consumed by all pods belonging to an appwrapper. - format: int32 - type: integer - totalgpu: - description: The total number of GPUs consumed by all pods belonging to an appwrapper. - format: int32 - type: integer pending: description: The number of pending pods. format: int32 type: integer pendingpodconditions: description: Represents the latest available observations of pods - under appwrapper + belonging to the AppWrapper. items: properties: conditions: @@ -870,8 +905,21 @@ spec: (is this different from the MinAvailable from JobStatus) format: int32 type: integer - required: - - pendingpodconditions + totalcpu: + description: The number of CPU consumed by all pods belonging to the + AppWrapper. + format: int32 + type: integer + totalgpu: + description: The total number of GPUs consumed by all pods belonging + to the AppWrapper. + format: int64 + type: integer + totalmemory: + description: The amount of memory consumed by all pods belonging to + the AppWrapper. 
+ format: int32 + type: integer type: object required: - spec diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml index 22b7b435c..2474d207d 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml @@ -102,6 +102,7 @@ spec: type: array type: object dispatchDuration: + description: Wall clock duration time of appwrapper in seconds. properties: expected: type: integer @@ -138,24 +139,57 @@ spec: type: object type: object minAvailable: + description: Expected number of pods in running and/or completed + state. Requeuing is triggered when the number of running/completed + pods is not equal to this value. When not specified, requeuing + is disabled and no check is performed. type: integer nodeSelector: additionalProperties: type: string type: object requeuing: + description: Specification of the requeuing strategy based on + waiting time. Values in this field control how often the pod + check should happen, and if requeuing has reached its maximum + number of times. properties: growthType: + default: exponential + description: Growth strategy to increase the waiting time + between requeuing checks. The values available are 'exponential', + 'linear', or 'none'. For example, 'exponential' growth would + double the 'timeInSeconds' value every time a requeuing + event is triggered. If the string value is misspelled or + not one of the possible options, the growth behavior is + defaulted to 'none'. type: string initialTimeInSeconds: + description: Value to keep track of the initial wait time. + Users cannot set this as it is taken from 'timeInSeconds'. type: integer maxNumRequeuings: + default: 0 + description: Maximum number of requeuing events allowed. 
Once + this value is reached (e.g., 'numRequeuings = maxNumRequeuings', + no more requeuing checks are performed and the generic items + are stopped and removed from the cluster (AppWrapper remains + deployed). type: integer maxTimeInSeconds: + default: 0 + description: Maximum waiting time for requeuing checks. type: integer numRequeuings: + default: 0 + description: Field to keep track of how many times a requeuing + event has been triggered. type: integer timeInSeconds: + default: 300 + description: Initial waiting time before requeuing conditions + are checked. This value is specified by the user, but it + may grow as requeuing events happen. type: integer type: object type: object diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml index 9c907409d..b96fe7d70 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml @@ -94,6 +94,7 @@ spec: type: array type: object dispatchDuration: + description: Wall clock duration time of appwrapper in seconds. properties: expected: type: integer @@ -130,24 +131,55 @@ spec: type: object type: object minAvailable: + description: Expected number of pods in running and/or completed state. + Requeuing is triggered when the number of running/completed pods + is not equal to this value. When not specified, requeuing is disabled + and no check is performed. type: integer nodeSelector: additionalProperties: type: string type: object requeuing: + description: Specification of the requeuing strategy based on waiting + time. Values in this field control how often the pod check should + happen, and if requeuing has reached its maximum number of times. properties: growthType: + default: exponential + description: Growth strategy to increase the waiting time between + requeuing checks. The values available are 'exponential', 'linear', + or 'none'. 
For example, 'exponential' growth would double the + 'timeInSeconds' value every time a requeuing event is triggered. + If the string value is misspelled or not one of the possible + options, the growth behavior is defaulted to 'none'. type: string initialTimeInSeconds: + description: Value to keep track of the initial wait time. Users + cannot set this as it is taken from 'timeInSeconds'. type: integer maxNumRequeuings: + default: 0 + description: Maximum number of requeuing events allowed. Once + this value is reached (e.g., 'numRequeuings = maxNumRequeuings'), + no more requeuing checks are performed and the generic items + are stopped and removed from the cluster (AppWrapper remains + deployed). type: integer maxTimeInSeconds: + default: 0 + description: Maximum waiting time for requeuing checks. type: integer numRequeuings: + default: 0 + description: Field to keep track of how many times a requeuing + event has been triggered. type: integer timeInSeconds: + default: 300 + description: Initial waiting time before requeuing conditions + are checked. This value is specified by the user, but it may + grow as requeuing events happen. type: integer type: object type: object diff --git a/pkg/apis/controller/v1beta1/appwrapper.go b/pkg/apis/controller/v1beta1/appwrapper.go index 66f25bdc5..4337378f1 100644 --- a/pkg/apis/controller/v1beta1/appwrapper.go +++ b/pkg/apis/controller/v1beta1/appwrapper.go @@ -64,7 +64,8 @@ type AppWrapperSpec struct { Selector *metav1.LabelSelector `json:"selector,omitempty" protobuf:"bytes,1,opt,name=selector"` - // SchedSpec specifies the parameters for scheduling. + // SchedSpec specifies the parameters used for scheduling generic items wrapped inside AppWrappers. + // It defines the policy for requeuing jobs based on the number of running pods. 
SchedSpec SchedulingSpecTemplate `json:"schedulingSpec,omitempty" protobuf:"bytes,2,opt,name=schedulingSpec"` } @@ -97,14 +98,13 @@ type AppWrapperResource struct { // +optional AllocatedReplicas int32 `json:"allocatedreplicas"` - // +kubebuilder:validation:Type=number - // +kubebuilder:validation:Format=float - // +optional - Priority float64 `json:"priority,omitempty"` + // The priority of this resource + Priority int32 `json:"priority,omitempty"` // The increasing rate of priority value for this resource // +kubebuilder:validation:Type=number // +kubebuilder:validation:Format=float + // +optional PrioritySlope float64 `json:"priorityslope"` // The type of the resource (is the resource a Pod, a ReplicaSet, a ... ?) @@ -131,14 +131,12 @@ type AppWrapperGenericResource struct { // The priority of this resource // +optional - // +kubebuilder:validation:Type=number - // +kubebuilder:validation:Format=float - Priority float64 `json:"priority"` + Priority int32 `json:"priority"` // The increasing rate of priority value for this resource - // +optional // +kubebuilder:validation:Type=number // +kubebuilder:validation:Format=float + // +optional PrioritySlope float64 `json:"priorityslope"` // The template for the resource; it is now a raw text because we don't know for what resource @@ -148,11 +146,20 @@ type AppWrapperGenericResource struct { // +kubebuilder:validation:EmbeddedResource GenericTemplate runtime.RawExtension `json:"generictemplate"` - // Optional section that specifies resource requirements for non-standard k8s resources, follows same format as that - // of standard k8s resources + // Optional section that specifies resource requirements for non-standard k8s resources, + // follows same format as that of standard k8s resources. CustomPodResources []CustomPodResourceTemplate `json:"custompodresources,omitempty"` - // Optional field for users to determine completion status of item + // Optional field that drives completion status of this AppWrapper. 
+ // This field within an item of an AppWrapper determines the full state of the AppWrapper. + // The completionstatus field contains a list of conditions that make the associated item considered + // completed, for instance: + // - completion conditions could be "Complete" or "Failed". + // The associated item's level .status.conditions[].type field is monitored for any one of these conditions. + // Once all items with this option set have met the condition status, the entire AppWrapper state will be changed to one of the valid AppWrapper completion states. + // Note: + // - this is an AND operation for all items where this option is set. + // See the list of AppWrapper states for a list of valid complete states. CompletionStatus string `json:"completionstatus,omitempty"` } @@ -226,7 +233,7 @@ type AppWrapperStatus struct { // Microsecond level timestamp when controller first sees QueueJob (by Informer) ControllerFirstTimestamp metav1.MicroTime `json:"controllerfirsttimestamp,omitempty"` - // Microsecond level timestamp when controller first sets appwrapper in state Running + // Microsecond level timestamp when controller first sets the AppWrapper in state Running ControllerFirstDispatchTimestamp metav1.MicroTime `json:"controllerfirstdispatchtimestamp,omitempty"` // Tell Informer to ignore this update message (do not generate a controller event) @@ -238,18 +245,21 @@ type AppWrapperStatus struct { // Indicate if message is a duplicate (for Informer to recognize duplicate messages) Local bool `json:"local,omitempty"` - // Represents the latest available observations of a appwrapper's current condition. 
Conditions []AppWrapperCondition `json:"conditions,omitempty"` - // Represents the latest available observations of pods under appwrapper - PendingPodConditions []PendingPodSpec `json:"pendingpodconditions"` + // Represents the latest available observations of pods belonging to the AppWrapper. + PendingPodConditions []PendingPodSpec `json:"pendingpodconditions,omitempty"` - //Resources consumed + // Resources consumed - TotalCPU float64 `json:"totalcpu,omitempty"` + // The number of CPU consumed by all pods belonging to the AppWrapper. + TotalCPU int32 `json:"totalcpu,omitempty"` - TotalMemory float64 `json:"totalmemory,omitempty"` + // The amount of memory consumed by all pods belonging to the AppWrapper. + TotalMemory int32 `json:"totalmemory,omitempty"` + // The total number of GPUs consumed by all pods belonging to the AppWrapper. TotalGPU int64 `json:"totalgpu,omitempty"` } @@ -282,7 +292,7 @@ const ( AppWrapperCondRunningHoldCompletion AppWrapperConditionType = "RunningHoldCompletion" ) -// DeploymentCondition describes the state of a deployment at a certain point. +// AppWrapperCondition describes the state of an AppWrapper at a certain point. type AppWrapperCondition struct { // Type of appwrapper condition. Type AppWrapperConditionType `json:"type"` @@ -294,7 +304,7 @@ type AppWrapperCondition struct { LastTransitionMicroTime metav1.MicroTime `json:"lastTransitionMicroTime,omitempty"` // The reason for the condition's last transition. Reason string `json:"reason,omitempty"` - // A human readable message indicating details about the transition. + // A human-readable message indicating details about the transition. Message string `json:"message,omitempty"` } diff --git a/pkg/apis/controller/v1beta1/schedulingspec.go b/pkg/apis/controller/v1beta1/schedulingspec.go index 457cd1540..01dba300f 100644 --- a/pkg/apis/controller/v1beta1/schedulingspec.go +++ b/pkg/apis/controller/v1beta1/schedulingspec.go @@ -1,19 +1,4 @@ /* -Copyright 2017 The Kubernetes Authors. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + package v1beta1 import ( @@ -46,21 +32,48 @@ type SchedulingSpec struct { } type SchedulingSpecTemplate struct { - NodeSelector map[string]string `json:"nodeSelector,omitempty" protobuf:"bytes,1,rep,name=nodeSelector"` - MinAvailable int `json:"minAvailable,omitempty" protobuf:"bytes,2,rep,name=minAvailable"` + NodeSelector map[string]string `json:"nodeSelector,omitempty" protobuf:"bytes,1,rep,name=nodeSelector"` + // Expected number of pods in running and/or completed state. + // Requeuing is triggered when the number of running/completed pods is not equal to this value. + // When not specified, requeuing is disabled and no check is performed. + MinAvailable int `json:"minAvailable,omitempty" protobuf:"bytes,2,rep,name=minAvailable"` + // Specification of the requeuing strategy based on waiting time. + // Values in this field control how often the pod check should happen, + // and if requeuing has reached its maximum number of times. 
Requeuing RequeuingTemplate `json:"requeuing,omitempty" protobuf:"bytes,1,rep,name=requeuing"` ClusterScheduling ClusterSchedulingSpec `json:"clusterScheduling,omitempty"` DispatchingWindow DispatchingWindowSpec `json:"dispatchingWindow,omitempty"` - DispatchDuration DispatchDurationSpec `json:"dispatchDuration,omitempty"` + // Wall clock duration time of appwrapper in seconds. + DispatchDuration DispatchDurationSpec `json:"dispatchDuration,omitempty"` } type RequeuingTemplate struct { - InitialTimeInSeconds int `json:"initialTimeInSeconds,omitempty" protobuf:"bytes,1,rep,name=initialTimeInSeconds"` - TimeInSeconds int `json:"timeInSeconds,omitempty" protobuf:"bytes,2,rep,name=timeInSeconds"` - MaxTimeInSeconds int `json:"maxTimeInSeconds,omitempty" protobuf:"bytes,3,rep,name=maxTimeInSeconds"` - GrowthType string `json:"growthType,omitempty" protobuf:"bytes,4,rep,name=growthType"` - NumRequeuings int `json:"numRequeuings,omitempty" protobuf:"bytes,5,rep,name=numRequeuings"` - MaxNumRequeuings int `json:"maxNumRequeuings,omitempty" protobuf:"bytes,6,rep,name=maxNumRequeuings"` + // Value to keep track of the initial wait time. + // Users cannot set this as it is taken from 'timeInSeconds'. + InitialTimeInSeconds int `json:"initialTimeInSeconds,omitempty" protobuf:"bytes,1,rep,name=initialTimeInSeconds"` + // Initial waiting time before requeuing conditions are checked. This value is + // specified by the user, but it may grow as requeuing events happen. + // +kubebuilder:default=300 + TimeInSeconds int `json:"timeInSeconds,omitempty" protobuf:"bytes,2,rep,name=timeInSeconds"` + // Maximum waiting time for requeuing checks. + // +kubebuilder:default=0 + MaxTimeInSeconds int `json:"maxTimeInSeconds,omitempty" protobuf:"bytes,3,rep,name=maxTimeInSeconds"` + // Growth strategy to increase the waiting time between requeuing checks. + // The values available are 'exponential', 'linear', or 'none'. 
+ // For example, 'exponential' growth would double the 'timeInSeconds' value + // every time a requeuing event is triggered. + // If the string value is misspelled or not one of the possible options, + // the growth behavior is defaulted to 'none'. + // +kubebuilder:default=exponential + GrowthType string `json:"growthType,omitempty" protobuf:"bytes,4,rep,name=growthType"` + // Field to keep track of how many times a requeuing event has been triggered. + // +kubebuilder:default=0 + NumRequeuings int `json:"numRequeuings,omitempty" protobuf:"bytes,5,rep,name=numRequeuings"` + // Maximum number of requeuing events allowed. Once this value is reached (e.g., + // 'numRequeuings = maxNumRequeuings', no more requeuing checks are performed and the generic + // items are stopped and removed from the cluster (AppWrapper remains deployed). + // +kubebuilder:default=0 + MaxNumRequeuings int `json:"maxNumRequeuings,omitempty" protobuf:"bytes,6,rep,name=maxNumRequeuings"` } // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object From 59be9703a7057f2022e34b8e60872b81a6c915af Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Mon, 3 Jul 2023 18:08:52 +0200 Subject: [PATCH 03/10] api: Remove clusterScheduling and dispatchingWindow fields --- .../crd/bases/mcad.ibm.com_appwrappers.yaml | 88 ------------------- config/crd/bases/mcad.ibm.com_queuejobs.yaml | 88 ------------------- .../bases/mcad.ibm.com_schedulingspecs.yaml | 87 ------------------ .../crds/mcad.ibm.com_appwrappers.yaml | 88 ------------------- .../crds/mcad.ibm.com_queuejobs.yaml | 88 ------------------- .../crds/mcad.ibm.com_schedulingspecs.yaml | 87 ------------------ pkg/apis/controller/v1beta1/schedulingspec.go | 4 +- 7 files changed, 1 insertion(+), 529 deletions(-) diff --git a/config/crd/bases/mcad.ibm.com_appwrappers.yaml b/config/crd/bases/mcad.ibm.com_appwrappers.yaml index a6f1eeaec..5ef4218ce 100644 --- a/config/crd/bases/mcad.ibm.com_appwrappers.yaml +++ 
b/config/crd/bases/mcad.ibm.com_appwrappers.yaml @@ -237,67 +237,6 @@ spec: generic items wrapped inside AppWrappers. It defines the policy for requeuing jobs based on the number of running pods. properties: - clusterScheduling: - properties: - clusterSelector: - description: A label selector is a label query over a set - of resources. The result of matchLabels and matchExpressions - are ANDed. An empty label selector matches all objects. - A null label selector matches no objects. - properties: - matchExpressions: - description: matchExpressions is a list of label selector - requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector - that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key that the selector - applies to. - type: string - operator: - description: operator represents a key's relationship - to a set of values. Valid operators are In, NotIn, - Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. - If the operator is In or NotIn, the values array - must be non-empty. If the operator is Exists or - DoesNotExist, the values array must be empty. - This array is replaced during a strategic merge - patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. - A single {key,value} in the matchLabels map is equivalent - to an element of matchExpressions, whose key field is - "key", the operator is "In", and the values array contains - only "value". The requirements are ANDed. 
- type: object - type: object - x-kubernetes-map-type: atomic - clusters: - items: - properties: - name: - type: string - required: - - name - type: object - type: array - type: object dispatchDuration: description: Wall clock duration time of appwrapper in seconds. properties: @@ -308,33 +247,6 @@ spec: overrun: type: boolean type: object - dispatchingWindow: - properties: - end: - properties: - desiredTimestamp: - format: date-time - type: string - maxTimestamp: - format: date-time - type: string - minTimestamp: - format: date-time - type: string - type: object - start: - properties: - desiredTimestamp: - format: date-time - type: string - maxTimestamp: - format: date-time - type: string - minTimestamp: - format: date-time - type: string - type: object - type: object minAvailable: description: Expected number of pods in running and/or completed state. Requeuing is triggered when the number of running/completed diff --git a/config/crd/bases/mcad.ibm.com_queuejobs.yaml b/config/crd/bases/mcad.ibm.com_queuejobs.yaml index 2474d207d..9a7344f68 100644 --- a/config/crd/bases/mcad.ibm.com_queuejobs.yaml +++ b/config/crd/bases/mcad.ibm.com_queuejobs.yaml @@ -40,67 +40,6 @@ spec: schedulingSpec: description: SchedSpec specifies the parameters for scheduling. properties: - clusterScheduling: - properties: - clusterSelector: - description: A label selector is a label query over a set - of resources. The result of matchLabels and matchExpressions - are ANDed. An empty label selector matches all objects. - A null label selector matches no objects. - properties: - matchExpressions: - description: matchExpressions is a list of label selector - requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector - that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key that the selector - applies to. 
- type: string - operator: - description: operator represents a key's relationship - to a set of values. Valid operators are In, NotIn, - Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. - If the operator is In or NotIn, the values array - must be non-empty. If the operator is Exists or - DoesNotExist, the values array must be empty. - This array is replaced during a strategic merge - patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. - A single {key,value} in the matchLabels map is equivalent - to an element of matchExpressions, whose key field is - "key", the operator is "In", and the values array contains - only "value". The requirements are ANDed. - type: object - type: object - x-kubernetes-map-type: atomic - clusters: - items: - properties: - name: - type: string - required: - - name - type: object - type: array - type: object dispatchDuration: description: Wall clock duration time of appwrapper in seconds. properties: @@ -111,33 +50,6 @@ spec: overrun: type: boolean type: object - dispatchingWindow: - properties: - end: - properties: - desiredTimestamp: - format: date-time - type: string - maxTimestamp: - format: date-time - type: string - minTimestamp: - format: date-time - type: string - type: object - start: - properties: - desiredTimestamp: - format: date-time - type: string - maxTimestamp: - format: date-time - type: string - minTimestamp: - format: date-time - type: string - type: object - type: object minAvailable: description: Expected number of pods in running and/or completed state. 
Requeuing is triggered when the number of running/completed diff --git a/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml b/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml index b96fe7d70..e91a18ffd 100644 --- a/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml +++ b/config/crd/bases/mcad.ibm.com_schedulingspecs.yaml @@ -33,66 +33,6 @@ spec: type: object spec: properties: - clusterScheduling: - properties: - clusterSelector: - description: A label selector is a label query over a set of resources. - The result of matchLabels and matchExpressions are ANDed. An - empty label selector matches all objects. A null label selector - matches no objects. - properties: - matchExpressions: - description: matchExpressions is a list of label selector - requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector - that contains values, a key, and an operator that relates - the key and values. - properties: - key: - description: key is the label key that the selector - applies to. - type: string - operator: - description: operator represents a key's relationship - to a set of values. Valid operators are In, NotIn, - Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If - the operator is In or NotIn, the values array must - be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced - during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A - single {key,value} in the matchLabels map is equivalent - to an element of matchExpressions, whose key field is "key", - the operator is "In", and the values array contains only - "value". The requirements are ANDed. 
- type: object - type: object - x-kubernetes-map-type: atomic - clusters: - items: - properties: - name: - type: string - required: - - name - type: object - type: array - type: object dispatchDuration: description: Wall clock duration time of appwrapper in seconds. properties: @@ -103,33 +43,6 @@ spec: overrun: type: boolean type: object - dispatchingWindow: - properties: - end: - properties: - desiredTimestamp: - format: date-time - type: string - maxTimestamp: - format: date-time - type: string - minTimestamp: - format: date-time - type: string - type: object - start: - properties: - desiredTimestamp: - format: date-time - type: string - maxTimestamp: - format: date-time - type: string - minTimestamp: - format: date-time - type: string - type: object - type: object minAvailable: description: Expected number of pods in running and/or completed state. Requeuing is triggered when the number of running/completed pods diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml index a6f1eeaec..5ef4218ce 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml @@ -237,67 +237,6 @@ spec: generic items wrapped inside AppWrappers. It defines the policy for requeuing jobs based on the number of running pods. properties: - clusterScheduling: - properties: - clusterSelector: - description: A label selector is a label query over a set - of resources. The result of matchLabels and matchExpressions - are ANDed. An empty label selector matches all objects. - A null label selector matches no objects. - properties: - matchExpressions: - description: matchExpressions is a list of label selector - requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector - that contains values, a key, and an operator that - relates the key and values. 
- properties: - key: - description: key is the label key that the selector - applies to. - type: string - operator: - description: operator represents a key's relationship - to a set of values. Valid operators are In, NotIn, - Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. - If the operator is In or NotIn, the values array - must be non-empty. If the operator is Exists or - DoesNotExist, the values array must be empty. - This array is replaced during a strategic merge - patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. - A single {key,value} in the matchLabels map is equivalent - to an element of matchExpressions, whose key field is - "key", the operator is "In", and the values array contains - only "value". The requirements are ANDed. - type: object - type: object - x-kubernetes-map-type: atomic - clusters: - items: - properties: - name: - type: string - required: - - name - type: object - type: array - type: object dispatchDuration: description: Wall clock duration time of appwrapper in seconds. properties: @@ -308,33 +247,6 @@ spec: overrun: type: boolean type: object - dispatchingWindow: - properties: - end: - properties: - desiredTimestamp: - format: date-time - type: string - maxTimestamp: - format: date-time - type: string - minTimestamp: - format: date-time - type: string - type: object - start: - properties: - desiredTimestamp: - format: date-time - type: string - maxTimestamp: - format: date-time - type: string - minTimestamp: - format: date-time - type: string - type: object - type: object minAvailable: description: Expected number of pods in running and/or completed state. 
Requeuing is triggered when the number of running/completed diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml index 2474d207d..9a7344f68 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml @@ -40,67 +40,6 @@ spec: schedulingSpec: description: SchedSpec specifies the parameters for scheduling. properties: - clusterScheduling: - properties: - clusterSelector: - description: A label selector is a label query over a set - of resources. The result of matchLabels and matchExpressions - are ANDed. An empty label selector matches all objects. - A null label selector matches no objects. - properties: - matchExpressions: - description: matchExpressions is a list of label selector - requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector - that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key that the selector - applies to. - type: string - operator: - description: operator represents a key's relationship - to a set of values. Valid operators are In, NotIn, - Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. - If the operator is In or NotIn, the values array - must be non-empty. If the operator is Exists or - DoesNotExist, the values array must be empty. - This array is replaced during a strategic merge - patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. - A single {key,value} in the matchLabels map is equivalent - to an element of matchExpressions, whose key field is - "key", the operator is "In", and the values array contains - only "value". 
The requirements are ANDed. - type: object - type: object - x-kubernetes-map-type: atomic - clusters: - items: - properties: - name: - type: string - required: - - name - type: object - type: array - type: object dispatchDuration: description: Wall clock duration time of appwrapper in seconds. properties: @@ -111,33 +50,6 @@ spec: overrun: type: boolean type: object - dispatchingWindow: - properties: - end: - properties: - desiredTimestamp: - format: date-time - type: string - maxTimestamp: - format: date-time - type: string - minTimestamp: - format: date-time - type: string - type: object - start: - properties: - desiredTimestamp: - format: date-time - type: string - maxTimestamp: - format: date-time - type: string - minTimestamp: - format: date-time - type: string - type: object - type: object minAvailable: description: Expected number of pods in running and/or completed state. Requeuing is triggered when the number of running/completed diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml index b96fe7d70..e91a18ffd 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_schedulingspecs.yaml @@ -33,66 +33,6 @@ spec: type: object spec: properties: - clusterScheduling: - properties: - clusterSelector: - description: A label selector is a label query over a set of resources. - The result of matchLabels and matchExpressions are ANDed. An - empty label selector matches all objects. A null label selector - matches no objects. - properties: - matchExpressions: - description: matchExpressions is a list of label selector - requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector - that contains values, a key, and an operator that relates - the key and values. - properties: - key: - description: key is the label key that the selector - applies to. 
- type: string - operator: - description: operator represents a key's relationship - to a set of values. Valid operators are In, NotIn, - Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If - the operator is In or NotIn, the values array must - be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced - during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A - single {key,value} in the matchLabels map is equivalent - to an element of matchExpressions, whose key field is "key", - the operator is "In", and the values array contains only - "value". The requirements are ANDed. - type: object - type: object - x-kubernetes-map-type: atomic - clusters: - items: - properties: - name: - type: string - required: - - name - type: object - type: array - type: object dispatchDuration: description: Wall clock duration time of appwrapper in seconds. properties: @@ -103,33 +43,6 @@ spec: overrun: type: boolean type: object - dispatchingWindow: - properties: - end: - properties: - desiredTimestamp: - format: date-time - type: string - maxTimestamp: - format: date-time - type: string - minTimestamp: - format: date-time - type: string - type: object - start: - properties: - desiredTimestamp: - format: date-time - type: string - maxTimestamp: - format: date-time - type: string - minTimestamp: - format: date-time - type: string - type: object - type: object minAvailable: description: Expected number of pods in running and/or completed state. 
Requeuing is triggered when the number of running/completed pods diff --git a/pkg/apis/controller/v1beta1/schedulingspec.go b/pkg/apis/controller/v1beta1/schedulingspec.go index 01dba300f..90f5d65fb 100644 --- a/pkg/apis/controller/v1beta1/schedulingspec.go +++ b/pkg/apis/controller/v1beta1/schedulingspec.go @@ -40,9 +40,7 @@ type SchedulingSpecTemplate struct { // Specification of the requeuing strategy based on waiting time. // Values in this field control how often the pod check should happen, // and if requeuing has reached its maximum number of times. - Requeuing RequeuingTemplate `json:"requeuing,omitempty" protobuf:"bytes,1,rep,name=requeuing"` - ClusterScheduling ClusterSchedulingSpec `json:"clusterScheduling,omitempty"` - DispatchingWindow DispatchingWindowSpec `json:"dispatchingWindow,omitempty"` + Requeuing RequeuingTemplate `json:"requeuing,omitempty" protobuf:"bytes,1,rep,name=requeuing"` // Wall clock duration time of appwrapper in seconds. DispatchDuration DispatchDurationSpec `json:"dispatchDuration,omitempty"` } From ab4cd67646df871d16ccf109b2d4ed1a2b87a2f1 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Mon, 3 Jul 2023 18:11:14 +0200 Subject: [PATCH 04/10] api: Regen deepcopy --- pkg/apis/controller/v1beta1/zz_generated.deepcopy.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pkg/apis/controller/v1beta1/zz_generated.deepcopy.go b/pkg/apis/controller/v1beta1/zz_generated.deepcopy.go index 7af3a4dbf..632be6ece 100644 --- a/pkg/apis/controller/v1beta1/zz_generated.deepcopy.go +++ b/pkg/apis/controller/v1beta1/zz_generated.deepcopy.go @@ -91,8 +91,6 @@ func (in *AppWrapperCondition) DeepCopy() *AppWrapperCondition { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *AppWrapperGenericResource) DeepCopyInto(out *AppWrapperGenericResource) { *out = *in - out.TypeMeta = in.TypeMeta - in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) if in.MinAvailable != nil { in, out := &in.MinAvailable, &out.MinAvailable *out = new(int32) @@ -155,8 +153,6 @@ func (in *AppWrapperList) DeepCopyObject() runtime.Object { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AppWrapperResource) DeepCopyInto(out *AppWrapperResource) { *out = *in - out.TypeMeta = in.TypeMeta - in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) if in.MinAvailable != nil { in, out := &in.MinAvailable, &out.MinAvailable *out = new(int32) @@ -617,8 +613,6 @@ func (in *SchedulingSpecTemplate) DeepCopyInto(out *SchedulingSpecTemplate) { } } out.Requeuing = in.Requeuing - in.ClusterScheduling.DeepCopyInto(&out.ClusterScheduling) - in.DispatchingWindow.DeepCopyInto(&out.DispatchingWindow) out.DispatchDuration = in.DispatchDuration return } From 58d96339d97f7fa7780ee4d49eaa44d4afb9c960 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Tue, 4 Jul 2023 10:07:38 +0200 Subject: [PATCH 05/10] api: Align APIs with CRDs to use integers instead of floats --- .../crd/bases/mcad.ibm.com_appwrappers.yaml | 4 +- config/crd/bases/mcad.ibm.com_queuejobs.yaml | 3 +- .../crds/mcad.ibm.com_appwrappers.yaml | 4 +- .../crds/mcad.ibm.com_queuejobs.yaml | 3 +- pkg/apis/controller/v1beta1/appwrapper.go | 4 +- pkg/apis/controller/v1beta1/queuejob.go | 18 +-- .../clusterstate/api/histogram_info.go | 51 +++---- .../clusterstate/api/resource_info.go | 45 ++---- .../metrics/adapter/provider/provider.go | 30 +--- .../queuejob/queuejob_controller_ex.go | 137 +++++++++--------- .../queuejobdispatch/queuejobagent.go | 16 +- .../queuejobresources/configmap/configmap.go | 17 +-- .../deployment/deployment.go | 32 ++-- .../genericresource/genericresource.go | 54 ++++--- .../queuejobresources/interfaces.go | 22 +-- 
.../queuejobresources/namespace/namespace.go | 19 ++- .../networkpolicy/networkpolicy.go | 17 +-- .../persistentvolume/persistentvolume.go | 19 ++- .../persistentvolumeclaim.go | 17 +-- pkg/controller/queuejobresources/pod/pod.go | 37 +++-- .../queuejobresources/secret/secret.go | 17 +-- .../queuejobresources/service/service.go | 30 ++-- .../statefulset/statefulset.go | 29 ++-- .../qm_lib_backend_with_quotasubt_mgr.go | 20 +-- .../quota-simple-rest/quota_rest_manager.go | 18 +-- 25 files changed, 280 insertions(+), 383 deletions(-) diff --git a/config/crd/bases/mcad.ibm.com_appwrappers.yaml b/config/crd/bases/mcad.ibm.com_appwrappers.yaml index 5ef4218ce..76a82df98 100644 --- a/config/crd/bases/mcad.ibm.com_appwrappers.yaml +++ b/config/crd/bases/mcad.ibm.com_appwrappers.yaml @@ -820,7 +820,7 @@ spec: totalcpu: description: The number of CPU consumed by all pods belonging to the AppWrapper. - format: int32 + format: int64 type: integer totalgpu: description: The total number of GPUs consumed by all pods belonging @@ -830,7 +830,7 @@ spec: totalmemory: description: The amount of memory consumed by all pods belonging to the AppWrapper. 
- format: int32 + format: int64 type: integer type: object required: diff --git a/config/crd/bases/mcad.ibm.com_queuejobs.yaml b/config/crd/bases/mcad.ibm.com_queuejobs.yaml index 9a7344f68..486f7e2d2 100644 --- a/config/crd/bases/mcad.ibm.com_queuejobs.yaml +++ b/config/crd/bases/mcad.ibm.com_queuejobs.yaml @@ -32,8 +32,7 @@ spec: metadata: type: object spec: - description: Specification of the desired behavior of a cron job, including - the minAvailable + description: Specification of the desired behavior of a cron job properties: schedulerName: type: string diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml index 5ef4218ce..76a82df98 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml @@ -820,7 +820,7 @@ spec: totalcpu: description: The number of CPU consumed by all pods belonging to the AppWrapper. - format: int32 + format: int64 type: integer totalgpu: description: The total number of GPUs consumed by all pods belonging @@ -830,7 +830,7 @@ spec: totalmemory: description: The amount of memory consumed by all pods belonging to the AppWrapper. 
- format: int32 + format: int64 type: integer type: object required: diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml index 9a7344f68..486f7e2d2 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml @@ -32,8 +32,7 @@ spec: metadata: type: object spec: - description: Specification of the desired behavior of a cron job, including - the minAvailable + description: Specification of the desired behavior of a cron job properties: schedulerName: type: string diff --git a/pkg/apis/controller/v1beta1/appwrapper.go b/pkg/apis/controller/v1beta1/appwrapper.go index 4337378f1..c97ab82c1 100644 --- a/pkg/apis/controller/v1beta1/appwrapper.go +++ b/pkg/apis/controller/v1beta1/appwrapper.go @@ -254,10 +254,10 @@ type AppWrapperStatus struct { // Resources consumed // The number of CPU consumed by all pods belonging to the AppWrapper. - TotalCPU int32 `json:"totalcpu,omitempty"` + TotalCPU int64 `json:"totalcpu,omitempty"` // The amount of memory consumed by all pods belonging to the AppWrapper. - TotalMemory int32 `json:"totalmemory,omitempty"` + TotalMemory int64 `json:"totalmemory,omitempty"` // The total number of GPUs consumed by all pods belonging to the AppWrapper. TotalGPU int64 `json:"totalgpu,omitempty"` diff --git a/pkg/apis/controller/v1beta1/queuejob.go b/pkg/apis/controller/v1beta1/queuejob.go index 872809f07..cdf681171 100644 --- a/pkg/apis/controller/v1beta1/queuejob.go +++ b/pkg/apis/controller/v1beta1/queuejob.go @@ -1,19 +1,4 @@ /* -Copyright 2018 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + package v1beta1 import ( @@ -43,7 +29,7 @@ type QueueJob struct { metav1.ObjectMeta `json:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"` - // Specification of the desired behavior of a cron job, including the minAvailable + // Specification of the desired behavior of a cron job Spec QueueJobSpec `json:"spec,omitempty" protobuf:"bytes,2,opt,name=spec"` // Current status of QueueJob diff --git a/pkg/controller/clusterstate/api/histogram_info.go b/pkg/controller/clusterstate/api/histogram_info.go index 31d3994a7..10640e43f 100644 --- a/pkg/controller/clusterstate/api/histogram_info.go +++ b/pkg/controller/clusterstate/api/histogram_info.go @@ -13,19 +13,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ + package api import ( - "github.com/prometheus/client_golang/prometheus" - "math" - "k8s.io/klog/v2" + + "github.com/prometheus/client_golang/prometheus" ) const ( - BucketCount = 20 //Must be > 0 - tolerance = 0.1 + BucketCount = 20 // Must be > 0 ) + type ResourceHistogram struct { MilliCPU *prometheus.Histogram Memory *prometheus.Histogram @@ -33,51 +33,50 @@ type ResourceHistogram struct { } func NewResourceHistogram(min *Resource, max *Resource) *ResourceHistogram { - start := max.MilliCPU width := 1.0 count := 2 - diff := math.Abs(min.MilliCPU - max.MilliCPU) - if diff >= tolerance { + diff := max.MilliCPU - min.MilliCPU + if diff > 0 { start = min.MilliCPU - width = (diff/(BucketCount - 1)) + width = float64(diff) / (BucketCount - 1) count = BucketCount + 1 } klog.V(10).Infof("[NewResourceHistogram] Start histogram numbers for CPU: start=%f, width=%f, count=%d", start, width, count) millicpuHist := prometheus.NewHistogram(prometheus.HistogramOpts{ - Name: "millicpu", - Buckets: prometheus.LinearBuckets(start, width, count),}) + Name: "millicpu", + Buckets: prometheus.LinearBuckets(float64(start), width, count)}) start = max.Memory width = 1.0 count = 2 - diff = math.Abs(min.Memory - max.Memory) - if diff >= tolerance { + diff = max.Memory - min.Memory + if diff > 0 { start = min.Memory - width = (diff/(BucketCount - 1)) + width = float64(diff) / (BucketCount - 1) count = BucketCount + 1 } klog.V(10).Infof("[NewResourceHistogram] Start histogram numbers for Memory: start=%f, width=%f, count=%d", start, width, count) memoryHist := prometheus.NewHistogram(prometheus.HistogramOpts{ - Name: "memory", - Buckets: prometheus.LinearBuckets(start, width, count),}) + Name: "memory", + Buckets: prometheus.LinearBuckets(float64(start), width, count)}) - start = float64(max.GPU) + start = max.GPU width = 1.0 count = 2 - diff = math.Abs(float64(min.GPU - max.GPU)) - if diff >= tolerance { - start = float64(min.GPU) - width = (diff/(BucketCount - 1)) + diff = 
max.GPU - min.GPU
+	if diff > 0 {
+		start = min.GPU
+		width = float64(diff) / (BucketCount - 1)
 		count = BucketCount + 1
 	}
 	klog.V(10).Infof("[NewResourceHistogram] Start histogram numbers for GPU: start=%f, width=%f, count=%d",
 		start, width, count)
 	gpuHist := prometheus.NewHistogram(prometheus.HistogramOpts{
-		Name: "gpu",
-		Buckets: prometheus.LinearBuckets(start, width, count),})
+		Name:    "gpu",
+		Buckets: prometheus.LinearBuckets(float64(start), width, count)})
 
 	rh := &ResourceHistogram{
 		MilliCPU: &millicpuHist,
@@ -88,9 +87,7 @@ func NewResourceHistogram(min *Resource, max *Resource) *ResourceHistogram {
 }
 
 func (rh *ResourceHistogram) Observer(r *Resource) {
-	(*rh.MilliCPU).Observe(r.MilliCPU)
-	(*rh.Memory).Observe(r.Memory)
+	(*rh.MilliCPU).Observe(float64(r.MilliCPU))
+	(*rh.Memory).Observe(float64(r.Memory))
 	(*rh.GPU).Observe(float64(r.GPU))
 }
-
-
diff --git a/pkg/controller/clusterstate/api/resource_info.go b/pkg/controller/clusterstate/api/resource_info.go
index e38374ce2..6c7bbc906 100644
--- a/pkg/controller/clusterstate/api/resource_info.go
+++ b/pkg/controller/clusterstate/api/resource_info.go
@@ -1,19 +1,4 @@
 /*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-/*
 Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors.
 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -28,18 +13,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
 limitations under the License.
 */
+
 package api
 
 import (
 	"fmt"
-	"math"
 
 	v1 "k8s.io/api/core/v1"
 )
 
 type Resource struct {
-	MilliCPU float64
-	Memory   float64
+	MilliCPU int64
+	Memory   int64
 	GPU      int64
 }
 
@@ -65,17 +50,17 @@ func (r *Resource) Clone() *Resource {
 	return clone
 }
 
-var minMilliCPU float64 = 10
-var minMemory float64 = 10 * 1024 * 1024
+var minMilliCPU int64 = 10
+var minMemory int64 = 10 * 1024 * 1024
 
 func NewResource(rl v1.ResourceList) *Resource {
 	r := EmptyResource()
 	for rName, rQuant := range rl {
 		switch rName {
 		case v1.ResourceCPU:
-			r.MilliCPU += float64(rQuant.MilliValue())
+			r.MilliCPU += rQuant.MilliValue()
 		case v1.ResourceMemory:
-			r.Memory += float64(rQuant.Value())
+			r.Memory += rQuant.Value()
 		case GPUResourceName:
 			q, _ := rQuant.AsInt64()
 			r.GPU += q
@@ -116,12 +101,12 @@ func (r *Resource) Replace(rr *Resource) *Resource {
 	return r
 }
 
-//Sub subtracts two Resource objects.
+// Sub subtracts two Resource objects.
 func (r *Resource) Sub(rr *Resource) (*Resource, error) {
 	return r.NonNegSub(rr)
 }
 
-//Sub subtracts two Resource objects and return zero for negative subtractions.
+// NonNegSub subtracts two Resource objects and returns zero for negative subtractions.
func (r *Resource) NonNegSub(rr *Resource) (*Resource, error) {
 	// Check for negative calculation
 	var isNegative bool
@@ -164,24 +149,24 @@ func (r *Resource) Less(rr *Resource) bool {
 }
 
 func (r *Resource) LessEqual(rr *Resource) bool {
-	return (r.MilliCPU < rr.MilliCPU || math.Abs(rr.MilliCPU-r.MilliCPU) < 0.01) &&
-		(r.Memory < rr.Memory || math.Abs(rr.Memory-r.Memory) < 1) &&
-		(r.GPU <= rr.GPU)
+	return r.MilliCPU <= rr.MilliCPU &&
+		r.Memory <= rr.Memory &&
+		r.GPU <= rr.GPU
 }
 
 func (r *Resource) String() string {
-	return fmt.Sprintf("cpu %0.2f, memory %0.2f, GPU %d",
+	return fmt.Sprintf("cpu %d, memory %d, GPU %d",
 		r.MilliCPU, r.Memory, r.GPU)
 }
 
-func (r *Resource) Get(rn v1.ResourceName) (float64, error) {
+func (r *Resource) Get(rn v1.ResourceName) (int64, error) {
 	switch rn {
 	case v1.ResourceCPU:
 		return r.MilliCPU, nil
 	case v1.ResourceMemory:
 		return r.Memory, nil
 	case GPUResourceName:
-		return float64(r.GPU), nil
+		return r.GPU, nil
 	default:
 		err := fmt.Errorf("resource not supported %v", rn)
 		return 0.0, err
diff --git a/pkg/controller/metrics/adapter/provider/provider.go b/pkg/controller/metrics/adapter/provider/provider.go
index 6909471b3..874b67f0b 100644
--- a/pkg/controller/metrics/adapter/provider/provider.go
+++ b/pkg/controller/metrics/adapter/provider/provider.go
@@ -1,19 +1,4 @@
 /*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-/*
 Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors.
Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + package provider import ( @@ -37,7 +23,6 @@ import ( "time" "github.com/emicklei/go-restful" - "k8s.io/klog/v2" apierr "k8s.io/apimachinery/pkg/api/errors" apimeta "k8s.io/apimachinery/pkg/api/meta" @@ -47,12 +32,13 @@ import ( "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/dynamic" + "k8s.io/klog/v2" "k8s.io/metrics/pkg/apis/custom_metrics" "k8s.io/metrics/pkg/apis/external_metrics" - clusterstatecache "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/cache" "github.com/kubernetes-sigs/custom-metrics-apiserver/pkg/provider" "github.com/kubernetes-sigs/custom-metrics-apiserver/pkg/provider/helpers" + clusterstatecache "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/cache" ) // CustomMetricResource wraps provider.CustomMetricInfo in a struct which stores the Name and Namespace of the resource @@ -157,7 +143,7 @@ func NewFakeProvider(client dynamic.Interface, mapper apimeta.RESTMapper, cluste // webService creates a restful.WebService with routes set up for receiving fake metrics // These writing routes have been set up to be identical to the format of routes which metrics are read from. // There are 3 metric types available: namespaced, root-scoped, and namespaces. 
-// (Note: Namespaces, we're assuming, are themselves namespaced resources, but for consistency with how metrics are retreived they have a separate route) +// (Note: Namespaces, we're assuming, are themselves namespaced resources, but for consistency with how metrics are retrieved they have a separate route) func (p *clusterMetricsProvider) webService() *restful.WebService { klog.V(10).Infof("Entered webService()") ws := new(restful.WebService) @@ -360,7 +346,7 @@ func (p *clusterMetricsProvider) GetExternalMetric(namespace string, metricSelec p.valuesLock.RLock() defer p.valuesLock.RUnlock() - matchingMetrics := []external_metrics.ExternalMetricValue{} + var matchingMetrics []external_metrics.ExternalMetricValue for _, metric := range p.externalMetrics { klog.V(9).Infof("[GetExternalMetric] externalMetricsInfo: %s, externalMetricValue: %v, externalMetricLabels: %v ", metric.info.Metric, metric.Value, metric.labels) @@ -374,15 +360,15 @@ func (p *clusterMetricsProvider) GetExternalMetric(namespace string, metricSelec klog.V(9).Infof("[GetExternalMetric] Cache resources: %v", resources) klog.V(10).Infof("[GetExternalMetric] Setting memory metric Value: %f.", resources.Memory) - metricValue.Value = *resource.NewQuantity(int64(resources.Memory), resource.DecimalSI) - //metricValue.Value = *resource.NewQuantity(4500000000, resource.DecimalSI) + metricValue.Value = *resource.NewQuantity(resources.Memory, resource.DecimalSI) + // metricValue.Value = *resource.NewQuantity(4500000000, resource.DecimalSI) } else if strings.Compare(labelVal, "cpu") == 0 { // Set cpu Value resources := p.cache2.GetUnallocatedResources() klog.V(9).Infof("[GetExternalMetric] Cache resources: %f", resources) klog.V(10).Infof("[GetExternalMetric] Setting cpu metric Value: %v.", resources.MilliCPU) - metricValue.Value = *resource.NewQuantity(int64(resources.MilliCPU), resource.DecimalSI) + metricValue.Value = *resource.NewQuantity(resources.MilliCPU, resource.DecimalSI) } else if 
strings.Compare(labelVal, "gpu") == 0 { // Set gpu Value resources := p.cache2.GetUnallocatedResources() diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index d6489205f..271b3c42c 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -31,6 +31,7 @@ limitations under the License. package queuejob import ( + jsons "encoding/json" "fmt" "math" "math/rand" @@ -91,8 +92,6 @@ import ( "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobdispatch" - jsons "encoding/json" - clusterstateapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" clusterstatecache "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/cache" ) @@ -139,7 +138,7 @@ type XController struct { // eventQueue that need to sync up eventQueue *cache.FIFO - //QJ queue that needs to be allocated + // QJ queue that needs to be allocated qjqueue SchedulingQueue // our own local cache, used for computing total amount of resources @@ -236,7 +235,7 @@ func NewJobController(config *rest.Config, serverOption *options.ServerOption) * cc.qjobResControls = map[arbv1.ResourceType]queuejobresources.Interface{} RegisterAllQueueJobResourceTypes(&cc.qjobRegisteredResources) - //initialize pod sub-resource control + // initialize pod sub-resource control resControlPod, found, err := cc.qjobRegisteredResources.InitQueueJobResource(arbv1.ResourceTypePod, config) if err != nil { klog.Errorf("fail to create queuejob resource control") @@ -363,10 +362,10 @@ func NewJobController(config *rest.Config, serverOption *options.ServerOption) * case *arbv1.AppWrapper: klog.V(10).Infof("[Informer] Filter Name=%s Version=%s Local=%t FilterIgnore=%t Sender=%s &qj=%p qj=%+v", t.Name, t.ResourceVersion, t.Status.Local, t.Status.FilterIgnore, t.Status.Sender, t, t) // todo: This is a current 
workaround for duplicate message bug. - //if t.Status.Local == true { // ignore duplicate message from cache + // if t.Status.Local == true { // ignore duplicate message from cache // return false - //} - //t.Status.Local = true // another copy of this will be recognized as duplicate + // } + // t.Status.Local = true // another copy of this will be recognized as duplicate return true // return !t.Status.FilterIgnore // ignore update messages default: @@ -382,7 +381,7 @@ func NewJobController(config *rest.Config, serverOption *options.ServerOption) * cc.queueJobLister = cc.queueJobInformer.Lister() cc.queueJobSynced = cc.queueJobInformer.Informer().HasSynced - //create sub-resource reference manager + // create sub-resource reference manager cc.refManager = queuejobresources.NewLabelRefManager() // Setup Quota @@ -406,7 +405,7 @@ func NewJobController(config *rest.Config, serverOption *options.ServerOption) * klog.Infof("[Controller] Agent mode") } - //create agents and agentMap + // create agents and agentMap cc.agentMap = map[string]*queuejobdispatch.JobClusterAgent{} cc.agentList = []string{} for _, agentconfig := range strings.Split(serverOption.AgentConfigs, ",") { @@ -420,7 +419,7 @@ func NewJobController(config *rest.Config, serverOption *options.ServerOption) * return nil } - //create (empty) dispatchMap + // create (empty) dispatchMap cc.dispatchMap = map[string]string{} return cc @@ -441,7 +440,7 @@ func (qjm *XController) PreemptQueueJobs() { } newjob.Status.CanRun = false cleanAppWrapper := false - //If dispatch deadline is exceeded no matter what the state of AW, kill the job and set status as Failed. + // If dispatch deadline is exceeded no matter what the state of AW, kill the job and set status as Failed. 
if (aw.Status.State == arbv1.AppWrapperStateActive) && (aw.Spec.SchedSpec.DispatchDuration.Limit > 0) { if aw.Spec.SchedSpec.DispatchDuration.Overrun { index := getIndexOfMatchedCondition(aw, arbv1.AppWrapperCondPreemptCandidate, "DispatchDeadlineExceeded") @@ -453,7 +452,7 @@ func (qjm *XController) PreemptQueueJobs() { cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondPreemptCandidate, v1.ConditionTrue, "DispatchDeadlineExceeded", "") newjob.Status.Conditions[index] = *cond.DeepCopy() } - //should the AW state be set in this method?? + // should the AW state be set in this method?? newjob.Status.State = arbv1.AppWrapperStateFailed newjob.Status.QueueJobState = arbv1.AppWrapperCondFailed newjob.Status.Running = 0 @@ -461,10 +460,10 @@ func (qjm *XController) PreemptQueueJobs() { if err := qjm.updateEtcd(updateNewJob, "PreemptQueueJobs - CanRun: false"); err != nil { klog.Errorf("Failed to update status of AppWrapper %v/%v: %v", aw.Namespace, aw.Name, err) } - //cannot use cleanup AW, since it puts AW back in running state + // cannot use cleanup AW, since it puts AW back in running state go qjm.qjqueue.AddUnschedulableIfNotPresent(aw) - //Move to next AW + // Move to next AW continue } } @@ -501,11 +500,11 @@ func (qjm *XController) PreemptQueueJobs() { updateNewJob = newjob.DeepCopy() } else { - //If pods failed scheduling generate new preempt condition + // If pods failed scheduling generate new preempt condition message = fmt.Sprintf("Pods failed scheduling failed=%v, running=%v.", len(aw.Status.PendingPodConditions), aw.Status.Running) index := getIndexOfMatchedCondition(newjob, arbv1.AppWrapperCondPreemptCandidate, "PodsFailedScheduling") - //ignore co-scheduler failed scheduling events. This is a temp - //work around until co-scheduler version 0.22.X perf issues are resolved. + // ignore co-scheduler failed scheduling events. This is a temp + // work around until co-scheduler version 0.22.X perf issues are resolved. 
if index < 0 { cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondPreemptCandidate, v1.ConditionTrue, "PodsFailedScheduling", message) newjob.Status.Conditions = append(newjob.Status.Conditions, cond) @@ -524,7 +523,7 @@ func (qjm *XController) PreemptQueueJobs() { go qjm.Cleanup(aw) } else { klog.V(4).Infof("[PreemptQueueJobs] Adding preempted AppWrapper %s/%s to backoff queue.", aw.Name, aw.Namespace) - //Only back-off AWs that are in state running and not in state Failed + // Only back-off AWs that are in state running and not in state Failed if updateNewJob.Status.State != arbv1.AppWrapperStateFailed { go qjm.backoff(aw, "PreemptionTriggered", string(message)) } @@ -579,7 +578,7 @@ func (qjm *XController) GetQueueJobsEligibleForPreemption() []*arbv1.AppWrapper klog.V(8).Infof("Appwrapper Dispatch limit exceeded, currentTime %v, dispatchTimeInSeconds %v", currentTime, dispatchDuration) value.Spec.SchedSpec.DispatchDuration.Overrun = true qjobs = append(qjobs, value) - //Got AW which exceeded dispatch runtime limit, move to next AW + // Got AW which exceeded dispatch runtime limit, move to next AW continue } } @@ -693,7 +692,7 @@ func (qjm *XController) getAppWrapperCompletionStatus(caw *arbv1.AppWrapper) arb if err := jsons.Unmarshal(objectName.Raw, &blob); err != nil { klog.Errorf("[getAppWrapperCompletionStatus] Error unmarshalling, err=%#v", err) } - unstruct.Object = blob.(map[string]interface{}) //set object to the content of the blob after Unmarshalling + unstruct.Object = blob.(map[string]interface{}) // set object to the content of the blob after Unmarshalling name := "" if md, ok := unstruct.Object["metadata"]; ok { metadata := md.(map[string]interface{}) @@ -708,18 +707,18 @@ func (qjm *XController) getAppWrapperCompletionStatus(caw *arbv1.AppWrapper) arb status := qjm.genericresources.IsItemCompleted(&genericItem, caw.Namespace, caw.Name, name) if !status { - //early termination because a required item is not completed + // early termination 
because a required item is not completed return caw.Status.State } - //only consider count completion required for valid items + // only consider count completion required for valid items countCompletionRequired = countCompletionRequired + 1 } } klog.V(4).Infof("[getAppWrapperCompletionStatus] countCompletionRequired %v, podsRunning %v, podsPending %v", countCompletionRequired, caw.Status.Running, caw.Status.Pending) - //Set new status only when completion required flag is present in genericitems array + // Set new status only when completion required flag is present in genericitems array if countCompletionRequired > 0 { if caw.Status.Running == 0 && caw.Status.Pending == 0 { return arbv1.AppWrapperStateCompleted @@ -729,12 +728,12 @@ func (qjm *XController) getAppWrapperCompletionStatus(caw *arbv1.AppWrapper) arb return arbv1.AppWrapperStateRunningHoldCompletion } } - //return previous condition + // return previous condition return caw.Status.State } func (qjm *XController) GetAggregatedResources(cqj *arbv1.AppWrapper) *clusterstateapi.Resource { - //todo: deprecate resource controllers + // todo: deprecate resource controllers allocated := clusterstateapi.EmptyResource() for _, resctrl := range qjm.qjobResControls { qjv := resctrl.GetAggregatedResources(cqj) @@ -774,7 +773,7 @@ func (qjm *XController) getProposedPreemptions(requestingJob *arbv1.AppWrapper, klog.V(10).Infof("[getProposedPreemptions] Processing %v candidate jobs for preemption.", len(preemptableAWs)) } - //Sort keys of map + // Sort keys of map priorityKeyValues := make([]float64, len(preemptableAWs)) i := 0 for key, _ := range preemptableAWs { @@ -870,7 +869,7 @@ func (qjm *XController) getDispatchedAppWrappers() (map[string]*clusterstateapi. 
return awrRetVal, awsRetVal } -func (qjm *XController) addTotalSnapshotResourcesConsumedByAw(totalgpu int64, totalcpu float64, totalmemory float64) *clusterstateapi.Resource { +func (qjm *XController) addTotalSnapshotResourcesConsumedByAw(totalgpu int64, totalcpu int64, totalmemory int64) *clusterstateapi.Resource { totalResource := clusterstateapi.EmptyResource() totalResource.GPU = totalgpu @@ -962,7 +961,7 @@ func (qjm *XController) getAggregatedAvailableResourcesPriority(unallocatedClust pending = pending.Add(delta) continue } else { - //Do nothing + // Do nothing } } @@ -992,7 +991,7 @@ func (qjm *XController) chooseAgent(qj *arbv1.AppWrapper) string { if qjAggrResources.LessEqual(resources) { klog.V(2).Infof("[chooseAgent] Agent %s has enough resources\n", agentId) - //Now evaluate quota + // Now evaluate quota if qjm.serverOption.QuotaEnabled { if qjm.quotaManager != nil { if fits, preemptAWs, _ := qjm.quotaManager.Fits(qj, qjAggrResources, proposedPreemptions); fits { @@ -1006,7 +1005,7 @@ func (qjm *XController) chooseAgent(qj *arbv1.AppWrapper) string { klog.Errorf("[chooseAgent] Quota evaluation is enable but not initialize. 
AppWrapper %s/%s does not have enough quota\n", qj.Name, qj.Namespace) } } else { - //Quota is not enabled to return selected agent + // Quota is not enabled to return selected agent return agentId } } else { @@ -1175,7 +1174,7 @@ func (qjm *XController) ScheduleNext() { apiQueueJob.DeepCopyInto(qj) } - //apiQueueJob.Status.CanRun = true + // apiQueueJob.Status.CanRun = true qj.Status.CanRun = true queueJobKey, _ := GetQueueJobKey(qj) qjm.dispatchMap[queueJobKey] = agentId @@ -1191,9 +1190,9 @@ func (qjm *XController) ScheduleNext() { qjm.qjqueue.Delete(qj) } - //if _, err := qjm.arbclients.ArbV1().AppWrappers(qj.Namespace).Update(apiQueueJob); err != nil { + // if _, err := qjm.arbclients.ArbV1().AppWrappers(qj.Namespace).Update(apiQueueJob); err != nil { // klog.Errorf("Failed to update status of AppWrapper %v/%v: %v", qj.Namespace, qj.Name, err) - //} + // } klog.V(10).Infof("[TTime] %s, %s: ScheduleNextAfterEtcd", qj.Name, time.Now().Sub(qj.CreationTimestamp.Time)) return } else { @@ -1291,7 +1290,7 @@ func (qjm *XController) ScheduleNext() { fits = quotaFits } else { fits = false - //Quota manager not initialized + // Quota manager not initialized dispatchFailedMessage = "Quota evaluation is enabled but not initialized. Insufficient quota to dispatch AppWrapper." klog.Errorf("[ScheduleNext] Quota evaluation is enabled but not initialized. AppWrapper %s/%s does not have enough quota\n", qj.Name, qj.Namespace) } @@ -1307,7 +1306,7 @@ func (qjm *XController) ScheduleNext() { if e != nil { klog.Errorf("[ScheduleNext] Unable to get AW %s from API cache &aw=%p Version=%s Status=%+v err=%#v", qj.Name, qj, qj.ResourceVersion, qj.Status, err) if qjm.quotaManager != nil && quotaFits { - //Quota was allocated for this appwrapper, release it. + // Quota was allocated for this appwrapper, release it. 
qjm.quotaManager.Release(qj) } return @@ -1336,8 +1335,8 @@ func (qjm *XController) ScheduleNext() { forwarded = true klog.V(4).Infof("[ScheduleNext] %s Delay=%.6f seconds eventQueue.Add_afterHeadOfLine activeQ=%t, Unsched=%t &aw=%p Version=%s Status=%+v", qj.Name, time.Now().Sub(qj.Status.ControllerFirstTimestamp.Time).Seconds(), qjm.qjqueue.IfExistActiveQ(qj), qjm.qjqueue.IfExistUnschedulableQ(qj), qj, qj.ResourceVersion, qj.Status) } - } //updateEtcd - } //fits + } // updateEtcd + } // fits } else { // Not enough free resources to dispatch HOL dispatchFailedMessage = "Insufficient resources to dispatch AppWrapper." klog.V(4).Infof("[ScheduleNext] HOL Blocking by %s for %s activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v", qj.Name, time.Now().Sub(HOLStartTime), qjm.qjqueue.IfExistActiveQ(qj), qjm.qjqueue.IfExistUnschedulableQ(qj), qj, qj.ResourceVersion, qj.Status) @@ -1349,13 +1348,13 @@ func (qjm *XController) ScheduleNext() { // stop trying to dispatch after HeadOfLineHoldingTime // release quota if allocated if qjm.quotaManager != nil && quotaFits { - //Quota was allocated for this appwrapper, release it. + // Quota was allocated for this appwrapper, release it. qjm.quotaManager.Release(qj) } break } else { // Try to dispatch again after one second if qjm.quotaManager != nil && quotaFits { - //release any quota as the qj will be tried again and the quota might have been allocated. + // release any quota as the qj will be tried again and the quota might have been allocated. qjm.quotaManager.Release(qj) } time.Sleep(time.Second * 1) @@ -1375,17 +1374,17 @@ func (qjm *XController) ScheduleNext() { // Update AppWrappers in etcd // todo: This is a current workaround for duplicate message bug. 
func (cc *XController) updateEtcd(qj *arbv1.AppWrapper, at string) error { - //apiCacheAWJob, e := cc.queueJobLister.AppWrappers(qj.Namespace).Get(qj.Name) + // apiCacheAWJob, e := cc.queueJobLister.AppWrappers(qj.Namespace).Get(qj.Name) // - //if (e != nil) { + // if (e != nil) { // klog.Errorf("[updateEtcd] Failed to update status of AppWrapper %s, namespace: %s at %s err=%v", // apiCacheAWJob.Name, apiCacheAWJob.Namespace, at, e) // return e - //} + // } - //TODO: Remove next line + // TODO: Remove next line var apiCacheAWJob *arbv1.AppWrapper - //TODO: Remove next line + // TODO: Remove next line apiCacheAWJob = qj apiCacheAWJob.Status.Sender = "before " + at // set Sender string to indicate code location apiCacheAWJob.Status.Local = false // for Informer FilterFunc to pickup @@ -1398,8 +1397,8 @@ func (cc *XController) updateEtcd(qj *arbv1.AppWrapper, at string) error { } klog.V(10).Infof("[updateEtcd] AppWrapperUpdate success %s at %s &qj=%p qj=%+v", apiCacheAWJob.Name, at, apiCacheAWJob, apiCacheAWJob) - //qj.Status.Local = true // for Informer FilterFunc to ignore duplicate - //qj.Status.Sender = "after "+ at // set Sender string to indicate code location + // qj.Status.Local = true // for Informer FilterFunc to ignore duplicate + // qj.Status.Sender = "after "+ at // set Sender string to indicate code location return nil } @@ -1465,7 +1464,7 @@ func (qjm *XController) getLatestStatusConditionType(aw *arbv1.AppWrapper, condT for _, condition := range aw.Status.Conditions { // Matching condition? if condition.Type == condType { - //First time match? + // First time match? 
if (arbv1.AppWrapperCondition{} == latestConditionBasedOnType) { latestConditionBasedOnType = condition } else { @@ -1527,7 +1526,7 @@ func (qjm *XController) backoff(q *arbv1.AppWrapper, reason string, message stri apiCacheAWJob.Status.QueueJobState = arbv1.AppWrapperCondBackoff workingAW.Status.FilterIgnore = true // update QueueJobState only, no work needed qjm.addOrUpdateCondition(workingAW, arbv1.AppWrapperCondBackoff, v1.ConditionTrue, reason, message) - //qjm.updateEtcd(workingAW, "backoff - Rejoining") + // qjm.updateEtcd(workingAW, "backoff - Rejoining") qjm.updateStatusInEtcd(workingAW, "backoff - Rejoining") } else { workingAW = q @@ -1622,7 +1621,7 @@ func (qjm *XController) UpdateQueueJobs() { klog.V(3).Infof("[UpdateQueueJobs] %s 0Delay=%.6f seconds CreationTimestamp=%s ControllerFirstTimestamp=%s", newjob.Name, time.Now().Sub(newjob.Status.ControllerFirstTimestamp.Time).Seconds(), newjob.CreationTimestamp, newjob.Status.ControllerFirstTimestamp) } - //only set if appwrapper is running and dispatch time is not set previously + // only set if appwrapper is running and dispatch time is not set previously if newjob.Status.QueueJobState == "Running" && newjob.Status.ControllerFirstDispatchTimestamp.String() == "0001-01-01 00:00:00 +0000 UTC" { newjob.Status.ControllerFirstDispatchTimestamp = firstTime } @@ -1687,7 +1686,7 @@ func (cc *XController) updateQueueJob(oldObj, newObj interface{}) { if !ok { klog.Errorf("[Informer-updateQJ] old object is not AppWrapper. enqueue(newQJ). oldObj=%+v", oldObj) klog.V(4).Infof("[Informer-updateQJ] %s *Delay=%.6f seconds BadOldObject enqueue &newQJ=%p Version=%s Status=%+v", newQJ.Name, time.Now().Sub(newQJ.Status.ControllerFirstTimestamp.Time).Seconds(), newQJ, newQJ.ResourceVersion, newQJ.Status) - //cc.enqueue(newQJ) + // cc.enqueue(newQJ) return } // AppWrappers may come out of order. Ignore old ones. 
@@ -1864,10 +1863,10 @@ func (cc *XController) syncQueueJob(qj *arbv1.AppWrapper) error { klog.V(10).Infof("[syncQueueJob] AppWrapper %s not found in cache: info=%+v", qj.Name, err) // Implicit detection of deletion if apierrors.IsNotFound(err) { - //if (cc.isDispatcher) { + // if (cc.isDispatcher) { cc.Cleanup(qj) cc.qjqueue.Delete(qj) - //} + // } return nil } return err @@ -1884,7 +1883,7 @@ func (cc *XController) syncQueueJob(qj *arbv1.AppWrapper) error { // If it is Agent (not a dispatcher), update pod information podPhaseChanges := false if !cc.isDispatcher { - //Make a copy first to not update cache object and to use for comparing + // Make a copy first to not update cache object and to use for comparing awNew := qj.DeepCopy() // we call sync to update pods running, pending,... if qj.Status.State == arbv1.AppWrapperStateActive { @@ -1903,13 +1902,13 @@ func (cc *XController) syncQueueJob(qj *arbv1.AppWrapper) error { cc.updateEtcd(awNew, "[syncQueueJob] setRunning") } - //For debugging? + // For debugging? 
if !reflect.DeepEqual(awNew.Status, qj.Status) { podPhaseChanges = true // Using DeepCopy before DeepCopyInto as it seems that DeepCopyInto does not alloc a new memory object awNewStatus := awNew.Status.DeepCopy() awNewStatus.DeepCopyInto(&qj.Status) - //awNew.Status.DeepCopy().DeepCopyInto(&qj.Status) + // awNew.Status.DeepCopy().DeepCopyInto(&qj.Status) klog.V(10).Infof("[syncQueueJob] AW pod phase change(s) detected %s &eventqueueaw=%p eventqueueawVersion=%s eventqueueawStatus=%+v; &newaw=%p newawVersion=%s newawStatus=%+v", qj.Name, qj, qj.ResourceVersion, qj.Status, awNew, awNew.ResourceVersion, awNew.Status) } @@ -1917,7 +1916,7 @@ func (cc *XController) syncQueueJob(qj *arbv1.AppWrapper) error { } return cc.manageQueueJob(qj, podPhaseChanges) - //return cc.manageQueueJob(cacheAWJob) + // return cc.manageQueueJob(cacheAWJob) } // manageQueueJob is the core method responsible for managing the number of running @@ -1941,7 +1940,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool if err != nil { return err } - //empty finalizers and delete the queuejob again + // empty finalizers and delete the queuejob again accessor, err := meta.Accessor(qj) if err != nil { return err @@ -1953,7 +1952,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool return nil } - //Job is Complete only update pods if needed. + // Job is Complete only update pods if needed. if qj.Status.State == arbv1.AppWrapperStateCompleted || qj.Status.State == arbv1.AppWrapperStateRunningHoldCompletion { if podPhaseChanges { // Only update etcd if AW status has changed. 
This can happen for periodic @@ -2013,7 +2012,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool return nil } // End of first execution of qj to add to qjqueue for ScheduleNext - //Handle recovery condition + // Handle recovery condition if !qj.Status.CanRun && qj.Status.State == arbv1.AppWrapperStateEnqueued && !cc.qjqueue.IfExistUnschedulableQ(qj) && !cc.qjqueue.IfExistActiveQ(qj) { // One more check to ensure AW is not the current active schedule object @@ -2029,7 +2028,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool if qj.Status.CanRun && qj.Status.State != arbv1.AppWrapperStateActive && qj.Status.State != arbv1.AppWrapperStateCompleted && qj.Status.State != arbv1.AppWrapperStateRunningHoldCompletion { - //keep conditions until the appwrapper is re-dispatched + // keep conditions until the appwrapper is re-dispatched qj.Status.PendingPodConditions = nil qj.Status.State = arbv1.AppWrapperStateActive @@ -2104,11 +2103,11 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool } } else if qj.Status.CanRun && qj.Status.State == arbv1.AppWrapperStateActive { - //set appwrapper status to Complete or RunningHoldCompletion + // set appwrapper status to Complete or RunningHoldCompletion derivedAwStatus := cc.getAppWrapperCompletionStatus(qj) - //Set Appwrapper state to complete if all items in Appwrapper - //are completed + // Set Appwrapper state to complete if all items in Appwrapper + // are completed if derivedAwStatus == arbv1.AppWrapperStateRunningHoldCompletion { qj.Status.State = derivedAwStatus var updateQj *arbv1.AppWrapper @@ -2126,7 +2125,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool } cc.updateEtcd(updateQj, "[syncQueueJob] setRunningHoldCompletion") } - //Set appwrapper status to complete + // Set appwrapper status to complete if derivedAwStatus == arbv1.AppWrapperStateCompleted { qj.Status.State = derivedAwStatus 
qj.Status.CanRun = false @@ -2167,7 +2166,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool if err != nil { return err } - //empty finalizers and delete the queuejob again + // empty finalizers and delete the queuejob again accessor, err := meta.Accessor(qj) if err != nil { return err @@ -2206,10 +2205,10 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool } } - //_, err = cc.arbclients.ArbV1().AppWrappers(qj.Namespace).Update(qj) - //if err != nil { + // _, err = cc.arbclients.ArbV1().AppWrappers(qj.Namespace).Update(qj) + // if err != nil { // return err - //} + // } return nil } @@ -2282,7 +2281,7 @@ func (cc *XController) Cleanup(appwrapper *arbv1.AppWrapper) error { } else { // klog.Infof("[Dispatcher] Cleanup: State=%s\n", appwrapper.Status.State) - //if ! appwrapper.Status.CanRun && appwrapper.Status.IsDispatched { + // if ! appwrapper.Status.CanRun && appwrapper.Status.IsDispatched { if appwrapper.Status.IsDispatched { queuejobKey, _ := GetQueueJobKey(appwrapper) if obj, ok := cc.dispatchMap[queuejobKey]; ok { diff --git a/pkg/controller/queuejobdispatch/queuejobagent.go b/pkg/controller/queuejobdispatch/queuejobagent.go index 738db41b4..4ac80def2 100644 --- a/pkg/controller/queuejobdispatch/queuejobagent.go +++ b/pkg/controller/queuejobdispatch/queuejobagent.go @@ -241,10 +241,10 @@ func (qa *JobClusterAgent) UpdateAggrResources() error { res.Items[i].MetricName, res.Items[i].MetricLabels, res.Items[i].Value, qa.AgentId, qa.DeploymentName) clusterMetricType := res.Items[i].MetricLabels["cluster"] - if strings.Compare(clusterMetricType, "cpu") == 0 || strings.Compare(clusterMetricType, "memory") == 0 { + if strings.Compare(clusterMetricType, "cpu") == 0 || strings.Compare(clusterMetricType, "memory") == 0 { val, units, _ := getFloatString(res.Items[i].Value) num, err := strconv.ParseFloat(val, 64) - if err !=nil { + if err != nil { klog.Warningf("[Dispatcher: UpdateAggrResources] Possible 
issue converting %s string value of %s due to error: %v\n", clusterMetricType, res.Items[i].Value, err) } else { @@ -255,11 +255,11 @@ func (qa *JobClusterAgent) UpdateAggrResources() error { f_zero := math.Float64bits(0.0) if f_num > f_zero { if strings.Compare(clusterMetricType, "cpu") == 0 { - qa.AggrResources.MilliCPU = num + qa.AggrResources.MilliCPU = int64(num) klog.V(10).Infof("[Dispatcher: UpdateAggrResources] Updated %s from %f to %f for metrics: %v from deployment Agent ID: %s with Agent Name: %s\n", clusterMetricType, qa.AggrResources.MilliCPU, num, res, qa.AgentId, qa.DeploymentName) } else { - qa.AggrResources.Memory = num + qa.AggrResources.Memory = int64(num) klog.V(10).Infof("[Dispatcher: UpdateAggrResources] Updated %s from %f to %f for metrics: %v from deployment Agent ID: %s with Agent Name: %s\n", clusterMetricType, qa.AggrResources.Memory, num, res, qa.AgentId, qa.DeploymentName) } @@ -269,8 +269,8 @@ func (qa *JobClusterAgent) UpdateAggrResources() error { } // Float value resulted in zero value. 
} // Converting string to float success } else if strings.Compare(clusterMetricType, "gpu") == 0 { - num, err := getInt64String(res.Items[i].Value) - if err !=nil { + num, err := getInt64String(res.Items[i].Value) + if err != nil { klog.Warningf("[Dispatcher: UpdateAggrResources] Possible issue converting %s string value of %s due to int64 type, error: %v\n", clusterMetricType, res.Items[i].Value, err) } else { @@ -309,7 +309,7 @@ func getFloatString(num string) (string, string, error) { } else { validatedNum = num } - return validatedNum, numUnits, err + return validatedNum, numUnits, err } func getInt64String(num string) (int64, error) { var validatedNum int64 = 0 @@ -317,7 +317,7 @@ func getInt64String(num string) (int64, error) { if err == nil { validatedNum = n } - return validatedNum, err + return validatedNum, err } func buildResource(cpu string, memory string) *clusterstateapi.Resource { diff --git a/pkg/controller/queuejobresources/configmap/configmap.go b/pkg/controller/queuejobresources/configmap/configmap.go index b2dc1fb8d..aedd67ab8 100644 --- a/pkg/controller/queuejobresources/configmap/configmap.go +++ b/pkg/controller/queuejobresources/configmap/configmap.go @@ -30,6 +30,8 @@ package configmap import ( "context" "fmt" + "sync" + "time" arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" @@ -41,9 +43,6 @@ import ( "k8s.io/apimachinery/pkg/api/errors" "k8s.io/klog/v2" - "sync" - "time" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" @@ -68,7 +67,7 @@ const ( ControllerUIDLabel string = "controller-uid" ) -//QueueJobResService contains service info +// QueueJobResService contains service info type QueueJobResConfigMap struct { clients *kubernetes.Clientset arbclients *clientset.Clientset @@ -81,14 +80,14 @@ type 
QueueJobResConfigMap struct { refManager queuejobresources.RefManager } -//Register registers a queue job resource type +// Register registers a queue job resource type func Register(regs *queuejobresources.RegisteredResources) { regs.Register(arbv1.ResourceTypeConfigMap, func(config *rest.Config) queuejobresources.Interface { return NewQueueJobResConfigMap(config) }) } -//NewQueueJobResService creates a service controller +// NewQueueJobResService creates a service controller func NewQueueJobResConfigMap(config *rest.Config) queuejobresources.Interface { qjrConfigMap := &QueueJobResConfigMap{ clients: kubernetes.NewForConfigOrDie(config), @@ -148,7 +147,7 @@ func (qjrConfigMap *QueueJobResConfigMap) deleteConfigMap(obj interface{}) { return } -func (qjrConfigMap *QueueJobResConfigMap) GetAggregatedResourcesByPriority(priority float64, job *arbv1.AppWrapper) *clusterstateapi.Resource { +func (qjrConfigMap *QueueJobResConfigMap) GetAggregatedResourcesByPriority(priority int32, job *arbv1.AppWrapper) *clusterstateapi.Resource { total := clusterstateapi.EmptyResource() return total } @@ -217,7 +216,7 @@ func (qjrConfigMap *QueueJobResConfigMap) SyncQueueJob(queuejob *arbv1.AppWrappe klog.V(4).Infof("QJob: %s had %d configMaps and %d desired configMaps", queuejob.Name, configMapLen, replicas) if diff > 0 { - //TODO: need set reference after Service has been really added + // TODO: need set reference after Service has been really added tmpConfigMap := v1.ConfigMap{} err = qjrConfigMap.refManager.AddReference(qjobRes, &tmpConfigMap) if err != nil { @@ -318,7 +317,7 @@ func (qjrConfigMap *QueueJobResConfigMap) deleteQueueJobResConfigMaps(qjobRes *a return nil } -//Cleanup deletes all services +// Cleanup deletes all services func (qjrConfigMap *QueueJobResConfigMap) Cleanup(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { return qjrConfigMap.deleteQueueJobResConfigMaps(qjobRes, queuejob) } diff --git 
a/pkg/controller/queuejobresources/deployment/deployment.go b/pkg/controller/queuejobresources/deployment/deployment.go index e7c3af108..f8128756e 100644 --- a/pkg/controller/queuejobresources/deployment/deployment.go +++ b/pkg/controller/queuejobresources/deployment/deployment.go @@ -65,7 +65,7 @@ const ( ControllerUIDLabel string = "controller-uid" ) -//QueueJobResDeployment contains the resources of this queuejob +// QueueJobResDeployment contains the resources of this queuejob type QueueJobResDeployment struct { clients *kubernetes.Clientset arbclients *clientset.Clientset @@ -78,14 +78,14 @@ type QueueJobResDeployment struct { refManager queuejobresources.RefManager } -//Register registers a queue job resource type +// Register registers a queue job resource type func Register(regs *queuejobresources.RegisteredResources) { regs.Register(arbv1.ResourceTypeDeployment, func(config *rest.Config) queuejobresources.Interface { return NewQueueJobResDeployment(config) }) } -//NewQueueJobResDeployment returns a new deployment controller +// NewQueueJobResDeployment returns a new deployment controller func NewQueueJobResDeployment(config *rest.Config) queuejobresources.Interface { qjrDeployment := &QueueJobResDeployment{ clients: kubernetes.NewForConfigOrDie(config), @@ -126,7 +126,7 @@ func (qjrDeployment *QueueJobResDeployment) GetPodTemplate(qjobRes *arbv1.AppWra return nil, -1, err } - // Validate template field + // Validate template field if res.Spec.Replicas == nil { return nil, 0, fmt.Errorf("spec.replicas field not defined in resource object: %#v", qjobRes) } @@ -136,7 +136,7 @@ func (qjrDeployment *QueueJobResDeployment) GetPodTemplate(qjobRes *arbv1.AppWra func (qjrDeployment *QueueJobResDeployment) GetAggregatedResources(job *arbv1.AppWrapper) *clusterstateapi.Resource { total := clusterstateapi.EmptyResource() if job.Spec.AggrResources.Items != nil { - //calculate scaling + // calculate scaling for _, ar := range job.Spec.AggrResources.Items { if ar.Type == 
arbv1.ResourceTypeDeployment { template, replicas, err := qjrDeployment.GetPodTemplate(&ar) @@ -144,8 +144,8 @@ func (qjrDeployment *QueueJobResDeployment) GetAggregatedResources(job *arbv1.Ap klog.Errorf("Pod Template not found in item: %#v error: %#v. Aggregated resources set to 0.", ar, err) } else { myres := queuejobresources.GetPodResources(template) - myres.MilliCPU = float64(replicas) * myres.MilliCPU - myres.Memory = float64(replicas) * myres.Memory + myres.MilliCPU = int64(replicas) * myres.MilliCPU + myres.Memory = int64(replicas) * myres.Memory myres.GPU = int64(replicas) * myres.GPU total = total.Add(myres) } @@ -155,10 +155,10 @@ func (qjrDeployment *QueueJobResDeployment) GetAggregatedResources(job *arbv1.Ap return total } -func (qjrDeployment *QueueJobResDeployment) GetAggregatedResourcesByPriority(priority float64, job *arbv1.AppWrapper) *clusterstateapi.Resource { +func (qjrDeployment *QueueJobResDeployment) GetAggregatedResourcesByPriority(priority int32, job *arbv1.AppWrapper) *clusterstateapi.Resource { total := clusterstateapi.EmptyResource() if job.Spec.AggrResources.Items != nil { - //calculate scaling + // calculate scaling for _, ar := range job.Spec.AggrResources.Items { if ar.Priority < priority { continue @@ -166,8 +166,8 @@ func (qjrDeployment *QueueJobResDeployment) GetAggregatedResourcesByPriority(pri if ar.Type == arbv1.ResourceTypeDeployment { template, replicas, _ := qjrDeployment.GetPodTemplate(&ar) myres := queuejobresources.GetPodResources(template) - myres.MilliCPU = float64(replicas) * myres.MilliCPU - myres.Memory = float64(replicas) * myres.Memory + myres.MilliCPU = int64(replicas) * myres.MilliCPU + myres.Memory = int64(replicas) * myres.Memory myres.GPU = int64(replicas) * myres.GPU total = total.Add(myres) } @@ -176,7 +176,7 @@ func (qjrDeployment *QueueJobResDeployment) GetAggregatedResourcesByPriority(pri return total } -//func (qjrDeployment *QueueJobResDeployment) GetAggregatedResourcesByPhase(phase v1.PodPhase, job 
*arbv1.AppWrapper) *clusterstateapi.Resource { +// func (qjrDeployment *QueueJobResDeployment) GetAggregatedResourcesByPhase(phase v1.PodPhase, job *arbv1.AppWrapper) *clusterstateapi.Resource { // total := clusterstateapi.EmptyResource() // if job.Spec.AggrResources.Items != nil { // //calculate scaling @@ -192,9 +192,9 @@ func (qjrDeployment *QueueJobResDeployment) GetAggregatedResourcesByPriority(pri // } // } // return total -//} +// } -//Run the main goroutine responsible for watching and deployments. +// Run the main goroutine responsible for watching and deployments. func (qjrDeployment *QueueJobResDeployment) Run(stopCh <-chan struct{}) { qjrDeployment.deployInformer.Informer().Run(stopCh) } @@ -275,7 +275,7 @@ func (qjrDeployment *QueueJobResDeployment) SyncQueueJob(queuejob *arbv1.AppWrap klog.V(4).Infof("QJob: %s had %d Deployments and %d desired Deployments", queuejob.Name, deploymentLen, replicas) if diff > 0 { - //TODO: need set reference after Service has been really added + // TODO: need set reference after Service has been really added tmpDeployment := apps.Deployment{} err = qjrDeployment.refManager.AddReference(qjobRes, &tmpDeployment) if err != nil { @@ -379,7 +379,7 @@ func (qjrDeployment *QueueJobResDeployment) deleteQueueJobResDeployments(qjobRes return nil } -//Cleanup deletes all services +// Cleanup deletes all services func (qjrDeployment *QueueJobResDeployment) Cleanup(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { return qjrDeployment.deleteQueueJobResDeployments(qjobRes, queuejob) } diff --git a/pkg/controller/queuejobresources/genericresource/genericresource.go b/pkg/controller/queuejobresources/genericresource/genericresource.go index 4b2879305..e29fbacbe 100644 --- a/pkg/controller/queuejobresources/genericresource/genericresource.go +++ b/pkg/controller/queuejobresources/genericresource/genericresource.go @@ -5,7 +5,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this 
file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -19,7 +19,6 @@ import ( "context" "encoding/json" "fmt" - "math" "reflect" "runtime/debug" "strings" @@ -85,7 +84,7 @@ func (gr *GenericResources) Cleanup(aw *arbv1.AppWrapper, awr *arbv1.AppWrapperG name := "" namespaced := true - //todo:DELETEME dd := common.KubeClient.Discovery() + // todo:DELETEME dd := common.KubeClient.Discovery() dd := gr.clients.Discovery() apigroups, err := restmapper.GetAPIGroupResources(dd) if err != nil { @@ -106,7 +105,7 @@ func (gr *GenericResources) Cleanup(aw *arbv1.AppWrapper, awr *arbv1.AppWrapperG return name, gvk, err } - //todo:DELETEME restconfig := common.KubeConfig + // todo:DELETEME restconfig := common.KubeConfig restconfig := gr.kubeClientConfig restconfig.GroupVersion = &schema.GroupVersion{ Group: mapping.GroupVersionKind.Group, @@ -144,7 +143,7 @@ func (gr *GenericResources) Cleanup(aw *arbv1.AppWrapper, awr *arbv1.AppWrapperG return name, gvk, err } - unstruct.Object = blob.(map[string]interface{}) //set object to the content of the blob after Unmarshalling + unstruct.Object = blob.(map[string]interface{}) // set object to the content of the blob after Unmarshalling namespace := "" if md, ok := unstruct.Object["metadata"]; ok { @@ -197,7 +196,7 @@ func (gr *GenericResources) SyncQueueJob(aw *arbv1.AppWrapper, awr *arbv1.AppWra }() namespaced := true - //todo:DELETEME dd := common.KubeClient.Discovery() + // todo:DELETEME dd := common.KubeClient.Discovery() dd := gr.clients.Discovery() apigroups, err := restmapper.GetAPIGroupResources(dd) if err != nil { @@ -206,8 +205,8 @@ func (gr *GenericResources) SyncQueueJob(aw *arbv1.AppWrapper, awr *arbv1.AppWra } ext := awr.GenericTemplate restmapper := 
restmapper.NewDiscoveryRESTMapper(apigroups) - //versions := &unstructured.Unstructured{} - //_, gvk, err := unstructured.UnstructuredJSONScheme.Decode(ext.Raw, nil, versions) + // versions := &unstructured.Unstructured{} + // _, gvk, err := unstructured.UnstructuredJSONScheme.Decode(ext.Raw, nil, versions) _, gvk, err := unstructured.UnstructuredJSONScheme.Decode(ext.Raw, nil, nil) if err != nil { klog.Errorf("Decoding error, please check your CR! Aborting handling the resource creation, err: `%v`", err) @@ -219,7 +218,7 @@ func (gr *GenericResources) SyncQueueJob(aw *arbv1.AppWrapper, awr *arbv1.AppWra return []*v1.Pod{}, err } - //todo:DELETEME restconfig := common.KubeConfig + // todo:DELETEME restconfig := common.KubeConfig restconfig := gr.kubeClientConfig restconfig.GroupVersion = &schema.GroupVersion{ Group: mapping.GroupVersionKind.Group, @@ -256,7 +255,7 @@ func (gr *GenericResources) SyncQueueJob(aw *arbv1.AppWrapper, awr *arbv1.AppWra return []*v1.Pod{}, err } ownerRef := metav1.NewControllerRef(aw, appWrapperKind) - unstruct.Object = blob.(map[string]interface{}) //set object to the content of the blob after Unmarshalling + unstruct.Object = blob.(map[string]interface{}) // set object to the content of the blob after Unmarshalling unstruct.SetOwnerReferences(append(unstruct.GetOwnerReferences(), *ownerRef)) namespace := "default" name := "" @@ -337,7 +336,7 @@ func (gr *GenericResources) SyncQueueJob(aw *arbv1.AppWrapper, awr *arbv1.AppWra return pods, nil } -//checks if object has pod template spec and add new labels +// checks if object has pod template spec and add new labels func addLabelsToPodTemplateField(unstruct *unstructured.Unstructured, labels map[string]string) (hasFields bool) { spec, isFound, _ := unstructured.NestedMap(unstruct.UnstructuredContent(), "spec") if !isFound { @@ -379,7 +378,7 @@ func addLabelsToPodTemplateField(unstruct *unstructured.Unstructured, labels map return isFound } -//checks if object has replicas and containers 
field +// checks if object has replicas and containers field func hasFields(obj runtime.RawExtension) (hasFields bool, replica float64, containers []v1.Container) { var unstruct unstructured.Unstructured unstruct.Object = make(map[string]interface{}) @@ -552,22 +551,21 @@ func getPodResources(pod arbv1.CustomPodResourceTemplate) (resource *clusterstat replicas := pod.Replicas req := clusterstateapi.NewResource(pod.Requests) limit := clusterstateapi.NewResource(pod.Limits) - tolerance := 0.001 // Use limit if request is 0 - if diff := math.Abs(req.MilliCPU - float64(0.0)); diff < tolerance { + if req.MilliCPU == 0 { req.MilliCPU = limit.MilliCPU } - if diff := math.Abs(req.Memory - float64(0.0)); diff < tolerance { + if req.Memory == 0 { req.Memory = limit.Memory } if req.GPU <= 0 { req.GPU = limit.GPU } - req.MilliCPU = req.MilliCPU * float64(replicas) - req.Memory = req.Memory * float64(replicas) + req.MilliCPU = req.MilliCPU * int64(replicas) + req.Memory = req.Memory * int64(replicas) req.GPU = req.GPU * int64(replicas) return req } @@ -576,14 +574,12 @@ func getContainerResources(container v1.Container, replicas float64) *clustersta req := clusterstateapi.NewResource(container.Resources.Requests) limit := clusterstateapi.NewResource(container.Resources.Limits) - tolerance := 0.001 - // Use limit if request is 0 - if diff := math.Abs(req.MilliCPU - float64(0.0)); diff < tolerance { + if req.MilliCPU == 0 { req.MilliCPU = limit.MilliCPU } - if diff := math.Abs(req.Memory - float64(0.0)); diff < tolerance { + if req.Memory == 0 { req.Memory = limit.Memory } @@ -591,13 +587,13 @@ func getContainerResources(container v1.Container, replicas float64) *clustersta req.GPU = limit.GPU } - req.MilliCPU = req.MilliCPU * float64(replicas) - req.Memory = req.Memory * float64(replicas) + req.MilliCPU = req.MilliCPU * int64(replicas) + req.Memory = req.Memory * int64(replicas) req.GPU = req.GPU * int64(replicas) return req } -//returns status of an item present in etcd +// 
returns status of an item present in etcd func (gr *GenericResources) IsItemCompleted(awgr *arbv1.AppWrapperGenericResource, namespace string, appwrapperName string, genericItemName string) (completed bool) { dd := gr.clients.Discovery() apigroups, err := restmapper.GetAPIGroupResources(dd) @@ -637,7 +633,7 @@ func (gr *GenericResources) IsItemCompleted(awgr *arbv1.AppWrapperGenericResourc } for _, job := range inEtcd.Items { - //job.UnstructuredContent() has status information + // job.UnstructuredContent() has status information unstructuredObjectName := job.GetName() if unstructuredObjectName != genericItemName { continue } @@ -654,8 +650,8 @@ func (gr *GenericResources) IsItemCompleted(awgr *arbv1.AppWrapperGenericResourc continue } - //check with a false status field - //check also conditions object + // check with a false status field + // check also conditions object jobMap := job.UnstructuredContent() if jobMap == nil { continue } @@ -665,14 +661,14 @@ func (gr *GenericResources) IsItemCompleted(awgr *arbv1.AppWrapperGenericResourc status := job.Object["status"].(map[string]interface{}) if status["conditions"] != nil { conditions, ok := job.Object["status"].(map[string]interface{})["conditions"].([]interface{}) - //if condition not found skip for this interation + // if condition not found skip for this iteration if !ok { klog.Errorf("[IsItemCompleted] Error processing of unstructured object %v in namespace %v with labels %v, err: %v", job.GetName(), job.GetNamespace(), job.GetLabels(), err) continue } for _, item := range conditions { completionType := fmt.Sprint(item.(map[string]interface{})["type"]) - //Move this to utils package? + // Move this to utils package?
userSpecfiedCompletionConditions := strings.Split(awgr.CompletionStatus, ",") for _, condition := range userSpecfiedCompletionConditions { if strings.Contains(strings.ToLower(completionType), strings.ToLower(condition)) { diff --git a/pkg/controller/queuejobresources/interfaces.go b/pkg/controller/queuejobresources/interfaces.go index d442a2df4..816335561 100644 --- a/pkg/controller/queuejobresources/interfaces.go +++ b/pkg/controller/queuejobresources/interfaces.go @@ -1,19 +1,4 @@ /* -Copyright 2014 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ + package queuejobresources import ( @@ -40,9 +26,9 @@ type Interface interface { SyncQueueJob(queuejob *qjobv1.AppWrapper, qjobRes *qjobv1.AppWrapperResource) error UpdateQueueJobStatus(queuejob *qjobv1.AppWrapper) error GetAggregatedResources(queuejob *qjobv1.AppWrapper) *clusterstateapi.Resource - GetAggregatedResourcesByPriority(priority float64, queuejob *qjobv1.AppWrapper) *clusterstateapi.Resource - //TODO: Add to calculate more accurate partial deployments while job is being realized -// GetAggregatedResourcesByPhase(phase v1.PodPhase, queuejob *qjobv1.AppWrapper) *clusterstateapi.Resource + GetAggregatedResourcesByPriority(priority int32, queuejob *qjobv1.AppWrapper) *clusterstateapi.Resource + // TODO: Add to calculate more accurate partial deployments while job is being realized + // GetAggregatedResourcesByPhase(phase v1.PodPhase, queuejob *qjobv1.AppWrapper) *clusterstateapi.Resource Cleanup(queuejob *qjobv1.AppWrapper, qjobRes *qjobv1.AppWrapperResource) error Run(stopCh <-chan struct{}) } diff --git a/pkg/controller/queuejobresources/namespace/namespace.go b/pkg/controller/queuejobresources/namespace/namespace.go index b58f74e9b..3e12cffd1 100644 --- a/pkg/controller/queuejobresources/namespace/namespace.go +++ b/pkg/controller/queuejobresources/namespace/namespace.go @@ -30,14 +30,13 @@ package namespace import ( "context" "fmt" + "sync" + "time" arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources" - "sync" - "time" - clusterstateapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" @@ -67,7 +66,7 @@ const ( ControllerUIDLabel string = "controller-uid" ) -//QueueJobResService contains 
service info +// QueueJobResService contains service info type QueueJobResNamespace struct { clients *kubernetes.Clientset arbclients *clientset.Clientset @@ -80,14 +79,14 @@ type QueueJobResNamespace struct { refManager queuejobresources.RefManager } -//Register registers a queue job resource type +// Register registers a queue job resource type func Register(regs *queuejobresources.RegisteredResources) { regs.Register(arbv1.ResourceTypeNamespace, func(config *rest.Config) queuejobresources.Interface { return NewQueueJobResNamespace(config) }) } -//NewQueueJobResService creates a service controller +// NewQueueJobResService creates a service controller func NewQueueJobResNamespace(config *rest.Config) queuejobresources.Interface { qjrNamespace := &QueueJobResNamespace{ clients: kubernetes.NewForConfigOrDie(config), @@ -147,7 +146,7 @@ func (qjrNamespace *QueueJobResNamespace) deleteNamespace(obj interface{}) { return } -func (qjrNamespace *QueueJobResNamespace) GetAggregatedResourcesByPriority(priority float64, job *arbv1.AppWrapper) *clusterstateapi.Resource { +func (qjrNamespace *QueueJobResNamespace) GetAggregatedResourcesByPriority(priority int32, job *arbv1.AppWrapper) *clusterstateapi.Resource { total := clusterstateapi.EmptyResource() return total } @@ -199,7 +198,7 @@ func (qjrNamespace *QueueJobResNamespace) UpdateQueueJobStatus(queuejob *arbv1.A return nil } -//SyncQueueJob syncs the services +// SyncQueueJob syncs the services func (qjrNamespace *QueueJobResNamespace) SyncQueueJob(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { startTime := time.Now() @@ -225,7 +224,7 @@ func (qjrNamespace *QueueJobResNamespace) SyncQueueJob(queuejob *arbv1.AppWrappe klog.Errorf("Cannot read template from resource %+v %+v", qjobRes, err) return err } - //TODO: need set reference after Service has been really added + // TODO: need set reference after Service has been really added tmpNamespace := v1.Namespace{} err = 
qjrNamespace.refManager.AddReference(qjobRes, &tmpNamespace) if err != nil { @@ -321,7 +320,7 @@ func (qjrNamespace *QueueJobResNamespace) deleteQueueJobResNamespaces(qjobRes *a return nil } -//Cleanup deletes all services +// Cleanup deletes all services func (qjrNamespace *QueueJobResNamespace) Cleanup(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { return qjrNamespace.deleteQueueJobResNamespaces(qjobRes, queuejob) } diff --git a/pkg/controller/queuejobresources/networkpolicy/networkpolicy.go b/pkg/controller/queuejobresources/networkpolicy/networkpolicy.go index 56240abbc..d46b0fd23 100644 --- a/pkg/controller/queuejobresources/networkpolicy/networkpolicy.go +++ b/pkg/controller/queuejobresources/networkpolicy/networkpolicy.go @@ -30,6 +30,8 @@ package networkpolicy import ( "context" "fmt" + "sync" + "time" arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" @@ -46,9 +48,6 @@ import ( "k8s.io/client-go/kubernetes" "k8s.io/klog/v2" - "sync" - "time" - "k8s.io/client-go/rest" "k8s.io/client-go/tools/cache" @@ -68,7 +67,7 @@ const ( ControllerUIDLabel string = "controller-uid" ) -//QueueJobResService contains service info +// QueueJobResService contains service info type QueueJobResNetworkPolicy struct { clients *kubernetes.Clientset arbclients *clientset.Clientset @@ -81,14 +80,14 @@ type QueueJobResNetworkPolicy struct { refManager queuejobresources.RefManager } -//Register registers a queue job resource type +// Register registers a queue job resource type func Register(regs *queuejobresources.RegisteredResources) { regs.Register(arbv1.ResourceTypeNetworkPolicy, func(config *rest.Config) queuejobresources.Interface { return NewQueueJobResNetworkPolicy(config) }) } -//NewQueueJobResService creates a service controller +// NewQueueJobResService creates a service controller func 
NewQueueJobResNetworkPolicy(config *rest.Config) queuejobresources.Interface { qjrNetworkPolicy := &QueueJobResNetworkPolicy{ clients: kubernetes.NewForConfigOrDie(config), @@ -149,7 +148,7 @@ func (qjrNetworkPolicy *QueueJobResNetworkPolicy) deleteNetworkPolicy(obj interf return } -func (qjrNetworkPolicy *QueueJobResNetworkPolicy) GetAggregatedResourcesByPriority(priority float64, job *arbv1.AppWrapper) *clusterstateapi.Resource { +func (qjrNetworkPolicy *QueueJobResNetworkPolicy) GetAggregatedResourcesByPriority(priority int32, job *arbv1.AppWrapper) *clusterstateapi.Resource { total := clusterstateapi.EmptyResource() return total } @@ -220,7 +219,7 @@ func (qjrNetworkPolicy *QueueJobResNetworkPolicy) SyncQueueJob(queuejob *arbv1.A klog.V(4).Infof("QJob: %s had %d NetworkPolicies and %d desired NetworkPolicies", queuejob.Name, networkPolicyLen, replicas) if diff > 0 { - //TODO: need set reference after Service has been really added + // TODO: need set reference after Service has been really added tmpNetworkPolicy := networkingv1.NetworkPolicy{} err = qjrNetworkPolicy.refManager.AddReference(qjobRes, &tmpNetworkPolicy) if err != nil { @@ -319,7 +318,7 @@ func (qjrNetworkPolicy *QueueJobResNetworkPolicy) deleteQueueJobResNetworkPolici return nil } -//Cleanup deletes all services +// Cleanup deletes all services func (qjrNetworkPolicy *QueueJobResNetworkPolicy) Cleanup(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { return qjrNetworkPolicy.deleteQueueJobResNetworkPolicies(qjobRes, queuejob) } diff --git a/pkg/controller/queuejobresources/persistentvolume/persistentvolume.go b/pkg/controller/queuejobresources/persistentvolume/persistentvolume.go index 02248b376..b3be2664f 100644 --- a/pkg/controller/queuejobresources/persistentvolume/persistentvolume.go +++ b/pkg/controller/queuejobresources/persistentvolume/persistentvolume.go @@ -30,6 +30,8 @@ package persistentvolume import ( "context" "fmt" + "sync" + "time" arbv1 
"github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" @@ -40,9 +42,6 @@ import ( "k8s.io/apimachinery/pkg/api/errors" "k8s.io/klog/v2" - "sync" - "time" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" @@ -67,7 +66,7 @@ const ( ControllerUIDLabel string = "controller-uid" ) -//QueueJobResService contains service info +// QueueJobResService contains service info type QueueJobResPersistentvolume struct { clients *kubernetes.Clientset arbclients *clientset.Clientset @@ -80,14 +79,14 @@ type QueueJobResPersistentvolume struct { refManager queuejobresources.RefManager } -//Register registers a queue job resource type +// Register registers a queue job resource type func Register(regs *queuejobresources.RegisteredResources) { regs.Register(arbv1.ResourceTypePersistentVolume, func(config *rest.Config) queuejobresources.Interface { return NewQueueJobResPersistentvolume(config) }) } -//NewQueueJobResService creates a service controller +// NewQueueJobResService creates a service controller func NewQueueJobResPersistentvolume(config *rest.Config) queuejobresources.Interface { qjrPersistentvolume := &QueueJobResPersistentvolume{ clients: kubernetes.NewForConfigOrDie(config), @@ -147,7 +146,7 @@ func (qjrPersistentvolume *QueueJobResPersistentvolume) deletePersistentVolume(o return } -func (qjrPersistentvolume *QueueJobResPersistentvolume) GetAggregatedResourcesByPriority(priority float64, job *arbv1.AppWrapper) *clusterstateapi.Resource { +func (qjrPersistentvolume *QueueJobResPersistentvolume) GetAggregatedResourcesByPriority(priority int32, job *arbv1.AppWrapper) *clusterstateapi.Resource { total := clusterstateapi.EmptyResource() return total } @@ -198,7 +197,7 @@ func (qjrPersistentvolume *QueueJobResPersistentvolume) UpdateQueueJobStatus(que 
return nil } -//SyncQueueJob syncs the services +// SyncQueueJob syncs the services func (qjrPersistentvolume *QueueJobResPersistentvolume) SyncQueueJob(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { startTime := time.Now() @@ -224,7 +223,7 @@ func (qjrPersistentvolume *QueueJobResPersistentvolume) SyncQueueJob(queuejob *a klog.Errorf("Cannot read template from resource %+v %+v", qjobRes, err) return err } - //TODO: need set reference after Service has been really added + // TODO: need set reference after Service has been really added tmpPersistentVolume := v1.PersistentVolume{} err = qjrPersistentvolume.refManager.AddReference(qjobRes, &tmpPersistentVolume) if err != nil { @@ -333,7 +332,7 @@ func (qjrPersistentvolume *QueueJobResPersistentvolume) deleteQueueJobResPersist return nil } -//Cleanup deletes all services +// Cleanup deletes all services func (qjrPersistentvolume *QueueJobResPersistentvolume) Cleanup(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { return qjrPersistentvolume.deleteQueueJobResPersistentVolumes(qjobRes, queuejob) } diff --git a/pkg/controller/queuejobresources/persistentvolumeclaim/persistentvolumeclaim.go b/pkg/controller/queuejobresources/persistentvolumeclaim/persistentvolumeclaim.go index 710e24e8f..8ca74e29b 100644 --- a/pkg/controller/queuejobresources/persistentvolumeclaim/persistentvolumeclaim.go +++ b/pkg/controller/queuejobresources/persistentvolumeclaim/persistentvolumeclaim.go @@ -30,14 +30,13 @@ package persistentvolumeclaim import ( "context" "fmt" + "sync" + "time" arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources" - "sync" - "time" - clusterstateapi 
"github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" @@ -66,7 +65,7 @@ const ( ControllerUIDLabel string = "controller-uid" ) -//QueueJobResService contains service info +// QueueJobResService contains service info type QueueJobResPersistentVolumeClaim struct { clients *kubernetes.Clientset arbclients *clientset.Clientset @@ -79,14 +78,14 @@ type QueueJobResPersistentVolumeClaim struct { refManager queuejobresources.RefManager } -//Register registers a queue job resource type +// Register registers a queue job resource type func Register(regs *queuejobresources.RegisteredResources) { regs.Register(arbv1.ResourceTypePersistentVolumeClaim, func(config *rest.Config) queuejobresources.Interface { return NewQueueJobResPersistentVolumeClaim(config) }) } -//NewQueueJobResService creates a service controller +// NewQueueJobResService creates a service controller func NewQueueJobResPersistentVolumeClaim(config *rest.Config) queuejobresources.Interface { qjrPersistentVolumeClaim := &QueueJobResPersistentVolumeClaim{ clients: kubernetes.NewForConfigOrDie(config), @@ -146,7 +145,7 @@ func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) deletePersiste return } -func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) GetAggregatedResourcesByPriority(priority float64, job *arbv1.AppWrapper) *clusterstateapi.Resource { +func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) GetAggregatedResourcesByPriority(priority int32, job *arbv1.AppWrapper) *clusterstateapi.Resource { total := clusterstateapi.EmptyResource() return total } @@ -218,7 +217,7 @@ func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) SyncQueueJob(q klog.V(4).Infof("QJob: %s had %d PersistVolumeClaims and %d desired PersistVolumeClaims", queuejob.Name, persistentVolumeClaimLen, replicas) if diff > 0 { - //TODO: need set reference after Service has been really added + 
// TODO: need set reference after Service has been really added tmpPersistentVolumeClaim := v1.PersistentVolumeClaim{} err = qjrPersistentVolumeClaim.refManager.AddReference(qjobRes, &tmpPersistentVolumeClaim) if err != nil { @@ -318,7 +317,7 @@ func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) deleteQueueJob return nil } -//Cleanup deletes all services +// Cleanup deletes all services func (qjrPersistentVolumeClaim *QueueJobResPersistentVolumeClaim) Cleanup(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { return qjrPersistentVolumeClaim.deleteQueueJobResPersistentVolumeClaims(qjobRes, queuejob) } diff --git a/pkg/controller/queuejobresources/pod/pod.go b/pkg/controller/queuejobresources/pod/pod.go index a17d1b4a2..d66acdb9f 100644 --- a/pkg/controller/queuejobresources/pod/pod.go +++ b/pkg/controller/queuejobresources/pod/pod.go @@ -33,15 +33,14 @@ package pod import ( "context" "fmt" + "sync" + "time" arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/maputils" "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources" - "sync" - "time" - clusterstateapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -70,7 +69,7 @@ const ( ControllerUIDLabel string = "controller-uid" ) -//QueueJobResPod Controller for QueueJob pods +// QueueJobResPod Controller for QueueJob pods type QueueJobResPod struct { clients *kubernetes.Clientset arbclients *clientset.Clientset @@ -105,7 +104,7 @@ func Register(regs *queuejobresources.RegisteredResources) { }) } -//NewQueueJobResPod Creates a new controller for QueueJob pods +// NewQueueJobResPod Creates 
a new controller for QueueJob pods func NewQueueJobResPod(config *rest.Config) queuejobresources.Interface { // create k8s clientset @@ -205,13 +204,13 @@ func isPodActive(p *v1.Pod) bool { p.DeletionTimestamp == nil } -//SyncQueueJob : method to sync the resources of this job +// SyncQueueJob : method to sync the resources of this job func (qjrPod *QueueJobResPod) SyncQueueJob(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { // check if there are still terminating pods for this QueueJob - //counter, ok := qjrPod.deletedPodsCounter.Get(fmt.Sprintf("%s/%s", queuejob.Namespace, queuejob.Name)) - //if ok && counter >= 0 { + // counter, ok := qjrPod.deletedPodsCounter.Get(fmt.Sprintf("%s/%s", queuejob.Namespace, queuejob.Name)) + // if ok && counter >= 0 { // return fmt.Errorf("There are still terminating pods for QueueJob %s/%s, can not sync it now", queuejob.Namespace, queuejob.Name) - //} + // } pods, err := qjrPod.getPodsForQueueJob(queuejob) if err != nil { @@ -255,7 +254,7 @@ func (qjrPod *QueueJobResPod) UpdateQueueJobStatus(queuejob *arbv1.AppWrapper) e queuejob.Status.Running = running queuejob.Status.Succeeded = succeeded queuejob.Status.Failed = failed - //Total resources by all running pods + // Total resources by all running pods queuejob.Status.TotalGPU = totalResourcesConsumed.GPU queuejob.Status.TotalCPU = totalResourcesConsumed.MilliCPU queuejob.Status.TotalMemory = totalResourcesConsumed.Memory @@ -541,7 +540,7 @@ func createQueueJobSchedulingSpec(qj *arbv1.AppWrapper) *arbv1.SchedulingSpec { } } -//GetPodTemplate Parse queue job api object to get Pod template +// GetPodTemplate Parse queue job api object to get Pod template func (qjrPod *QueueJobResPod) GetPodTemplate(qjobRes *arbv1.AppWrapperResource) (*v1.PodTemplateSpec, error) { podGVK := schema.GroupVersion{Group: v1.GroupName, Version: "v1"}.WithKind("PodTemplate") @@ -563,7 +562,7 @@ func (qjrPod *QueueJobResPod) GetPodTemplate(qjobRes *arbv1.AppWrapperResource) func 
(qjrPod *QueueJobResPod) GetAggregatedResources(job *arbv1.AppWrapper) *clusterstateapi.Resource { total := clusterstateapi.EmptyResource() if job.Spec.AggrResources.Items != nil { - //calculate scaling + // calculate scaling for _, ar := range job.Spec.AggrResources.Items { if ar.Type == arbv1.ResourceTypePod { template, err := qjrPod.GetPodTemplate(&ar) @@ -573,8 +572,8 @@ func (qjrPod *QueueJobResPod) GetAggregatedResources(job *arbv1.AppWrapper) *clu replicas := ar.Replicas myres := queuejobresources.GetPodResources(template) - myres.MilliCPU = float64(replicas) * myres.MilliCPU - myres.Memory = float64(replicas) * myres.Memory + myres.MilliCPU = int64(replicas) * myres.MilliCPU + myres.Memory = int64(replicas) * myres.Memory myres.GPU = int64(replicas) * myres.GPU total = total.Add(myres) } @@ -584,10 +583,10 @@ func (qjrPod *QueueJobResPod) GetAggregatedResources(job *arbv1.AppWrapper) *clu return total } -func (qjrPod *QueueJobResPod) GetAggregatedResourcesByPriority(priority float64, job *arbv1.AppWrapper) *clusterstateapi.Resource { +func (qjrPod *QueueJobResPod) GetAggregatedResourcesByPriority(priority int32, job *arbv1.AppWrapper) *clusterstateapi.Resource { total := clusterstateapi.EmptyResource() if job.Spec.AggrResources.Items != nil { - //calculate scaling + // calculate scaling for _, ar := range job.Spec.AggrResources.Items { if ar.Priority < priority { continue @@ -623,7 +622,7 @@ func (qjrPod *QueueJobResPod) createQueueJobPod(qj *arbv1.AppWrapper, ix int32, if tmpl == nil { tmpl = make(map[string]string) } - + tmpl[queueJobName] = qj.Name // Include pre-defined metadata info, e.g. 
annotations @@ -634,12 +633,12 @@ func (qjrPod *QueueJobResPod) createQueueJobPod(qj *arbv1.AppWrapper, ix int32, templateObjMetadata.SetNamespace(qj.Namespace) templateObjMetadata.SetOwnerReferences([]metav1.OwnerReference{ *metav1.NewControllerRef(qj, queueJobKind), - },) + }) templateObjMetadata.SetLabels(tmpl) return &v1.Pod{ ObjectMeta: templateObjMetadata, - Spec: templateCopy.Spec, + Spec: templateCopy.Spec, } } diff --git a/pkg/controller/queuejobresources/secret/secret.go b/pkg/controller/queuejobresources/secret/secret.go index 938380b00..bce978f71 100644 --- a/pkg/controller/queuejobresources/secret/secret.go +++ b/pkg/controller/queuejobresources/secret/secret.go @@ -30,6 +30,8 @@ package secret import ( "context" "fmt" + "sync" + "time" arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" @@ -40,9 +42,6 @@ import ( "k8s.io/apimachinery/pkg/api/errors" "k8s.io/klog/v2" - "sync" - "time" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" @@ -67,7 +66,7 @@ const ( ControllerUIDLabel string = "controller-uid" ) -//QueueJobResService contains service info +// QueueJobResService contains service info type QueueJobResSecret struct { clients *kubernetes.Clientset arbclients *clientset.Clientset @@ -80,14 +79,14 @@ type QueueJobResSecret struct { refManager queuejobresources.RefManager } -//Register registers a queue job resource type +// Register registers a queue job resource type func Register(regs *queuejobresources.RegisteredResources) { regs.Register(arbv1.ResourceTypeSecret, func(config *rest.Config) queuejobresources.Interface { return NewQueueJobResSecret(config) }) } -//NewQueueJobResService creates a service controller +// NewQueueJobResService creates a service controller func NewQueueJobResSecret(config *rest.Config) 
queuejobresources.Interface { qjrSecret := &QueueJobResSecret{ clients: kubernetes.NewForConfigOrDie(config), @@ -147,7 +146,7 @@ func (qjrSecret *QueueJobResSecret) deleteSecret(obj interface{}) { return } -func (qjrSecret *QueueJobResSecret) GetAggregatedResourcesByPriority(priority float64, job *arbv1.AppWrapper) *clusterstateapi.Resource { +func (qjrSecret *QueueJobResSecret) GetAggregatedResourcesByPriority(priority int32, job *arbv1.AppWrapper) *clusterstateapi.Resource { total := clusterstateapi.EmptyResource() return total } @@ -219,7 +218,7 @@ func (qjrSecret *QueueJobResSecret) SyncQueueJob(queuejob *arbv1.AppWrapper, qjo klog.V(4).Infof("QJob: %s had %d Secrets and %d desired Secrets", queuejob.Name, secretLen, replicas) if diff > 0 { - //TODO: need set reference after Service has been really added + // TODO: need set reference after Service has been really added tmpSecret := v1.Secret{} err = qjrSecret.refManager.AddReference(qjobRes, &tmpSecret) if err != nil { @@ -331,7 +330,7 @@ func (qjrSecret *QueueJobResSecret) deleteQueueJobResSecrets(qjobRes *arbv1.AppW return nil } -//Cleanup deletes all services +// Cleanup deletes all services func (qjrSecret *QueueJobResSecret) Cleanup(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { return qjrSecret.deleteQueueJobResSecrets(qjobRes, queuejob) } diff --git a/pkg/controller/queuejobresources/service/service.go b/pkg/controller/queuejobresources/service/service.go index 9592ecd0e..f1a654727 100644 --- a/pkg/controller/queuejobresources/service/service.go +++ b/pkg/controller/queuejobresources/service/service.go @@ -1,16 +1,4 @@ /* -Copyright 2017 The Kubernetes Authors. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,11 +13,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + package service import ( "context" "fmt" + "sync" + "time" arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" clientset "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" @@ -40,9 +31,6 @@ import ( "k8s.io/apimachinery/pkg/api/errors" "k8s.io/klog/v2" - "sync" - "time" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" @@ -67,7 +55,7 @@ const ( ControllerUIDLabel string = "controller-uid" ) -//QueueJobResService contains service info +// QueueJobResService contains service info type QueueJobResService struct { clients *kubernetes.Clientset arbclients *clientset.Clientset @@ -80,14 +68,14 @@ type QueueJobResService struct { refManager queuejobresources.RefManager } -//Register registers a queue job resource type +// Register registers a queue job resource type func Register(regs *queuejobresources.RegisteredResources) { regs.Register(arbv1.ResourceTypeService, func(config *rest.Config) queuejobresources.Interface { return NewQueueJobResService(config) }) } -//NewQueueJobResService creates a service controller +// NewQueueJobResService creates a service controller func 
NewQueueJobResService(config *rest.Config) queuejobresources.Interface { qjrService := &QueueJobResService{ clients: kubernetes.NewForConfigOrDie(config), @@ -147,7 +135,7 @@ func (qjrService *QueueJobResService) deleteService(obj interface{}) { return } -func (qjrService *QueueJobResService) GetAggregatedResourcesByPriority(priority float64, job *arbv1.AppWrapper) *clusterstateapi.Resource { +func (qjrService *QueueJobResService) GetAggregatedResourcesByPriority(priority int32, job *arbv1.AppWrapper) *clusterstateapi.Resource { total := clusterstateapi.EmptyResource() return total } @@ -219,7 +207,7 @@ func (qjrService *QueueJobResService) SyncQueueJob(queuejob *arbv1.AppWrapper, q klog.V(4).Infof("QJob: %s had %d Services and %d desired Services", queuejob.Name, serviceLen, replicas) if diff > 0 { - //TODO: need set reference after Service has been really added + // TODO: need set reference after Service has been really added tmpService := v1.Service{} err = qjrService.refManager.AddReference(qjobRes, &tmpService) if err != nil { @@ -331,7 +319,7 @@ func (qjrService *QueueJobResService) deleteQueueJobResServices(qjobRes *arbv1.A return nil } -//Cleanup deletes all services +// Cleanup deletes all services func (qjrService *QueueJobResService) Cleanup(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { return qjrService.deleteQueueJobResServices(qjobRes, queuejob) } diff --git a/pkg/controller/queuejobresources/statefulset/statefulset.go b/pkg/controller/queuejobresources/statefulset/statefulset.go index a47c604a9..7d14b9266 100644 --- a/pkg/controller/queuejobresources/statefulset/statefulset.go +++ b/pkg/controller/queuejobresources/statefulset/statefulset.go @@ -30,13 +30,12 @@ package statefulset import ( "context" "fmt" + "sync" + "time" arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources" - "sync" 
- "time" - clusterstateapi "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/clusterstate/api" apps "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" @@ -69,7 +68,7 @@ const ( ControllerUIDLabel string = "controller-uid" ) -//QueueJobResStatefulSet - stateful sets +// QueueJobResStatefulSet - stateful sets type QueueJobResStatefulSet struct { clients *kubernetes.Clientset arbclients *clientset.Clientset @@ -89,7 +88,7 @@ func Register(regs *queuejobresources.RegisteredResources) { }) } -//NewQueueJobResStatefulSet - creates a controller for SS +// NewQueueJobResStatefulSet - creates a controller for SS func NewQueueJobResStatefulSet(config *rest.Config) queuejobresources.Interface { qjrd := &QueueJobResStatefulSet{ clients: kubernetes.NewForConfigOrDie(config), @@ -129,7 +128,7 @@ func (qjrStatefulSet *QueueJobResStatefulSet) Run(stopCh <-chan struct{}) { qjrStatefulSet.deployInformer.Informer().Run(stopCh) } -//GetPodTemplate Parse queue job api object to get Pod template +// GetPodTemplate Parse queue job api object to get Pod template func (qjrStatefulSet *QueueJobResStatefulSet) GetPodTemplate(qjobRes *arbv1.AppWrapperResource) (*v1.PodTemplateSpec, int32, error) { res, err := qjrStatefulSet.getStatefulSetTemplate(qjobRes) if err != nil { @@ -141,13 +140,13 @@ func (qjrStatefulSet *QueueJobResStatefulSet) GetPodTemplate(qjobRes *arbv1.AppW func (qjrStatefulSet *QueueJobResStatefulSet) GetAggregatedResources(queueJob *arbv1.AppWrapper) *clusterstateapi.Resource { total := clusterstateapi.EmptyResource() if queueJob.Spec.AggrResources.Items != nil { - //calculate scaling + // calculate scaling for _, ar := range queueJob.Spec.AggrResources.Items { if ar.Type == arbv1.ResourceTypeStatefulSet { podTemplate, replicas, _ := qjrStatefulSet.GetPodTemplate(&ar) myres := queuejobresources.GetPodResources(podTemplate) - myres.MilliCPU = float64(replicas) * myres.MilliCPU - myres.Memory = float64(replicas) * myres.Memory + myres.MilliCPU = 
int64(replicas) * myres.MilliCPU + myres.Memory = int64(replicas) * myres.Memory myres.GPU = int64(replicas) * myres.GPU total = total.Add(myres) } @@ -156,10 +155,10 @@ func (qjrStatefulSet *QueueJobResStatefulSet) GetAggregatedResources(queueJob *a return total } -func (qjrStatefulSet *QueueJobResStatefulSet) GetAggregatedResourcesByPriority(priority float64, queueJob *arbv1.AppWrapper) *clusterstateapi.Resource { +func (qjrStatefulSet *QueueJobResStatefulSet) GetAggregatedResourcesByPriority(priority int32, queueJob *arbv1.AppWrapper) *clusterstateapi.Resource { total := clusterstateapi.EmptyResource() if queueJob.Spec.AggrResources.Items != nil { - //calculate scaling + // calculate scaling for _, ar := range queueJob.Spec.AggrResources.Items { if ar.Priority < priority { continue @@ -167,8 +166,8 @@ func (qjrStatefulSet *QueueJobResStatefulSet) GetAggregatedResourcesByPriority(p if ar.Type == arbv1.ResourceTypeStatefulSet { podTemplate, replicas, _ := qjrStatefulSet.GetPodTemplate(&ar) myres := queuejobresources.GetPodResources(podTemplate) - myres.MilliCPU = float64(replicas) * myres.MilliCPU - myres.Memory = float64(replicas) * myres.Memory + myres.MilliCPU = int64(replicas) * myres.MilliCPU + myres.Memory = int64(replicas) * myres.Memory myres.GPU = int64(replicas) * myres.GPU total = total.Add(myres) } @@ -250,7 +249,7 @@ func (qjrStatefulSet *QueueJobResStatefulSet) SyncQueueJob(queuejob *arbv1.AppWr klog.V(4).Infof("QJob: %s had %d StatefulSets and %d desired StatefulSets", queuejob.Name, statefulSetLen, replicas) if diff > 0 { - //TODO: need set reference after Service has been really added + // TODO: need set reference after Service has been really added tmpStatefulSet := apps.StatefulSet{} err = qjrStatefulSet.refManager.AddReference(qjobRes, &tmpStatefulSet) if err != nil { @@ -365,7 +364,7 @@ func (qjrStatefulSet *QueueJobResStatefulSet) deleteQueueJobResStatefulSets(qjob return nil } -//Cleanup deletes all services +// Cleanup deletes all services 
func (qjrStatefulSet *QueueJobResStatefulSet) Cleanup(queuejob *arbv1.AppWrapper, qjobRes *arbv1.AppWrapperResource) error { return qjrStatefulSet.deleteQueueJobResStatefulSets(qjobRes, queuejob) } diff --git a/pkg/controller/quota/quotaforestmanager/qm_lib_backend_with_quotasubt_mgr.go b/pkg/controller/quota/quotaforestmanager/qm_lib_backend_with_quotasubt_mgr.go index d1c9b96dc..a34b36b2e 100644 --- a/pkg/controller/quota/quotaforestmanager/qm_lib_backend_with_quotasubt_mgr.go +++ b/pkg/controller/quota/quotaforestmanager/qm_lib_backend_with_quotasubt_mgr.go @@ -20,6 +20,7 @@ package quotaforestmanager import ( "bytes" "fmt" + "reflect" "strings" "github.com/hashicorp/go-multierror" @@ -34,9 +35,6 @@ import ( "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/quotaplugins/util" "k8s.io/client-go/rest" - "math" - "reflect" - "k8s.io/klog/v2" ) @@ -335,18 +333,6 @@ func (qm *QuotaManager) convertInt64Demand(int64Demand int64) (int, error) { } } -func (qm *QuotaManager) convertFloat64Demand(floatDemand float64) (int, error) { - var err error - err = nil - if floatDemand > float64(MaxInt) { - err = fmt.Errorf("demand %f is larger than Max Quota Management Backend size, resetting demand to %d", - floatDemand, MaxInt) - return MaxInt, err - } else { - return int(math.Trunc(floatDemand)), err - } -} - func (qm *QuotaManager) getQuotaTreeResourceTypesDemands(awResDemands *clusterstateapi.Resource, treeToResourceTypes []string) (map[string]int, error) { demands := map[string]int{} var err error @@ -358,7 +344,7 @@ func (qm *QuotaManager) getQuotaTreeResourceTypesDemands(awResDemands *clusterst // CPU Demands if strings.Contains(strings.ToLower(treeResourceType), "cpu") { // Handle type conversions - demand, converErr := qm.convertFloat64Demand(awResDemands.MilliCPU) + demand, converErr := qm.convertInt64Demand(awResDemands.MilliCPU) if converErr != nil { if err == nil { err = fmt.Errorf("resource type: %s %s", @@ -375,7 +361,7 @@ func (qm 
*QuotaManager) getQuotaTreeResourceTypesDemands(awResDemands *clusterst // Memory Demands if strings.Contains(strings.ToLower(treeResourceType), "memory") { // Handle type conversions - demand, converErr := qm.convertFloat64Demand(awResDemands.Memory) + demand, converErr := qm.convertInt64Demand(awResDemands.Memory) if converErr != nil { if err == nil { err = fmt.Errorf("resource type: %s %s", diff --git a/pkg/quotaplugins/quota-simple-rest/quota_rest_manager.go b/pkg/quotaplugins/quota-simple-rest/quota_rest_manager.go index 855ee1372..db0c15d2c 100644 --- a/pkg/quotaplugins/quota-simple-rest/quota_rest_manager.go +++ b/pkg/quotaplugins/quota-simple-rest/quota_rest_manager.go @@ -23,6 +23,12 @@ import ( "bytes" "encoding/json" "fmt" + "io/ioutil" + "net/http" + "net/http/httputil" + "reflect" + "strings" + "time" "github.com/project-codeflare/multi-cluster-app-dispatcher/cmd/kar-controllers/app/options" arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" @@ -31,14 +37,6 @@ import ( "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/quota" "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/quotaplugins/util" - "io/ioutil" - "math" - "net/http" - "net/http/httputil" - "reflect" - "strings" - "time" - "k8s.io/client-go/rest" "k8s.io/klog/v2" ) @@ -272,8 +270,8 @@ func (qm *QuotaManager) Fits(aw *arbv1.AppWrapper, awResDemands *clusterstateapi groups := qm.getQuotaDesignation(aw) preemptable := qm.preemptionEnabled - awCPU_Demand := int(math.Trunc(awResDemands.MilliCPU)) - awMem_Demand := int(math.Trunc(awResDemands.Memory) / 1000000) + awCPU_Demand := int(awResDemands.MilliCPU) + awMem_Demand := int(awResDemands.Memory / 1000000) var demand []int demand = append(demand, awCPU_Demand) demand = append(demand, awMem_Demand) From 2816880fe95985ece1e43f49bc0f4066c527293c Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Tue, 4 Jul 2023 15:24:16 +0200 Subject: [PATCH 
06/10] Fix resource histogram creation --- pkg/controller/clusterstate/api/histogram_info.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/controller/clusterstate/api/histogram_info.go b/pkg/controller/clusterstate/api/histogram_info.go index 10640e43f..d5f9388bd 100644 --- a/pkg/controller/clusterstate/api/histogram_info.go +++ b/pkg/controller/clusterstate/api/histogram_info.go @@ -33,7 +33,7 @@ type ResourceHistogram struct { } func NewResourceHistogram(min *Resource, max *Resource) *ResourceHistogram { - start := max.MilliCPU + start := min.MilliCPU width := 1.0 count := 2 diff := max.MilliCPU - min.MilliCPU @@ -42,13 +42,13 @@ func NewResourceHistogram(min *Resource, max *Resource) *ResourceHistogram { width = float64(diff) / (BucketCount - 1) count = BucketCount + 1 } - klog.V(10).Infof("[NewResourceHistogram] Start histogram numbers for CPU: start=%f, width=%f, count=%d", + klog.V(10).Infof("[NewResourceHistogram] Start histogram numbers for CPU: start=%d, width=%f, count=%d", start, width, count) millicpuHist := prometheus.NewHistogram(prometheus.HistogramOpts{ Name: "millicpu", Buckets: prometheus.LinearBuckets(float64(start), width, count)}) - start = max.Memory + start = min.Memory width = 1.0 count = 2 diff = max.Memory - min.Memory @@ -57,22 +57,22 @@ func NewResourceHistogram(min *Resource, max *Resource) *ResourceHistogram { width = float64(diff) / (BucketCount - 1) count = BucketCount + 1 } - klog.V(10).Infof("[NewResourceHistogram] Start histogram numbers for Memory: start=%f, width=%f, count=%d", + klog.V(10).Infof("[NewResourceHistogram] Start histogram numbers for Memory: start=%d, width=%f, count=%d", start, width, count) memoryHist := prometheus.NewHistogram(prometheus.HistogramOpts{ Name: "memory", Buckets: prometheus.LinearBuckets(float64(start), width, count)}) - start = max.GPU + start = min.GPU width = 1.0 count = 2 diff = max.GPU - min.GPU - if diff >= 0 { + if diff > 0 { start = min.GPU width = 
float64(diff) / (BucketCount - 1) count = BucketCount + 1 } - klog.V(10).Infof("[NewResourceHistogram] Start histogram numbers for GPU: start=%f, width=%f, count=%d", + klog.V(10).Infof("[NewResourceHistogram] Start histogram numbers for GPU: start=%d, width=%f, count=%d", start, width, count) gpuHist := prometheus.NewHistogram(prometheus.HistogramOpts{ Name: "gpu", From 581abe8443f6907485e656fa20743ba5a551ce28 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Tue, 4 Jul 2023 15:25:09 +0200 Subject: [PATCH 07/10] api: Remove AppWrapper CRD sub-resource --- config/crd/bases/mcad.ibm.com_appwrappers.yaml | 2 -- deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml | 2 -- pkg/apis/controller/v1beta1/appwrapper.go | 1 - 3 files changed, 5 deletions(-) diff --git a/config/crd/bases/mcad.ibm.com_appwrappers.yaml b/config/crd/bases/mcad.ibm.com_appwrappers.yaml index 76a82df98..f0e71b0ba 100644 --- a/config/crd/bases/mcad.ibm.com_appwrappers.yaml +++ b/config/crd/bases/mcad.ibm.com_appwrappers.yaml @@ -838,5 +838,3 @@ spec: type: object served: true storage: true - subresources: - status: {} diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml index 76a82df98..f0e71b0ba 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml @@ -838,5 +838,3 @@ spec: type: object served: true storage: true - subresources: - status: {} diff --git a/pkg/apis/controller/v1beta1/appwrapper.go b/pkg/apis/controller/v1beta1/appwrapper.go index c97ab82c1..e46f11639 100644 --- a/pkg/apis/controller/v1beta1/appwrapper.go +++ b/pkg/apis/controller/v1beta1/appwrapper.go @@ -28,7 +28,6 @@ const AppWrapperPlural string = "appwrappers" // which AppWrapper it belongs to. 
const AppWrapperAnnotationKey = "appwrapper.mcad.ibm.com/appwrapper-name" -// +kubebuilder:subresource:status // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object // Definition of AppWrapper class From 9eb1da1c3c0b14521b09900513e414e6c25f0778 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Tue, 4 Jul 2023 15:47:56 +0200 Subject: [PATCH 08/10] api: Remove object meta fields from AppWrapperResourceList --- .../crd/bases/mcad.ibm.com_appwrappers.yaml | 61 ------------------- .../crds/mcad.ibm.com_appwrappers.yaml | 61 ------------------- pkg/apis/controller/v1beta1/appwrapper.go | 3 - .../v1beta1/zz_generated.deepcopy.go | 2 - 4 files changed, 127 deletions(-) diff --git a/config/crd/bases/mcad.ibm.com_appwrappers.yaml b/config/crd/bases/mcad.ibm.com_appwrappers.yaml index f0e71b0ba..4f46af02f 100644 --- a/config/crd/bases/mcad.ibm.com_appwrappers.yaml +++ b/config/crd/bases/mcad.ibm.com_appwrappers.yaml @@ -170,67 +170,6 @@ spec: - template type: object type: array - apiVersion: - description: 'APIVersion defines the versioned schema of this - representation of an object. Servers should convert recognized - schemas to the latest internal value, and may reject unrecognized - values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' - type: string - kind: - description: 'Kind is a string value representing the REST resource - this object represents. Servers may infer this from the endpoint - the client submits requests to. Cannot be updated. In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - metadata: - description: ListMeta describes metadata that synthetic resources - must have, including lists and various status objects. A resource - may have only one of {ObjectMeta, ListMeta}. 
- properties: - continue: - description: continue may be set if the user set a limit on - the number of items returned, and indicates that the server - has more data available. The value is opaque and may be - used to issue another request to the endpoint that served - this list to retrieve the next set of available objects. - Continuing a consistent list may not be possible if the - server configuration has changed or more than a few minutes - have passed. The resourceVersion field returned when using - this continue value will be identical to the value in the - first response, unless you have received this token from - an error message. - type: string - remainingItemCount: - description: remainingItemCount is the number of subsequent - items in the list which are not included in this list response. - If the list request contained label or field selectors, - then the number of remaining items is unknown and the field - will be left unset and omitted during serialization. If - the list is complete (either because it is not chunking - or because this is the last chunk), then there are no more - remaining items and this field will be left unset and omitted - during serialization. Servers older than v1.15 do not set - this field. The intended use of the remainingItemCount is - *estimating* the size of a collection. Clients should not - rely on the remainingItemCount to be set or to be exact. - format: int64 - type: integer - resourceVersion: - description: 'String that identifies the server''s internal - version of this object that can be used by clients to determine - when objects have changed. Value must be treated as opaque - by clients and passed unmodified back to the server. Populated - by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency' - type: string - selfLink: - description: "selfLink is a URL representing this object. - Populated by the system. 
Read-only. \n DEPRECATED Kubernetes - will stop propagating this field in 1.20 release and the - field is planned to be removed in 1.21 release." - type: string - type: object - required: - - metadata type: object schedulingSpec: description: SchedSpec specifies the parameters used for scheduling diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml index f0e71b0ba..4f46af02f 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml @@ -170,67 +170,6 @@ spec: - template type: object type: array - apiVersion: - description: 'APIVersion defines the versioned schema of this - representation of an object. Servers should convert recognized - schemas to the latest internal value, and may reject unrecognized - values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' - type: string - kind: - description: 'Kind is a string value representing the REST resource - this object represents. Servers may infer this from the endpoint - the client submits requests to. Cannot be updated. In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - metadata: - description: ListMeta describes metadata that synthetic resources - must have, including lists and various status objects. A resource - may have only one of {ObjectMeta, ListMeta}. - properties: - continue: - description: continue may be set if the user set a limit on - the number of items returned, and indicates that the server - has more data available. The value is opaque and may be - used to issue another request to the endpoint that served - this list to retrieve the next set of available objects. 
- Continuing a consistent list may not be possible if the - server configuration has changed or more than a few minutes - have passed. The resourceVersion field returned when using - this continue value will be identical to the value in the - first response, unless you have received this token from - an error message. - type: string - remainingItemCount: - description: remainingItemCount is the number of subsequent - items in the list which are not included in this list response. - If the list request contained label or field selectors, - then the number of remaining items is unknown and the field - will be left unset and omitted during serialization. If - the list is complete (either because it is not chunking - or because this is the last chunk), then there are no more - remaining items and this field will be left unset and omitted - during serialization. Servers older than v1.15 do not set - this field. The intended use of the remainingItemCount is - *estimating* the size of a collection. Clients should not - rely on the remainingItemCount to be set or to be exact. - format: int64 - type: integer - resourceVersion: - description: 'String that identifies the server''s internal - version of this object that can be used by clients to determine - when objects have changed. Value must be treated as opaque - by clients and passed unmodified back to the server. Populated - by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency' - type: string - selfLink: - description: "selfLink is a URL representing this object. - Populated by the system. Read-only. \n DEPRECATED Kubernetes - will stop propagating this field in 1.20 release and the - field is planned to be removed in 1.21 release." 
- type: string - type: object - required: - - metadata type: object schedulingSpec: description: SchedSpec specifies the parameters used for scheduling diff --git a/pkg/apis/controller/v1beta1/appwrapper.go b/pkg/apis/controller/v1beta1/appwrapper.go index e46f11639..85e2e2cec 100644 --- a/pkg/apis/controller/v1beta1/appwrapper.go +++ b/pkg/apis/controller/v1beta1/appwrapper.go @@ -70,9 +70,6 @@ type AppWrapperSpec struct { // a collection of AppWrapperResource type AppWrapperResourceList struct { - metav1.TypeMeta `json:",inline"` - metav1.ListMeta `json:"metadata"` - // +optional Items []AppWrapperResource `json:"Items"` // +optional diff --git a/pkg/apis/controller/v1beta1/zz_generated.deepcopy.go b/pkg/apis/controller/v1beta1/zz_generated.deepcopy.go index 632be6ece..46ec8adc0 100644 --- a/pkg/apis/controller/v1beta1/zz_generated.deepcopy.go +++ b/pkg/apis/controller/v1beta1/zz_generated.deepcopy.go @@ -175,8 +175,6 @@ func (in *AppWrapperResource) DeepCopy() *AppWrapperResource { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *AppWrapperResourceList) DeepCopyInto(out *AppWrapperResourceList) { *out = *in - out.TypeMeta = in.TypeMeta - in.ListMeta.DeepCopyInto(&out.ListMeta) if in.Items != nil { in, out := &in.Items, &out.Items *out = make([]AppWrapperResource, len(*in)) From 1ec142df961f4b39f8bb02ee77637c1773832cd7 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Wed, 5 Jul 2023 09:34:41 +0200 Subject: [PATCH 09/10] api: ObjectMeta is required on AppWrapperResource --- Makefile | 2 +- .../crd/bases/mcad.ibm.com_appwrappers.yaml | 20 +++++++++++ config/crd/bases/mcad.ibm.com_queuejobs.yaml | 34 +++++++++++++++++++ .../crds/mcad.ibm.com_appwrappers.yaml | 20 +++++++++++ .../crds/mcad.ibm.com_queuejobs.yaml | 34 +++++++++++++++++++ pkg/apis/controller/v1beta1/appwrapper.go | 2 ++ .../v1beta1/zz_generated.deepcopy.go | 1 + 7 files changed, 112 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5b6f84d94..582be3742 100644 --- a/Makefile +++ b/Makefile @@ -85,7 +85,7 @@ $(CONTROLLER_GEN): $(LOCALBIN) .PHONY: manifests manifests: controller-gen ## Generate CustomResourceDefinition objects. - $(CONTROLLER_GEN) crd:allowDangerousTypes=true paths="./pkg/apis/..." output:crd:artifacts:config=config/crd/bases + $(CONTROLLER_GEN) crd:allowDangerousTypes=true,generateEmbeddedObjectMeta=true paths="./pkg/apis/..." 
output:crd:artifacts:config=config/crd/bases generate-code: pkg/apis/controller/v1beta1/zz_generated.deepcopy.go diff --git a/config/crd/bases/mcad.ibm.com_appwrappers.yaml b/config/crd/bases/mcad.ibm.com_appwrappers.yaml index 4f46af02f..d60d40da4 100644 --- a/config/crd/bases/mcad.ibm.com_appwrappers.yaml +++ b/config/crd/bases/mcad.ibm.com_appwrappers.yaml @@ -138,6 +138,25 @@ spec: resource type format: int32 type: integer + metadata: + properties: + annotations: + additionalProperties: + type: string + type: object + finalizers: + items: + type: string + type: array + labels: + additionalProperties: + type: string + type: object + name: + type: string + namespace: + type: string + type: object minavailable: description: The minimal available pods to run for this AppWrapper; the default value is nil @@ -167,6 +186,7 @@ spec: Pod, a ReplicaSet, a ... ?) type: string required: + - metadata - template type: object type: array diff --git a/config/crd/bases/mcad.ibm.com_queuejobs.yaml b/config/crd/bases/mcad.ibm.com_queuejobs.yaml index 486f7e2d2..47a948190 100644 --- a/config/crd/bases/mcad.ibm.com_queuejobs.yaml +++ b/config/crd/bases/mcad.ibm.com_queuejobs.yaml @@ -166,6 +166,23 @@ spec: properties: metadata: description: 'Standard object''s metadata. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata' + properties: + annotations: + additionalProperties: + type: string + type: object + finalizers: + items: + type: string + type: array + labels: + additionalProperties: + type: string + type: object + name: + type: string + namespace: + type: string type: object spec: description: 'Specification of the desired behavior of the @@ -5689,6 +5706,23 @@ spec: that will be copied into the PVC when creating it. No other fields are allowed and will be rejected during validation. 
+ properties: + annotations: + additionalProperties: + type: string + type: object + finalizers: + items: + type: string + type: array + labels: + additionalProperties: + type: string + type: object + name: + type: string + namespace: + type: string type: object spec: description: The specification for the diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml index 4f46af02f..d60d40da4 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_appwrappers.yaml @@ -138,6 +138,25 @@ spec: resource type format: int32 type: integer + metadata: + properties: + annotations: + additionalProperties: + type: string + type: object + finalizers: + items: + type: string + type: array + labels: + additionalProperties: + type: string + type: object + name: + type: string + namespace: + type: string + type: object minavailable: description: The minimal available pods to run for this AppWrapper; the default value is nil @@ -167,6 +186,7 @@ spec: Pod, a ReplicaSet, a ... ?) type: string required: + - metadata - template type: object type: array diff --git a/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml b/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml index 486f7e2d2..47a948190 100644 --- a/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml +++ b/deployment/mcad-controller/crds/mcad.ibm.com_queuejobs.yaml @@ -166,6 +166,23 @@ spec: properties: metadata: description: 'Standard object''s metadata. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata' + properties: + annotations: + additionalProperties: + type: string + type: object + finalizers: + items: + type: string + type: array + labels: + additionalProperties: + type: string + type: object + name: + type: string + namespace: + type: string type: object spec: description: 'Specification of the desired behavior of the @@ -5689,6 +5706,23 @@ spec: that will be copied into the PVC when creating it. No other fields are allowed and will be rejected during validation. + properties: + annotations: + additionalProperties: + type: string + type: object + finalizers: + items: + type: string + type: array + labels: + additionalProperties: + type: string + type: object + name: + type: string + namespace: + type: string type: object spec: description: The specification for the diff --git a/pkg/apis/controller/v1beta1/appwrapper.go b/pkg/apis/controller/v1beta1/appwrapper.go index 85e2e2cec..5ff5b242e 100644 --- a/pkg/apis/controller/v1beta1/appwrapper.go +++ b/pkg/apis/controller/v1beta1/appwrapper.go @@ -84,6 +84,8 @@ type AppWrapperService struct { // AppWrapperResource is App Wrapper aggregation resource // TODO: To be deprecated type AppWrapperResource struct { + metav1.ObjectMeta `json:"metadata"` + // Replicas is the number of desired replicas Replicas int32 `json:"replicas,omitempty" protobuf:"bytes,2,opt,name=replicas"` diff --git a/pkg/apis/controller/v1beta1/zz_generated.deepcopy.go b/pkg/apis/controller/v1beta1/zz_generated.deepcopy.go index 46ec8adc0..94b840244 100644 --- a/pkg/apis/controller/v1beta1/zz_generated.deepcopy.go +++ b/pkg/apis/controller/v1beta1/zz_generated.deepcopy.go @@ -153,6 +153,7 @@ func (in *AppWrapperList) DeepCopyObject() runtime.Object { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *AppWrapperResource) DeepCopyInto(out *AppWrapperResource) { *out = *in + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) if in.MinAvailable != nil { in, out := &in.MinAvailable, &out.MinAvailable *out = new(int32) From 90b0753a5781d6fe7e66273e410d4355bfb93e66 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Wed, 5 Jul 2023 09:35:13 +0200 Subject: [PATCH 10/10] test: Remove ObjectMeta from AppWrapperGenericResource --- test/e2e/util.go | 148 +++++++---------------------------------------- 1 file changed, 21 insertions(+), 127 deletions(-) diff --git a/test/e2e/util.go b/test/e2e/util.go index 9dd2248af..9d10e6b0e 100644 --- a/test/e2e/util.go +++ b/test/e2e/util.go @@ -1,19 +1,4 @@ /* -Copyright 2017 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -/* Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ + package e2e import ( @@ -117,7 +103,7 @@ func initTestContext() *context { Name: cxt.namespace, }, }, metav1.CreateOptions{}) - //Expect(err).NotTo(HaveOccurred()) + // Expect(err).NotTo(HaveOccurred()) /* _, err = cxt.kubeclient.SchedulingV1beta1().PriorityClasses().Create(gcontext.Background(), &schedv1.PriorityClass{ ObjectMeta: metav1.ObjectMeta{ @@ -151,7 +137,7 @@ func namespaceNotExist(ctx *context) wait.ConditionFunc { } func cleanupTestContextExtendedTime(cxt *context, seconds time.Duration) { - //foreground := metav1.DeletePropagationForeground + // foreground := metav1.DeletePropagationForeground /* err := cxt.kubeclient.CoreV1().Namespaces().Delete(gcontext.Background(), cxt.namespace, metav1.DeleteOptions{ PropagationPolicy: &foreground, }) @@ -172,7 +158,7 @@ func cleanupTestContextExtendedTime(cxt *context, seconds time.Duration) { // if err != nil { // fmt.Fprintf(GinkgoWriter, "[cleanupTestContextExtendedTime] Failure check for namespace: %s.\n", cxt.namespace) // } - //Expect(err).NotTo(HaveOccurred()) + // Expect(err).NotTo(HaveOccurred()) } func cleanupTestContext(cxt *context) { @@ -274,10 +260,6 @@ func createGenericAWTimeoutWithStatus(context *context, name string) *arbv1.AppW AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "aw-test-jobtimeout-with-comp-1-job"), - Namespace: "test", - }, DesiredAvailable: 1, GenericTemplate: runtime.RawExtension{ Raw: rb, @@ -408,8 +390,8 @@ func anyPodsExist(ctx *context, awNamespace string, awName string) wait.Conditio // First find a pod from the list that is part of the AW if awn, found := podFromPodList.Labels["appwrapper.mcad.ibm.com"]; !found || awn != awName { - //DEBUG fmt.Fprintf(GinkgoWriter, "[anyPodsExist] Pod %s in phase: %s not part of AppWrapper: %s, labels: %#v\n", - //DEBUG podFromPodList.Name, podFromPodList.Status.Phase, awName, podFromPodList.Labels) + // DEBUG 
fmt.Fprintf(GinkgoWriter, "[anyPodsExist] Pod %s in phase: %s not part of AppWrapper: %s, labels: %#v\n", + // DEBUG podFromPodList.Name, podFromPodList.Status.Phase, awName, podFromPodList.Labels) continue } podExistsNum++ @@ -515,7 +497,7 @@ func cleanupTestObjectsVerbose(context *context, appwrappers []*arbv1.AppWrapper } for _, aw := range appwrappers { - //context.karclient.ArbV1().AppWrappers(context.namespace).Delete(aw.Name, &metav1.DeleteOptions{PropagationPolicy: &foreground}) + // context.karclient.ArbV1().AppWrappers(context.namespace).Delete(aw.Name, &metav1.DeleteOptions{PropagationPolicy: &foreground}) pods := getPodsOfAppWrapper(context, aw) awNamespace := aw.Namespace @@ -576,9 +558,9 @@ func awPodPhase(ctx *context, aw *arbv1.AppWrapper, phase []v1.PodPhase, taskNum for _, p := range phase { if pod.Status.Phase == p { - //DEBUGif quite { - //DEBUG fmt.Fprintf(GinkgoWriter, "[awPodPhase] Found pod %s of AppWrapper: %s, phase: %v\n", pod.Name, aw.Name, p) - //DEBUG} + // DEBUGif quite { + // DEBUG fmt.Fprintf(GinkgoWriter, "[awPodPhase] Found pod %s of AppWrapper: %s, phase: %v\n", pod.Name, aw.Name, p) + // DEBUG} readyTaskNum++ break } else { @@ -604,9 +586,9 @@ func awPodPhase(ctx *context, aw *arbv1.AppWrapper, phase []v1.PodPhase, taskNum } } - //DEBUGif taskNum <= readyTaskNum && quite { - //DEBUG fmt.Fprintf(GinkgoWriter, "[awPodPhase] Successfully found %v podList of AppWrapper: %s, state: %s\n", readyTaskNum, aw.Name, aw.Status.State) - //DEBUG} + // DEBUGif taskNum <= readyTaskNum && quite { + // DEBUG fmt.Fprintf(GinkgoWriter, "[awPodPhase] Successfully found %v podList of AppWrapper: %s, state: %s\n", readyTaskNum, aw.Name, aw.Status.State) + // DEBUG} return taskNum <= readyTaskNum, nil } @@ -903,10 +885,6 @@ func createJobAWWithInitContainer(context *context, name string, requeuingTimeIn AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: name, - 
Namespace: context.namespace, - }, DesiredAvailable: 1, GenericTemplate: runtime.RawExtension{ Raw: rb, @@ -1703,10 +1681,6 @@ func createGenericDeploymentAW(context *context, name string) *arbv1.AppWrapper AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "aw-generic-deployment-3-item1"), - Namespace: context.namespace, - }, DesiredAvailable: 1, GenericTemplate: runtime.RawExtension{ Raw: rb, @@ -1772,7 +1746,7 @@ func createGenericJobAWWithStatus(context *context, name string) *arbv1.AppWrapp } } }`) - //var schedSpecMin int = 1 + // var schedSpecMin int = 1 aw := &arbv1.AppWrapper{ ObjectMeta: metav1.ObjectMeta{ @@ -1781,15 +1755,11 @@ func createGenericJobAWWithStatus(context *context, name string) *arbv1.AppWrapp }, Spec: arbv1.AppWrapperSpec{ SchedSpec: arbv1.SchedulingSpecTemplate{ - //MinAvailable: schedSpecMin, + // MinAvailable: schedSpecMin, }, AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "aw-test-job-with-comp-1"), - Namespace: "test", - }, DesiredAvailable: 1, GenericTemplate: runtime.RawExtension{ Raw: rb, @@ -1913,10 +1883,6 @@ func createGenericJobAWWithMultipleStatus(context *context, name string) *arbv1. AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "aw-test-job-with-comp-ms-21-1"), - Namespace: "test", - }, DesiredAvailable: 1, GenericTemplate: runtime.RawExtension{ Raw: rb, @@ -1924,10 +1890,6 @@ func createGenericJobAWWithMultipleStatus(context *context, name string) *arbv1. 
CompletionStatus: "Complete", }, { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "aw-test-job-with-comp-ms-21-2"), - Namespace: "test", - }, DesiredAvailable: 1, GenericTemplate: runtime.RawExtension{ Raw: rb2, @@ -1973,10 +1935,6 @@ func createAWGenericItemWithoutStatus(context *context, name string) *arbv1.AppW AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "aw-test-job-with-comp-44"), - Namespace: "test", - }, DesiredAvailable: 1, GenericTemplate: runtime.RawExtension{ Raw: rb, @@ -2056,10 +2014,6 @@ func createGenericJobAWWithScheduleSpec(context *context, name string) *arbv1.Ap AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "aw-test-job-with-scheduling-spec"), - Namespace: "test", - }, GenericTemplate: runtime.RawExtension{ Raw: rb, }, @@ -2125,7 +2079,7 @@ func createGenericJobAWtWithLargeCompute(context *context, name string) *arbv1.A } } }`) - //var schedSpecMin int = 1 + // var schedSpecMin int = 1 aw := &arbv1.AppWrapper{ ObjectMeta: metav1.ObjectMeta{ @@ -2134,20 +2088,16 @@ func createGenericJobAWtWithLargeCompute(context *context, name string) *arbv1.A }, Spec: arbv1.AppWrapperSpec{ SchedSpec: arbv1.SchedulingSpecTemplate{ - //MinAvailable: schedSpecMin, + // MinAvailable: schedSpecMin, }, AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "aw-test-job-with-large-comp-1"), - Namespace: "test", - }, DesiredAvailable: 1, GenericTemplate: runtime.RawExtension{ Raw: rb, }, - //CompletionStatus: "Complete", + // CompletionStatus: "Complete", }, }, }, @@ -2200,7 +2150,7 @@ func createGenericServiceAWWithNoStatus(context *context, name string) *arbv1.Ap "type": "ClusterIP" } }`) - //var 
schedSpecMin int = 1 + // var schedSpecMin int = 1 aw := &arbv1.AppWrapper{ ObjectMeta: metav1.ObjectMeta{ @@ -2209,15 +2159,11 @@ func createGenericServiceAWWithNoStatus(context *context, name string) *arbv1.Ap }, Spec: arbv1.AppWrapperSpec{ SchedSpec: arbv1.SchedulingSpecTemplate{ - //MinAvailable: schedSpecMin, + // MinAvailable: schedSpecMin, }, AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "aw-test-job-with-comp-1"), - Namespace: "test", - }, DesiredAvailable: 1, GenericTemplate: runtime.RawExtension{ Raw: rb, @@ -2332,10 +2278,6 @@ func createGenericDeploymentAWWithMultipleItems(context *context, name string) * AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "aw-deployment-2-status"), - Namespace: "test", - }, DesiredAvailable: 1, GenericTemplate: runtime.RawExtension{ Raw: rb, @@ -2343,10 +2285,6 @@ func createGenericDeploymentAWWithMultipleItems(context *context, name string) * CompletionStatus: "Progressing", }, { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "aw-deployment-3-status"), - Namespace: "test", - }, DesiredAvailable: 1, GenericTemplate: runtime.RawExtension{ Raw: rb1, @@ -2453,10 +2391,6 @@ func createGenericDeploymentAWWithService(context *context, name string) *arbv1. AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "aw-deployment-3-status"), - Namespace: "test", - }, DesiredAvailable: 1, GenericTemplate: runtime.RawExtension{ Raw: rb, @@ -2464,10 +2398,6 @@ func createGenericDeploymentAWWithService(context *context, name string) *arbv1. 
CompletionStatus: "Progressing", }, { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "my-service"), - Namespace: "test", - }, DesiredAvailable: 1, GenericTemplate: runtime.RawExtension{ Raw: rb1, @@ -2547,10 +2477,6 @@ func createGenericDeploymentWithCPUAW(context *context, name string, cpuDemand s AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "item1"), - Namespace: context.namespace, - }, DesiredAvailable: 1, GenericTemplate: runtime.RawExtension{ Raw: rb, @@ -2633,10 +2559,6 @@ func createGenericDeploymentCustomPodResourcesWithCPUAW(context *context, name s AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "item1"), - Namespace: context.namespace, - }, CustomPodResources: []arbv1.CustomPodResourceTemplate{ { Replicas: replicas, @@ -2868,10 +2790,6 @@ func createGenericStatefulSetAW(context *context, name string) *arbv1.AppWrapper AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "item1"), - Namespace: context.namespace, - }, DesiredAvailable: 2, GenericTemplate: runtime.RawExtension{ Raw: rb, @@ -3150,10 +3068,6 @@ func createGenericPodAWCustomDemand(context *context, name string, cpuDemand str AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "item"), - Namespace: context.namespace, - }, GenericTemplate: runtime.RawExtension{ Raw: rb, }, @@ -3223,10 +3137,6 @@ func createGenericPodAW(context *context, name string) *arbv1.AppWrapper { AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", 
name, "item"), - Namespace: context.namespace, - }, GenericTemplate: runtime.RawExtension{ Raw: rb, }, @@ -3298,10 +3208,6 @@ func createGenericPodTooBigAW(context *context, name string) *arbv1.AppWrapper { AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "item"), - Namespace: context.namespace, - }, GenericTemplate: runtime.RawExtension{ Raw: rb, }, @@ -3356,10 +3262,6 @@ func createBadGenericPodAW(context *context, name string) *arbv1.AppWrapper { AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "item"), - Namespace: context.namespace, - }, GenericTemplate: runtime.RawExtension{ Raw: rb, }, @@ -3376,7 +3278,7 @@ func createBadGenericPodAW(context *context, name string) *arbv1.AppWrapper { } func createBadGenericItemAW(context *context, name string) *arbv1.AppWrapper { - //rb := []byte(`""`) + // rb := []byte(`""`) var schedSpecMin int = 1 aw := &arbv1.AppWrapper{ @@ -3391,10 +3293,6 @@ func createBadGenericItemAW(context *context, name string) *arbv1.AppWrapper { AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "item"), - Namespace: context.namespace, - }, // GenericTemplate: runtime.RawExtension{ // Raw: rb, // }, @@ -3456,10 +3354,6 @@ func createBadGenericPodTemplateAW(context *context, name string) (*arbv1.AppWra AggrResources: arbv1.AppWrapperResourceList{ GenericItems: []arbv1.AppWrapperGenericResource{ { - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-%s", name, "item"), - Namespace: context.namespace, - }, DesiredAvailable: 2, GenericTemplate: runtime.RawExtension{ Raw: rb,