From f65c1f6df21b6f55e050fa3a59c1d309c908a243 Mon Sep 17 00:00:00 2001
From: David Grove
Date: Wed, 14 May 2025 14:07:02 -0400
Subject: [PATCH 1/2] doc fix: missed a 2.16 ==> 2.19 update

---
 setup.RHOAI-v2.19/CLUSTER-SETUP.md | 5 ++---
 setup.tmpl/CLUSTER-SETUP.md.tmpl   | 2 +-
 setup.tmpl/RHOAI-v2.16.yaml        | 1 +
 setup.tmpl/RHOAI-v2.17.yaml        | 6 ------
 setup.tmpl/RHOAI-v2.19.yaml        | 1 +
 5 files changed, 5 insertions(+), 10 deletions(-)
 delete mode 100644 setup.tmpl/RHOAI-v2.17.yaml

diff --git a/setup.RHOAI-v2.19/CLUSTER-SETUP.md b/setup.RHOAI-v2.19/CLUSTER-SETUP.md
index 87046a6..6c20fcf 100644
--- a/setup.RHOAI-v2.19/CLUSTER-SETUP.md
+++ b/setup.RHOAI-v2.19/CLUSTER-SETUP.md
@@ -46,7 +46,7 @@ oc get ip -n redhat-ods-operator
 ```
 ```
 NAMESPACE             NAME            CSV                     APPROVAL   APPROVED
-redhat-ods-operator   install-kmh8w   rhods-operator.2.16.0   Manual     false
+redhat-ods-operator   install-kmh8w   rhods-operator.2.19.0   Manual     false
 ```
 Approve install plan replacing the generated plan name below with the actual
 value:
@@ -73,7 +73,7 @@ AI configuration as follows:
   - `manageJobsWithoutQueueName` is enabled,
   - `batch/job` integration is disabled,
   - `waitForPodsReady` is disabled,
-  - `LendingLimit` feature gate is enabled,
+  - `VisibilityOnDemand` feature gate is disabled,
   - `fairSharing` is enabled,
   - `enableClusterQueueResources` metrics is enabled,
 - Codeflare operator:
@@ -82,7 +82,6 @@ AI configuration as follows:
   - `schedulerName` is set to `scheduler-plugins-scheduler`,
   - `queueName` is set to `default-queue`,
   - `slackQueueName` is set to `slack-cluster-queue`
-- pod priorities, resource requests and limits have been adjusted.
 
 
 
diff --git a/setup.tmpl/CLUSTER-SETUP.md.tmpl b/setup.tmpl/CLUSTER-SETUP.md.tmpl
index 1cb3f8d..8af0cd4 100644
--- a/setup.tmpl/CLUSTER-SETUP.md.tmpl
+++ b/setup.tmpl/CLUSTER-SETUP.md.tmpl
@@ -78,7 +78,7 @@ Identify install plan:
 ```
 ```
 NAMESPACE             NAME            CSV                     APPROVAL   APPROVED
-redhat-ods-operator   install-kmh8w   rhods-operator.2.16.0   Manual     false
+redhat-ods-operator   install-kmh8w   rhods-operator.{{ .VERSION_NUMBER }}   Manual     false
 ```
 Approve install plan replacing the generated plan name below with the actual
 value:
diff --git a/setup.tmpl/RHOAI-v2.16.yaml b/setup.tmpl/RHOAI-v2.16.yaml
index 17cff67..4fa393b 100644
--- a/setup.tmpl/RHOAI-v2.16.yaml
+++ b/setup.tmpl/RHOAI-v2.16.yaml
@@ -2,5 +2,6 @@
 
 RHOAI: true
 VERSION: RHOAI-v2.16
+VERSION_NUMBER: 2.16.0
 KUBECTL: oc
 FAIRSHARE: true
diff --git a/setup.tmpl/RHOAI-v2.17.yaml b/setup.tmpl/RHOAI-v2.17.yaml
deleted file mode 100644
index c243c3c..0000000
--- a/setup.tmpl/RHOAI-v2.17.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-# Values for RHOAI 2.17
-
-RHOAI: true
-VERSION: RHOAI-v2.17
-KUBECTL: oc
-FAIRSHARE: true
diff --git a/setup.tmpl/RHOAI-v2.19.yaml b/setup.tmpl/RHOAI-v2.19.yaml
index 0b54073..ba5c840 100644
--- a/setup.tmpl/RHOAI-v2.19.yaml
+++ b/setup.tmpl/RHOAI-v2.19.yaml
@@ -2,5 +2,6 @@
 
 RHOAI: true
 VERSION: RHOAI-v2.19
+VERSION_NUMBER: 2.19.0
 KUBECTL: oc
 FAIRSHARE: true

From d3b2691f3ddfc723454df4e419a2710aaa17e53a Mon Sep 17 00:00:00 2001
From: David Grove
Date: Wed, 14 May 2025 15:30:34 -0400
Subject: [PATCH 2/2] Updated instructions and configurations for RHOAI 2.19

---
 setup.RHOAI-v2.16/CLUSTER-SETUP.md            |   3 -
 setup.RHOAI-v2.19/CLUSTER-SETUP.md            |   3 -
 setup.RHOAI-v2.19/UPGRADE-FAST.md             |  26 ++-
 setup.RHOAI-v2.19/UPGRADE-STABLE.md           |  26 ++-
 setup.RHOAI-v2.19/mlbatch-subscription.yaml   | 166 +++---------------
 .../mlbatch-upgrade-configmaps.yaml           | 145 +++++++++++++++
 .../mlbatch-upgrade-fast-subscription.yaml    |  34 ++++
 .../mlbatch-upgrade-stable-subscription.yaml  |  34 ++++
 setup.k8s/CLUSTER-SETUP.md                    |   1 -
 setup.k8s/kueue/kustomization.yaml            |   7 -
 setup.tmpl/CLUSTER-SETUP.md.tmpl              |  20 +--
 11 files changed, 284 insertions(+), 181 deletions(-)
 create mode 100644 setup.RHOAI-v2.19/mlbatch-upgrade-configmaps.yaml
 create mode 100644 setup.RHOAI-v2.19/mlbatch-upgrade-fast-subscription.yaml
 create mode 100644 setup.RHOAI-v2.19/mlbatch-upgrade-stable-subscription.yaml

diff --git a/setup.RHOAI-v2.16/CLUSTER-SETUP.md b/setup.RHOAI-v2.16/CLUSTER-SETUP.md
index a4fcc0a..b6ab4a9 100644
--- a/setup.RHOAI-v2.16/CLUSTER-SETUP.md
+++ b/setup.RHOAI-v2.16/CLUSTER-SETUP.md
@@ -82,9 +82,6 @@ AI configuration as follows:
   - `schedulerName` is set to `scheduler-plugins-scheduler`,
   - `queueName` is set to `default-queue`,
   - `slackQueueName` is set to `slack-cluster-queue`
-- pod priorities, resource requests and limits have been adjusted.
-
-
 
 ## Autopilot
 
diff --git a/setup.RHOAI-v2.19/CLUSTER-SETUP.md b/setup.RHOAI-v2.19/CLUSTER-SETUP.md
index 6c20fcf..038ac3e 100644
--- a/setup.RHOAI-v2.19/CLUSTER-SETUP.md
+++ b/setup.RHOAI-v2.19/CLUSTER-SETUP.md
@@ -73,7 +73,6 @@ AI configuration as follows:
   - `manageJobsWithoutQueueName` is enabled,
   - `batch/job` integration is disabled,
   - `waitForPodsReady` is disabled,
-  - `VisibilityOnDemand` feature gate is disabled,
   - `fairSharing` is enabled,
   - `enableClusterQueueResources` metrics is enabled,
 - Codeflare operator:
@@ -83,8 +82,6 @@ AI configuration as follows:
   - `schedulerName` is set to `scheduler-plugins-scheduler`,
   - `queueName` is set to `default-queue`,
   - `slackQueueName` is set to `slack-cluster-queue`
-
-
 ## Autopilot
 Helm charts values and how-to for customization can be found [in the official documentation](https://github.com/IBM/autopilot/blob/main/helm-charts/autopilot/README.md).
 As-is, Autopilot will run on GPU nodes.
diff --git a/setup.RHOAI-v2.19/UPGRADE-FAST.md b/setup.RHOAI-v2.19/UPGRADE-FAST.md
index 06db6ab..5a52710 100644
--- a/setup.RHOAI-v2.19/UPGRADE-FAST.md
+++ b/setup.RHOAI-v2.19/UPGRADE-FAST.md
@@ -18,12 +18,28 @@ install-kpzzl   rhods-operator.2.18.0   Manual     false
 install-nqrbp   rhods-operator.2.19.0   Manual     true
 ```
 
-Assuming the install plan exists you can begin the upgrade process.
+Before approving the upgrade, you must manually remove the v1alpha1 MultiKueue CRDs
+from your cluster. These CRDs were replaced by v1beta1 versions in the Kueue 0.9 release,
+but the RHOAI operator will not remove the outdated CRDs automatically.
+First, ensure there are no remaining instances:
+```sh
+kubectl get multikueueclusters.kueue.x-k8s.io --all-namespaces
+kubectl get multikueueconfigs.kueue.x-k8s.io --all-namespaces
+```
+Delete any instances you find, then delete the CRDs:
+```sh
+kubectl delete crd multikueueclusters.kueue.x-k8s.io
+kubectl delete crd multikueueconfigs.kueue.x-k8s.io
+```
+
+Next, update the MLBatch modifications to the default RHOAI configuration maps and subscription:
+```sh
+oc apply -f setup.RHOAI-v2.19/mlbatch-upgrade-configmaps.yaml
+oc apply -f setup.RHOAI-v2.19/mlbatch-upgrade-fast-subscription.yaml
+```
 
-There are no MLBatch modifications to the default RHOAI configuration maps
-beyond those already made in previous installs. Therefore, you can simply
-approve the install plan replacing the example plan name below with the actual
-value on your cluster:
+Finally, you can approve the install plan, replacing the example plan name below
+with the actual value on your cluster:
 ```sh
 oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-kpzzl
 ```
diff --git a/setup.RHOAI-v2.19/UPGRADE-STABLE.md b/setup.RHOAI-v2.19/UPGRADE-STABLE.md
index 10a4cf5..a332ea4 100644
--- a/setup.RHOAI-v2.19/UPGRADE-STABLE.md
+++ b/setup.RHOAI-v2.19/UPGRADE-STABLE.md
@@ -21,10 +21,28 @@ install-nqrbp   rhods-operator.2.19.0   Manual     true
 
 Assuming the install plan exists you can begin the upgrade process.
 
-There are no MLBatch modifications to the default RHOAI configuration maps
-beyond those already made in previous installs. Therefore, you can simply
-approve the install plan replacing the example plan name below with the actual
-value on your cluster:
+Before approving the upgrade, you must manually remove the v1alpha1 MultiKueue CRDs
+from your cluster. These CRDs were replaced by v1beta1 versions in the Kueue 0.9 release,
+but the RHOAI operator will not remove the outdated CRDs automatically.
+First, ensure there are no remaining instances:
+```sh
+kubectl get multikueueclusters.kueue.x-k8s.io --all-namespaces
+kubectl get multikueueconfigs.kueue.x-k8s.io --all-namespaces
+```
+Delete any instances you find, then delete the CRDs:
+```sh
+kubectl delete crd multikueueclusters.kueue.x-k8s.io
+kubectl delete crd multikueueconfigs.kueue.x-k8s.io
+```
+
+Next, update the MLBatch modifications to the default RHOAI configuration maps and subscription:
+```sh
+oc apply -f setup.RHOAI-v2.19/mlbatch-upgrade-configmaps.yaml
+oc apply -f setup.RHOAI-v2.19/mlbatch-upgrade-stable-subscription.yaml
+```
+
+Finally, you can approve the install plan, replacing the example plan name below
+with the actual value on your cluster:
 ```sh
 oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-kpzzl
 ```
diff --git a/setup.RHOAI-v2.19/mlbatch-subscription.yaml b/setup.RHOAI-v2.19/mlbatch-subscription.yaml
index e667279..d47343d 100644
--- a/setup.RHOAI-v2.19/mlbatch-subscription.yaml
+++ b/setup.RHOAI-v2.19/mlbatch-subscription.yaml
@@ -16,84 +16,6 @@ metadata:
 ---
 apiVersion: v1
 kind: ConfigMap
-metadata:
-  name: mlbatch-codeflare
-  namespace: redhat-ods-operator
-data:
-  manager.yaml: |
-    apiVersion: apps/v1
-    kind: Deployment
-    metadata:
-      name: manager
-      namespace: system
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: codeflare-operator
-          app.kubernetes.io/part-of: codeflare
-      replicas: 1
-      template:
-        metadata:
-          annotations:
-            kubectl.kubernetes.io/default-container: manager
-          labels:
-            app.kubernetes.io/name: codeflare-operator
-            app.kubernetes.io/part-of: codeflare
-        spec:
-          priorityClassName: system-node-critical
-          securityContext:
-            runAsNonRoot: true
-            # TODO(user): For common cases that do not require escalating privileges
-            # it is recommended to ensure that all your Pods/Containers are restrictive.
-            # More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted
-            # Please uncomment the following code if your project does NOT have to work on old Kubernetes
-            # versions < 1.20 or on vendors versions which do NOT support this field by default (i.e. Openshift < 4.11 ).
-            # seccompProfile:
-            #   type: RuntimeDefault
-          containers:
-          - command:
-            - /manager
-            image: $(codeflare_operator_controller_image)
-            imagePullPolicy: Always
-            name: manager
-            securityContext:
-              allowPrivilegeEscalation: false
-              capabilities:
-                drop:
-                  - "ALL"
-            env:
-              - name: NAMESPACE
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.namespace
-            ports:
-              - containerPort: 8080
-                protocol: TCP
-                name: metrics
-            livenessProbe:
-              httpGet:
-                path: /healthz
-                port: 8081
-              initialDelaySeconds: 15
-              periodSeconds: 20
-            readinessProbe:
-              httpGet:
-                path: /readyz
-                port: 8081
-              initialDelaySeconds: 5
-              periodSeconds: 10
-            resources:
-              limits:
-                cpu: "1"
-                memory: 1Gi
-              requests:
-                cpu: "1"
-                memory: 1Gi
-          serviceAccountName: controller-manager
-          terminationGracePeriodSeconds: 10
----
-apiVersion: v1
-kind: ConfigMap
 metadata:
   name: codeflare-operator-config
   namespace: redhat-ods-applications
@@ -129,25 +51,6 @@ data:
 ---
 apiVersion: v1
 kind: ConfigMap
-metadata:
-  name: mlbatch-kuberay
-  namespace: redhat-ods-operator
-data:
-  kuberay-operator-image-patch.yaml: |
-    apiVersion: apps/v1
-    kind: Deployment
-    metadata:
-      name: kuberay-operator
-    spec:
-      template:
-        spec:
-          priorityClassName: system-node-critical
-          containers:
-          - name: kuberay-operator
-            image: $(image)
----
-apiVersion: v1
-kind: ConfigMap
 metadata:
   name: mlbatch-kueue
   namespace: redhat-ods-operator
@@ -158,7 +61,7 @@ data:
     health:
       healthProbeBindAddress: :8081
     metrics:
-      bindAddress: :8080
+      bindAddress: :8443
       enableClusterQueueResources: true
     webhook:
      port: 9443
@@ -171,6 +74,7 @@ data:
         Pod: 5
         Workload.kueue.x-k8s.io: 5
         LocalQueue.kueue.x-k8s.io: 1
+        Cohort.kueue.x-k8s.io: 1
         ClusterQueue.kueue.x-k8s.io: 1
         ResourceFlavor.kueue.x-k8s.io: 1
     clientConnection:
@@ -181,6 +85,9 @@ data:
       enable: false
       blockAdmission: false
     manageJobsWithoutQueueName: true
+    #managedJobsNamespaceSelector:
+    #  matchLabels:
+    #    kueue-managed: "true"
     #internalCertManagement:
     #  enable: false
     #  webhookServiceName: ""
@@ -198,6 +105,8 @@ data:
       - "kubeflow.org/tfjob"
      - "kubeflow.org/xgboostjob"
      # - "pod"
+      # - "deployment" # requires enabling pod integration
+      # - "statefulset" # requires enabling pod integration
      externalFrameworks:
      - "AppWrapper.v1beta2.workload.codeflare.dev"
      # podOptions:
@@ -209,31 +118,14 @@ data:
     fairSharing:
       enable: true
       preemptionStrategies: [LessThanOrEqualToFinalShare, LessThanInitialShare]
-  manager_config_patch.yaml: |
-    apiVersion: apps/v1
-    kind: Deployment
-    metadata:
-      name: controller-manager
-      namespace: system
-    spec:
-      template:
-        spec:
-          priorityClassName: system-node-critical
-          containers:
-          - name: manager
-            image: $(image)
-            args:
-            - "--config=/controller_manager_config.yaml"
-            - "--zap-log-level=2"
-            - "--feature-gates=LendingLimit=true"
-            volumeMounts:
-            - name: manager-config
-              mountPath: /controller_manager_config.yaml
-              subPath: controller_manager_config.yaml
-          volumes:
-          - name: manager-config
-            configMap:
-              name: manager-config
+    #resources:
+    #  excludeResourcePrefixes: []
+    #  transformations:
+    #  - input: nvidia.com/mig-4g.5gb
+    #    strategy: Replace | Retain
+    #    outputs:
+    #      example.com/accelerator-memory: 5Gi
+    #      example.com/accelerator-gpc: 4
 ---
 apiVersion: v1
 kind: ConfigMap
@@ -249,20 +141,23 @@ data:
     spec:
       template:
         spec:
-          priorityClassName: system-node-critical
           containers:
           - name: training-operator
             image: $(image)
             args:
             - "--zap-log-level=2"
+            - --pytorch-init-container-image
+            - $(image)
+            - "--webhook-secret-name"
+            - "kubeflow-training-operator-webhook-cert"
+            - "--webhook-service-name"
+            - "kubeflow-training-operator"
             - "--gang-scheduler-name=scheduler-plugins-scheduler"
-            resources:
-              requests:
-                cpu: 100m
-                memory: 100Mi
-              limits:
-                cpu: 500m
-                memory: 1000Mi
+          volumes:
+          - name: cert
+            secret:
+              defaultMode: 420
+              secretName: kubeflow-training-operator-webhook-cert
 ---
 apiVersion: operators.coreos.com/v1alpha1
 kind: Subscription
@@ -283,15 +178,9 @@ spec:
       - name: mlbatch-codeflare
         mountPath: /opt/manifests/codeflare/manager/manager.yaml
         subPath: manager.yaml
-      - name: mlbatch-kuberay
-        mountPath: /opt/manifests/ray/openshift/kuberay-operator-image-patch.yaml
-        subPath: kuberay-operator-image-patch.yaml
      - name: mlbatch-kueue
        mountPath: /opt/manifests/kueue/components/manager/controller_manager_config.yaml
        subPath: controller_manager_config.yaml
-      - name: mlbatch-kueue
-        mountPath: /opt/manifests/kueue/rhoai/manager_config_patch.yaml
-        subPath: manager_config_patch.yaml
      - name: mlbatch-training-operator
        mountPath: /opt/manifests/trainingoperator/rhoai/manager_config_patch.yaml
        subPath: manager_config_patch.yaml
@@ -299,9 +188,6 @@ spec:
      - name: mlbatch-codeflare
        configMap:
          name: mlbatch-codeflare
-      - name: mlbatch-kuberay
-        configMap:
-          name: mlbatch-kuberay
      - name: mlbatch-kueue
        configMap:
          name: mlbatch-kueue
diff --git a/setup.RHOAI-v2.19/mlbatch-upgrade-configmaps.yaml b/setup.RHOAI-v2.19/mlbatch-upgrade-configmaps.yaml
new file mode 100644
index 0000000..d85799d
--- /dev/null
+++ b/setup.RHOAI-v2.19/mlbatch-upgrade-configmaps.yaml
@@ -0,0 +1,145 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: codeflare-operator-config
+  namespace: redhat-ods-applications
+data:
+  config.yaml: |
+    appwrapper:
+      enabled: true
+      Config:
+        autopilot:
+          injectAntiAffinities: true
+          monitorNodes: true
+          resourceTaints:
+            nvidia.com/gpu:
+            - key: autopilot.ibm.com/gpuhealth
+              value: ERR
+              effect: NoSchedule
+            - key: autopilot.ibm.com/gpuhealth
+              value: TESTING
+              effect: NoSchedule
+            - key: autopilot.ibm.com/gpuhealth
+              value: EVICT
+              effect: NoExecute
+        defaultQueueName: default-queue
+        enableKueueIntegrations: true
+        kueueJobReconciller:
+          manageJobsWithoutQueueName: true
+          waitForPodsReady:
+            blockAdmission: false
+            enable: false
+        schedulerName: scheduler-plugins-scheduler
+        slackQueueName: slack-cluster-queue
+        userRBACAdmissionCheck: false
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: mlbatch-kueue
+  namespace: redhat-ods-operator
+data:
+  controller_manager_config.yaml: |
+    apiVersion: config.kueue.x-k8s.io/v1beta1
+    kind: Configuration
+    health:
+      healthProbeBindAddress: :8081
+    metrics:
+      bindAddress: :8443
+      enableClusterQueueResources: true
+    webhook:
+      port: 9443
+    leaderElection:
+      leaderElect: true
+      resourceName: c1f6bfd2.kueue.x-k8s.io
+    controller:
+      groupKindConcurrency:
+        Job.batch: 5
+        Pod: 5
+        Workload.kueue.x-k8s.io: 5
+        LocalQueue.kueue.x-k8s.io: 1
+        Cohort.kueue.x-k8s.io: 1
+        ClusterQueue.kueue.x-k8s.io: 1
+        ResourceFlavor.kueue.x-k8s.io: 1
+    clientConnection:
+      qps: 50
+      burst: 100
+    #pprofBindAddress: :8082
+    waitForPodsReady:
+      enable: false
+      blockAdmission: false
+    manageJobsWithoutQueueName: true
+    #managedJobsNamespaceSelector:
+    #  matchLabels:
+    #    kueue-managed: "true"
+    #internalCertManagement:
+    #  enable: false
+    #  webhookServiceName: ""
+    #  webhookSecretName: ""
+    integrations:
+      frameworks:
+      # - "batch/job"
+      - "kubeflow.org/mpijob"
+      - "ray.io/rayjob"
+      - "ray.io/raycluster"
+      - "jobset.x-k8s.io/jobset"
+      - "kubeflow.org/mxjob"
+      - "kubeflow.org/paddlejob"
+      - "kubeflow.org/pytorchjob"
+      - "kubeflow.org/tfjob"
+      - "kubeflow.org/xgboostjob"
+      # - "pod"
+      # - "deployment" # requires enabling pod integration
+      # - "statefulset" # requires enabling pod integration
+      externalFrameworks:
+      - "AppWrapper.v1beta2.workload.codeflare.dev"
+      # podOptions:
+      #   namespaceSelector:
+      #     matchExpressions:
+      #       - key: kubernetes.io/metadata.name
+      #         operator: NotIn
+      #         values: [ kube-system, kueue-system ]
+    fairSharing:
+      enable: true
+      preemptionStrategies: [LessThanOrEqualToFinalShare, LessThanInitialShare]
+    #resources:
+    #  excludeResourcePrefixes: []
+    #  transformations:
+    #  - input: nvidia.com/mig-4g.5gb
+    #    strategy: Replace | Retain
+    #    outputs:
+    #      example.com/accelerator-memory: 5Gi
+    #      example.com/accelerator-gpc: 4
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: mlbatch-training-operator
+  namespace: redhat-ods-operator
+data:
+  manager_config_patch.yaml: |
+    apiVersion: apps/v1
+    kind: Deployment
+    metadata:
+      name: training-operator
+    spec:
+      template:
+        spec:
+          containers:
+          - name: training-operator
+            image: $(image)
+            args:
+            - "--zap-log-level=2"
+            - --pytorch-init-container-image
+            - $(image)
+            - "--webhook-secret-name"
+            - "kubeflow-training-operator-webhook-cert"
+            - "--webhook-service-name"
+            - "kubeflow-training-operator"
+            - "--gang-scheduler-name=scheduler-plugins-scheduler"
+          volumes:
+          - name: cert
+            secret:
+              defaultMode: 420
+              secretName: kubeflow-training-operator-webhook-cert
diff --git a/setup.RHOAI-v2.19/mlbatch-upgrade-fast-subscription.yaml b/setup.RHOAI-v2.19/mlbatch-upgrade-fast-subscription.yaml
new file mode 100644
index 0000000..6bf6ec5
--- /dev/null
+++ b/setup.RHOAI-v2.19/mlbatch-upgrade-fast-subscription.yaml
@@ -0,0 +1,34 @@
+apiVersion: operators.coreos.com/v1alpha1
+kind: Subscription
+metadata:
+  name: rhods-operator
+  namespace: redhat-ods-operator
+spec:
+  channel: fast
+  installPlanApproval: Manual
+  name: rhods-operator
+  source: redhat-operators
+  sourceNamespace: openshift-marketplace
+  config:
+    env:
+      - name: "DISABLE_DSC_CONFIG"
+    volumeMounts:
+      - name: mlbatch-codeflare
+        mountPath: /opt/manifests/codeflare/manager/manager.yaml
+        subPath: manager.yaml
+      - name: mlbatch-kueue
+        mountPath: /opt/manifests/kueue/components/manager/controller_manager_config.yaml
+        subPath: controller_manager_config.yaml
+      - name: mlbatch-training-operator
+        mountPath: /opt/manifests/trainingoperator/rhoai/manager_config_patch.yaml
+        subPath: manager_config_patch.yaml
+    volumes:
+      - name: mlbatch-codeflare
+        configMap:
+          name: mlbatch-codeflare
+      - name: mlbatch-kueue
+        configMap:
+          name: mlbatch-kueue
+      - name: mlbatch-training-operator
+        configMap:
+          name: mlbatch-training-operator
\ No newline at end of file
diff --git a/setup.RHOAI-v2.19/mlbatch-upgrade-stable-subscription.yaml b/setup.RHOAI-v2.19/mlbatch-upgrade-stable-subscription.yaml
new file mode 100644
index 0000000..31557aa
--- /dev/null
+++ b/setup.RHOAI-v2.19/mlbatch-upgrade-stable-subscription.yaml
@@ -0,0 +1,34 @@
+apiVersion: operators.coreos.com/v1alpha1
+kind: Subscription
+metadata:
+  name: rhods-operator
+  namespace: redhat-ods-operator
+spec:
+  channel: stable
+  installPlanApproval: Manual
+  name: rhods-operator
+  source: redhat-operators
+  sourceNamespace: openshift-marketplace
+  config:
+    env:
+      - name: "DISABLE_DSC_CONFIG"
+    volumeMounts:
+      - name: mlbatch-codeflare
+        mountPath: /opt/manifests/codeflare/manager/manager.yaml
+        subPath: manager.yaml
+      - name: mlbatch-kueue
+        mountPath: /opt/manifests/kueue/components/manager/controller_manager_config.yaml
+        subPath: controller_manager_config.yaml
+      - name: mlbatch-training-operator
+        mountPath: /opt/manifests/trainingoperator/rhoai/manager_config_patch.yaml
+        subPath: manager_config_patch.yaml
+    volumes:
+      - name: mlbatch-codeflare
+        configMap:
+          name: mlbatch-codeflare
+      - name: mlbatch-kueue
+        configMap:
+          name: mlbatch-kueue
+      - name: mlbatch-training-operator
+        configMap:
+          name: mlbatch-training-operator
\ No newline at end of file
diff --git a/setup.k8s/CLUSTER-SETUP.md b/setup.k8s/CLUSTER-SETUP.md
index 9ce72c1..865f024 100644
--- a/setup.k8s/CLUSTER-SETUP.md
+++ b/setup.k8s/CLUSTER-SETUP.md
@@ -98,7 +98,6 @@ operators as follows:
   - `manageJobsWithoutQueueName` is enabled and configured via `managedJobsNamespaceSelector`
     to be scoped to only namespaces that are labeled with `mlbatch-team-namespace=true`.
   - `waitForPodsReady` is disabled,
-  - `LendingLimit` feature gate is enabled,
   - `fairSharing` is enabled,
   - `enableClusterQueueResources` metrics is enabled,
 - AppWrapper operator:
diff --git a/setup.k8s/kueue/kustomization.yaml b/setup.k8s/kueue/kustomization.yaml
index 5b7004c..dca3860 100644
--- a/setup.k8s/kueue/kustomization.yaml
+++ b/setup.k8s/kueue/kustomization.yaml
@@ -44,10 +44,3 @@ patches:
       - get
       - list
       - watch
-- target:
-    kind: Deployment
-    name: controller-manager
-  patch: |
-    - op: add
-      path: /spec/template/spec/containers/0/args/-
-      value: "--feature-gates=LendingLimit=true"
diff --git a/setup.tmpl/CLUSTER-SETUP.md.tmpl b/setup.tmpl/CLUSTER-SETUP.md.tmpl
index 8af0cd4..a9cd2b8 100644
--- a/setup.tmpl/CLUSTER-SETUP.md.tmpl
+++ b/setup.tmpl/CLUSTER-SETUP.md.tmpl
@@ -105,7 +105,9 @@ AI configuration as follows:
   - `manageJobsWithoutQueueName` is enabled,
   - `batch/job` integration is disabled,
   - `waitForPodsReady` is disabled,
+{{- if (eq .VERSION "RHOAI-v2.16") }}
   - `LendingLimit` feature gate is enabled,
+{{- end }}
 {{- if .FAIRSHARE }}
   - `fairSharing` is enabled,
 {{- end }}
@@ -116,24 +118,7 @@ AI configuration as follows:
   - `schedulerName` is set to `scheduler-plugins-scheduler`,
   - `queueName` is set to `default-queue`,
   - `slackQueueName` is set to `slack-cluster-queue`
-- pod priorities, resource requests and limits have been adjusted.
-
-{{ if (eq .VERSION "RHOAI-v2.13") -}}
-To work around https://issues.redhat.com/browse/RHOAIENG-7887 (a race condition
-in Red Hat OpenShift AI installation), do a rolling restart of the Kueue manager.
-```sh
-{{ .KUBECTL }} rollout restart deployment/kueue-controller-manager -n redhat-ods-applications
-```
-After doing the restart, verify that you see the following lines in the
-kueue-controller-manager's log:
-```sh
-{"level":"info","ts":"2024-06-25T20:17:25.689638786Z","logger":"controller-runtime.builder","caller":"builder/webhook.go:189","msg":"Registering a validating webhook","GVK":"kubeflow.org/v1, Kind=PyTorchJob","path":"/validate-kubeflow-org-v1-pytorchjob"}
-{"level":"info","ts":"2024-06-25T20:17:25.689698615Z","logger":"controller-runtime.webhook","caller":"webhook/server.go:183","msg":"Registering webhook","path":"/validate-kubeflow-org-v1-pytorchjob"}
-{"level":"info","ts":"2024-06-25T20:17:25.689743757Z","logger":"setup","caller":"jobframework/setup.go:81","msg":"Set up controller and webhook for job framework","jobFrameworkName":"kubeflow.org/pytorchjob"}
-
-```
-{{- end }}
 
 {{- else -}}
 
 ## Install Operators
@@ -182,7 +167,6 @@ operators as follows:
   - `manageJobsWithoutQueueName` is enabled and configured via `managedJobsNamespaceSelector`
     to be scoped to only namespaces that are labeled with `mlbatch-team-namespace=true`.
   - `waitForPodsReady` is disabled,
-  - `LendingLimit` feature gate is enabled,
 {{- if .FAIRSHARE }}
   - `fairSharing` is enabled,
 {{- end }}
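
After approving the install plan on an upgraded cluster, a quick sanity check can confirm that the changes in this patch series landed. The following is a minimal sketch, not part of the patch itself; it assumes the `oc` CLI and the namespaces used above, and it assumes the upgraded Kueue recreates the MultiKueue CRDs at v1beta1:

```sh
# The operator CSV should now report 2.19.0.
oc get csv -n redhat-ods-operator | grep rhods-operator

# The recreated MultiKueue CRDs should serve v1beta1 only
# (the v1alpha1 versions were deleted by hand before the upgrade).
oc get crd multikueueclusters.kueue.x-k8s.io -o jsonpath='{.spec.versions[*].name}{"\n"}'

# The patched Kueue configuration should be in place; metrics now bind to :8443.
oc get configmap mlbatch-kueue -n redhat-ods-operator -o yaml | grep bindAddress
```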