Update RHOAI 2.19 configuration and update instructions #180

Merged · 2 commits · May 14, 2025
3 changes: 0 additions & 3 deletions setup.RHOAI-v2.16/CLUSTER-SETUP.md
@@ -82,9 +82,6 @@ AI configuration as follows:
- `schedulerName` is set to `scheduler-plugins-scheduler`,
- `queueName` is set to `default-queue`,
- `slackQueueName` is set to `slack-cluster-queue`
- pod priorities, resource requests and limits have been adjusted.



## Autopilot

4 changes: 0 additions & 4 deletions setup.RHOAI-v2.19/CLUSTER-SETUP.md
@@ -73,7 +73,6 @@ AI configuration as follows:
- `manageJobsWithoutQueueName` is enabled,
- `batch/job` integration is disabled,
- `waitForPodsReady` is disabled,
- `LendingLimit` feature gate is enabled,
- `fairSharing` is enabled,
- `enableClusterQueueResources` metrics is enabled,
- Codeflare operator:
@@ -82,9 +81,6 @@ AI configuration as follows:
- `schedulerName` is set to `scheduler-plugins-scheduler`,
- `queueName` is set to `default-queue`,
- `slackQueueName` is set to `slack-cluster-queue`
- pod priorities, resource requests and limits have been adjusted.



## Autopilot

26 changes: 21 additions & 5 deletions setup.RHOAI-v2.19/UPGRADE-FAST.md
@@ -18,12 +18,28 @@ install-kpzzl rhods-operator.2.18.0 Manual false
install-nqrbp rhods-operator.2.19.0 Manual true
```

Assuming the install plan exists, you can begin the upgrade process.
Before approving the upgrade, you must manually remove the v1alpha1 MultiKueue CRDs
from your cluster. These CRDs were replaced by v1beta1 versions in the Kueue 0.9 release,
but the RHOAI operator will not remove the outdated CRDs automatically.
First, ensure you have no instances:
```sh
kubectl get multikueueclusters.kueue.x-k8s.io --all-namespaces
kubectl get multikueueconfigs.kueue.x-k8s.io --all-namespaces
```
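If either command returns any resources, delete them before removing the CRDs. A minimal sketch, assuming `--all` is acceptable on your cluster (it removes every instance of the listed kinds; add `--all-namespaces` if the resources are listed under namespaces):
```sh
# Remove every remaining MultiKueue instance before dropping the CRDs
kubectl delete multikueueclusters.kueue.x-k8s.io --all
kubectl delete multikueueconfigs.kueue.x-k8s.io --all
```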
Once no instances remain, delete the CRDs:
```sh
kubectl delete crd multikueueclusters.kueue.x-k8s.io
kubectl delete crd multikueueconfigs.kueue.x-k8s.io
```

Next, update the MLBatch modifications to the default RHOAI configuration maps and the subscription:
```sh
oc apply -f setup.RHOAI-v2.19/mlbatch-upgrade-configmaps.yaml
oc apply -f setup.RHOAI-v2.19/mlbatch-upgrade-subscription.yaml
```

There are no MLBatch modifications to the default RHOAI configuration maps
beyond those already made in previous installs. Therefore, you can simply
approve the install plan replacing the example plan name below with the actual
value on your cluster:
Finally, approve the install plan, replacing the example plan name below
with the actual value on your cluster:
```sh
oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-kpzzl
```
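After approving the plan, the rollout can be tracked by watching the cluster service version; a minimal check, assuming the default `redhat-ods-operator` namespace, is:
```sh
# Watch until rhods-operator.2.19.0 reports a Succeeded phase
oc get csv -n redhat-ods-operator -w
```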
26 changes: 22 additions & 4 deletions setup.RHOAI-v2.19/UPGRADE-STABLE.md
@@ -21,10 +21,28 @@ install-nqrbp rhods-operator.2.19.0 Manual true

Assuming the install plan exists, you can begin the upgrade process.

There are no MLBatch modifications to the default RHOAI configuration maps
beyond those already made in previous installs. Therefore, you can simply
approve the install plan replacing the example plan name below with the actual
value on your cluster:
Before approving the upgrade, you must manually remove the v1alpha1 MultiKueue CRDs
from your cluster. These CRDs were replaced by v1beta1 versions in the Kueue 0.9 release,
but the RHOAI operator will not remove the outdated CRDs automatically.
First, ensure you have no instances:
```sh
kubectl get multikueueclusters.kueue.x-k8s.io --all-namespaces
kubectl get multikueueconfigs.kueue.x-k8s.io --all-namespaces
```
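If either command returns any resources, delete them first; for example (a sketch, assuming `--all` is acceptable on your cluster; add `--all-namespaces` if the resources are namespaced):
```sh
# Remove every remaining MultiKueue instance before deleting the CRDs
kubectl delete multikueueclusters.kueue.x-k8s.io --all
kubectl delete multikueueconfigs.kueue.x-k8s.io --all
```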
Once no instances remain, delete the CRDs:
```sh
kubectl delete crd multikueueclusters.kueue.x-k8s.io
kubectl delete crd multikueueconfigs.kueue.x-k8s.io
```

Next, update the MLBatch modifications to the default RHOAI configuration maps and the subscription:
```sh
oc apply -f setup.RHOAI-v2.19/mlbatch-upgrade-configmaps.yaml
oc apply -f setup.RHOAI-v2.19/mlbatch-upgrade-stable-subscription.yaml
```

Finally, approve the install plan, replacing the example plan name below
with the actual value on your cluster:
```sh
oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-kpzzl
```
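Once the plan is approved, the upgrade can be verified by watching the cluster service version (assuming the default `redhat-ods-operator` namespace):
```sh
# Wait for rhods-operator.2.19.0 to reach the Succeeded phase
oc get csv -n redhat-ods-operator -w
```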
166 changes: 26 additions & 140 deletions setup.RHOAI-v2.19/mlbatch-subscription.yaml
@@ -16,84 +16,6 @@ metadata:
---
apiVersion: v1
kind: ConfigMap
metadata:
name: mlbatch-codeflare
namespace: redhat-ods-operator
data:
manager.yaml: |
apiVersion: apps/v1
kind: Deployment
metadata:
name: manager
namespace: system
spec:
selector:
matchLabels:
app.kubernetes.io/name: codeflare-operator
app.kubernetes.io/part-of: codeflare
replicas: 1
template:
metadata:
annotations:
kubectl.kubernetes.io/default-container: manager
labels:
app.kubernetes.io/name: codeflare-operator
app.kubernetes.io/part-of: codeflare
spec:
priorityClassName: system-node-critical
securityContext:
runAsNonRoot: true
# TODO(user): For common cases that do not require escalating privileges
# it is recommended to ensure that all your Pods/Containers are restrictive.
# More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted
# Please uncomment the following code if your project does NOT have to work on old Kubernetes
# versions < 1.20 or on vendors versions which do NOT support this field by default (i.e. Openshift < 4.11 ).
# seccompProfile:
# type: RuntimeDefault
containers:
- command:
- /manager
image: $(codeflare_operator_controller_image)
imagePullPolicy: Always
name: manager
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- "ALL"
env:
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
ports:
- containerPort: 8080
protocol: TCP
name: metrics
livenessProbe:
httpGet:
path: /healthz
port: 8081
initialDelaySeconds: 15
periodSeconds: 20
readinessProbe:
httpGet:
path: /readyz
port: 8081
initialDelaySeconds: 5
periodSeconds: 10
resources:
limits:
cpu: "1"
memory: 1Gi
requests:
cpu: "1"
memory: 1Gi
serviceAccountName: controller-manager
terminationGracePeriodSeconds: 10
---
apiVersion: v1
kind: ConfigMap
metadata:
name: codeflare-operator-config
namespace: redhat-ods-applications
@@ -129,25 +51,6 @@ data:
---
apiVersion: v1
kind: ConfigMap
metadata:
name: mlbatch-kuberay
namespace: redhat-ods-operator
data:
kuberay-operator-image-patch.yaml: |
apiVersion: apps/v1
kind: Deployment
metadata:
name: kuberay-operator
spec:
template:
spec:
priorityClassName: system-node-critical
containers:
- name: kuberay-operator
image: $(image)
---
apiVersion: v1
kind: ConfigMap
metadata:
name: mlbatch-kueue
namespace: redhat-ods-operator
@@ -158,7 +61,7 @@ data:
health:
healthProbeBindAddress: :8081
metrics:
bindAddress: :8080
bindAddress: :8443
enableClusterQueueResources: true
webhook:
port: 9443
@@ -171,6 +74,7 @@ data:
Pod: 5
Workload.kueue.x-k8s.io: 5
LocalQueue.kueue.x-k8s.io: 1
Cohort.kueue.x-k8s.io: 1
ClusterQueue.kueue.x-k8s.io: 1
ResourceFlavor.kueue.x-k8s.io: 1
clientConnection:
@@ -181,6 +85,9 @@ data:
enable: false
blockAdmission: false
manageJobsWithoutQueueName: true
#managedJobsNamespaceSelector:
# matchLabels:
# kueue-managed: "true"
#internalCertManagement:
# enable: false
# webhookServiceName: ""
@@ -198,6 +105,8 @@ data:
- "kubeflow.org/tfjob"
- "kubeflow.org/xgboostjob"
# - "pod"
# - "deployment" # requires enabling pod integration
# - "statefulset" # requires enabling pod integration
externalFrameworks:
- "AppWrapper.v1beta2.workload.codeflare.dev"
# podOptions:
@@ -209,31 +118,14 @@ data:
fairSharing:
enable: true
preemptionStrategies: [LessThanOrEqualToFinalShare, LessThanInitialShare]
manager_config_patch.yaml: |
apiVersion: apps/v1
kind: Deployment
metadata:
name: controller-manager
namespace: system
spec:
template:
spec:
priorityClassName: system-node-critical
containers:
- name: manager
image: $(image)
args:
- "--config=/controller_manager_config.yaml"
- "--zap-log-level=2"
- "--feature-gates=LendingLimit=true"
volumeMounts:
- name: manager-config
mountPath: /controller_manager_config.yaml
subPath: controller_manager_config.yaml
volumes:
- name: manager-config
configMap:
name: manager-config
#resources:
# excludeResourcePrefixes: []
# transformations:
# - input: nvidia.com/mig-4g.5gb
# strategy: Replace | Retain
# outputs:
# example.com/accelerator-memory: 5Gi
# example.com/accelerator-gpc: 4
---
apiVersion: v1
kind: ConfigMap
@@ -249,20 +141,23 @@ data:
spec:
template:
spec:
priorityClassName: system-node-critical
containers:
- name: training-operator
image: $(image)
args:
- "--zap-log-level=2"
- --pytorch-init-container-image
- $(image)
- "--webhook-secret-name"
- "kubeflow-training-operator-webhook-cert"
- "--webhook-service-name"
- "kubeflow-training-operator"
- "--gang-scheduler-name=scheduler-plugins-scheduler"
resources:
requests:
cpu: 100m
memory: 100Mi
limits:
cpu: 500m
memory: 1000Mi
volumes:
- name: cert
secret:
defaultMode: 420
secretName: kubeflow-training-operator-webhook-cert
---
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
@@ -283,25 +178,16 @@ spec:
- name: mlbatch-codeflare
mountPath: /opt/manifests/codeflare/manager/manager.yaml
subPath: manager.yaml
- name: mlbatch-kuberay
mountPath: /opt/manifests/ray/openshift/kuberay-operator-image-patch.yaml
subPath: kuberay-operator-image-patch.yaml
- name: mlbatch-kueue
mountPath: /opt/manifests/kueue/components/manager/controller_manager_config.yaml
subPath: controller_manager_config.yaml
- name: mlbatch-kueue
mountPath: /opt/manifests/kueue/rhoai/manager_config_patch.yaml
subPath: manager_config_patch.yaml
- name: mlbatch-training-operator
mountPath: /opt/manifests/trainingoperator/rhoai/manager_config_patch.yaml
subPath: manager_config_patch.yaml
volumes:
- name: mlbatch-codeflare
configMap:
name: mlbatch-codeflare
- name: mlbatch-kuberay
configMap:
name: mlbatch-kuberay
- name: mlbatch-kueue
configMap:
name: mlbatch-kueue