diff --git a/.github/workflows/CI.yaml b/.github/workflows/CI.yaml deleted file mode 100644 index 517eac1..0000000 --- a/.github/workflows/CI.yaml +++ /dev/null @@ -1,38 +0,0 @@ -name: CI -on: - push: - branches: [main] - pull_request: - branches: [main] - -jobs: - CI: - runs-on: ubuntu-latest - steps: - - name: checkout code - uses: actions/checkout@v4 - with: - submodules: false - - - name: Set latest tag and branch name - run: | - echo "GIT_BRANCH=gha-ci" >> $GITHUB_ENV - echo "TAG=$GITHUB_RUN_ID" >> $GITHUB_ENV - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Install Helm - uses: azure/setup-helm@v4 - - - name: Install Helm unittest plugin - run: | - helm plugin install https://github.com/helm-unittest/helm-unittest.git - - - name: Run pre-commit checks - run: | - pip install pre-commit - pre-commit run --show-diff-on-failure --color=always --all-files - diff --git a/.github/workflows/gh-pages-static.yml b/.github/workflows/gh-pages-static.yml index 18381fb..b9c9160 100644 --- a/.github/workflows/gh-pages-static.yml +++ b/.github/workflows/gh-pages-static.yml @@ -1,8 +1,13 @@ -# Workflow to invoke from another workflow to deploy static content to GitHub Pages +# Simple workflow for deploying static content to GitHub Pages name: Deploy static content to Pages on: - workflow_call: + # Runs on pushes targeting the default branch + push: + branches: ["gh-pages"] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages permissions: @@ -17,6 +22,7 @@ concurrency: cancel-in-progress: false jobs: + # Single deploy job since we're just deploying deploy: environment: name: github-pages @@ -25,8 +31,6 @@ jobs: steps: - name: Checkout uses: actions/checkout@v4 - with: - ref: gh-pages - name: Setup Pages uses: actions/configure-pages@v5 - name: Upload artifact diff --git a/.github/workflows/release-chart.yaml b/.github/workflows/release-chart.yaml deleted file mode 100644 index a14c938..0000000 --- a/.github/workflows/release-chart.yaml +++ /dev/null @@ -1,46 +0,0 @@ -name: Release Charts - -on: - # This workflow is meant to be triggered manually from the Actions tab - workflow_dispatch: - -jobs: - release: - permissions: - contents: write - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Configure Git - run: | - git config user.name "$GITHUB_ACTOR" - git config user.email "$GITHUB_ACTOR@users.noreply.github.com" - - - name: Install Helm - uses: azure/setup-helm@v4 - - - name: Run chart-releaser for pytorchjob-generate - uses: helm/chart-releaser-action@v1.6.0 - with: - charts_dir: tools/pytorchjob-generator - packages_with_index: true - skip_existing: true - env: - CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" - - - name: Run chart-releaser for sakkara-deploy - uses: helm/chart-releaser-action@v1.6.0 - with: - charts_dir: tools/sakkara-deploy - packages_with_index: true - skip_existing: true - env: - CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" - - publish: - needs: release - uses: project-codeflare/mlbatch/.github/workflows/gh-pages-static.yml@main diff --git a/.gitignore b/.gitignore deleted file mode 100644 index c96f019..0000000 --- a/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -.vscode -node_modules/ \ No newline at end of file diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 523fb29..0000000 --- a/.gitmodules +++ /dev/null @@ -1,4 +0,0 @@ -[submodule 
"scheduler-plugins"] - path = scheduler-plugins - url = https://github.com/kubernetes-sigs/scheduler-plugins.git - branch = release-1.28 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 01a9314..4e6a92c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,22 +1 @@ -repos: -- repo: https://github.com/norwoodj/helm-docs - rev: "v1.13.1" - hooks: - - id: helm-docs-built - args: - - --chart-search-root=tools/pytorchjob-generator - - --sort-values-order=file -- repo: local - hooks: - - id: helm-unittests - name: run helm unittests - language: system - entry: helm unittest - pass_filenames: false - always_run: true - args: - - tools/pytorchjob-generator/chart -- repo: https://github.com/standard/standard - rev: "v17.1.2" - hooks: - - id: standard +repos: [] diff --git a/CODEFLARE.md b/CODEFLARE.md deleted file mode 100644 index cd0baba..0000000 --- a/CODEFLARE.md +++ /dev/null @@ -1,365 +0,0 @@ -# MLBatch for CodeFlare Users - -MLBatch is an evolution of the [CodeFlare](https://github.com/project-codeflare) -stack for managing AI/ML workloads on Kubernetes and its workload dispatcher -[MCAD](https://github.com/project-codeflare/multi-cluster-app-dispatcher). - -Like MCAD, MLBatch is designed to queue workloads and admit them for execution over time, -accounting for quotas, priorities, and precedence. MLBatch relies on -[AppWrappers](https://github.com/project-codeflare/appwrapper) to bundle -together all the components of a workloads such as pods, PyTorch jobs, Ray jobs, -config maps, secrets, etc. AppWrappers in MLBatch offer improved mechanisms to -automatically detect and retry failed workloads. MLBatch includes a -backward-compatible [pytorch-generator](tools/pytorchjob-generator/) Helm -template to facilitate the specification of PyTorch jobs. - -In this document, we review the key innovations introduced by MLBatch and -differences with the earlier setup built around MCAD. - -## Kueue - -MLBatch replaces MCAD with [Kueue](https://kueue.sigs.k8s.io) to queue and -admit jobs. Kueue introduces a new quota management system based on [cluster -queues](https://kueue.sigs.k8s.io/docs/concepts/cluster_queue/). This quota -system provides more flexibility to allocate compute resources (CPU, memory, and -GPU quotas) than [resource -quotas](https://kubernetes.io/docs/concepts/policy/resource-quotas/) in core -Kubernetes. This system allows the borrowing of unused quota between -cluster queues (see [Priorities and Preemption below](#priorities-and-preemption)). -Borrowing enables high overall cluster resource utilization while -still ensuring that every team always has the ability to run jobs up to their -allocated quotas. Kueue also enables teams to use -priorities to order jobs within their own cluster queue without those -priorities impacting the scheduling of jobs by other cluster queues. - -Unlike MCAD, Kueue only considers quotas when admitting workloads. As a result, -MLBatch must ensure that all resource-consuming workloads in user namespaces are managed -by Kueue. This is accomplished by strictly [limiting the Kinds](#allowed-kinds) -of non-AppWrapper resources users are permitted to create. - -For various reasons, workloads are not directly submitted to cluster queues but -rather to namespaced [local -queues](https://kueue.sigs.k8s.io/docs/concepts/local_queue/) that feed into the -cluster queues. By convention in MLBatch, each team is assigned a namespace and -a cluster queue dedicated to the team. 
For example, the _platform_ team is -assigned to namespace `platform` and its associated cluster queue named -`platform-cluster-queue`. The local queue name in each namespace in MLBatch is always `default-queue`. -Hence, the `default-queue` in namespace `platform` feeds into the -`platform-cluster-queue`. In short, all workloads must be submitted to the local -queue named `default-queue` but to review quota allocation and usage, one has to -query the cluster queues. - -MLBatch offers a simple [cluster-checker](tools/cluster-checker/) tool to get a -bird’s-eye view of quotas on a cluster from a GPU perspective: -```sh -node checker.js -``` -``` -CLUSTER QUEUE GPU QUOTA GPU USAGE ADMITTED WORKLOADS PENDING WORKLOADS -code-cluster-queue 8 16 1 0 -platform-cluster-queue 8 4 4 0 - -Total GPU count in cluster: 24 -Unschedulable GPU count: - 0 -Schedulable GPU count: = 24 - -Nominal GPU quota: 16 -Slack GPU quota: + 8 -Total GPU quota: = 24 - -GPU usage by admitted workloads: 20 -Borrowed GPU count: 8 -``` -The tool lists the cluster queues defined on the cluster showing the GPU -quota for each one as well as the number of GPUs in use by admitted workloads. -The GPU usage may exceed the GPU quota for the cluster queue if this cluster queue -is borrowing idle capacity. - -The tool also reports the total GPU capacity distinguishing healthy (i.e., -schedulable, available for use) and unhealthy (i.e., unschedulable, unavailable) -GPUs. The nominal GPU quota represents the cumulative GPU quota across all the -teams. MLBatch recommends that cluster admins keep the nominal quota below the -cluster capacity to avoid oversubscribing the GPUs. Typically, a small number of -GPUs is not allocated to any team but retained as a slack quota that any team -may borrow from. MLBatch automatically adjusts the slack quota to ensure the -schedulable GPU count and nominal quota remain equal, unless of course this -slack becomes virtually negative, in which case a cluster admin should decide -how to reduce the nominal quota. - -For more details about the cluster queues run: -```sh -kubectl describe clusterqueues -``` - -## AppWrappers - -MLBatch recommends submitting every workload as an -[AppWrapper](https://github.com/project-codeflare/appwrapper). AppWrappers offer -a number of checks, guarantees, and benefits over submitting unwrapped -[PyTorchJobs](https://www.kubeflow.org/docs/components/training/user-guides/pytorch/) -for example. In particular, the AppWrapper controller automatically injects: -- labels holding the name and id of the user submitting the AppWrapper, -- the `queueName` label required to queue the workload in the `default-queue`, - and -- the `schedulerName` specification required to enable gang scheduling and - packing on the GPU dimension to mitigate node fragmentation. - -Moreover, the AppWrapper controller consistently handles cleanup and retries -across all types of workloads: -- The resources, especially the GPUs, utilized by a failed workload are returned - to the cluster in a timely manner, i.e., within minutes by default, with a - configurable grace period to permit post-mortem debugging. Cluster admins can - enforce an upper bound on this grace period to bound resource wastage. -- The Kubernetes objects associated with a completed workload, in particular the - pods and their logs, are eventually disposed of, by default after a week. -- Failed workloads are automatically retried up to a configurable number of - attempts. 
- -The AppWrapper specification has been greatly simplified for MLBatch. In most -cases, an AppWrapper yaml adds a simple prefix to a workload yaml, for instance -for a pod: -```yaml -# appwrapper prefix -apiVersion: workload.codeflare.dev/v1beta2 -kind: AppWrapper -metadata: - name: wrapped-pod -spec: - components: - - template: - # indented pod specification - apiVersion: v1 - kind: Pod - metadata: - name: sample-pod - spec: - restartPolicy: Never - containers: - - name: busybox - image: quay.io/project-codeflare/busybox:1.36 - command: ["sh", "-c", "sleep 5"] - resources: - requests: - cpu: 1 -``` -To submit this workload to the cluster, save this yaml to `wrapped-pod.yaml` and -run: -```sh -kubectl apply -f wrapped-pod.yaml -``` - -MLBatch includes an [appwrapper-packager](tools/appwrapper-packager/) tool to -automate the addition this prefix as well as the indentation of the workload -specification. In addition, MLBatch includes a new implementation of the -[pytorch-generator](tools/pytorchjob-generator/) tool to facilitate the -configuration of PyTorch jobs including the addition of the AppWrapper prefix. - -As a result of the AppWrapper simplification for MLBatch, AppWrappers which are -now in version `v1beta2` are not backward compatible with MCAD's `v1beta1` -AppWrappers. The companion pytorch-generator tool for MCAD is not compatible -with MLBatch. However, the pytorch-generator tool included in MLBatch is -backward compatible with the input format of the legacy tool. In other words, -simply rerun `helm template` on the input `value.yaml` files to generate proper -`v1beta2` AppWrappers. Please note that existing fault-tolerance-related -settings from these input files will be ignored and default will be used -instead. Please refer to the tool [documentation](tools/pytorchjob-generator/) -for how to override settings such as max retry counts. - -The list of all AppWrappers in a namespace is obtained by running: -```sh -kubectl get appwrappers -``` -``` -NAME STATUS QUOTA RESERVED RESOURCES DEPLOYED UNHEALTHY -wrapped-pod Succeeded False True False -``` -The status of an AppWrapper is one of: -- Suspended: the AppWrapper is queued, -- Resuming: the AppWrapper is transitioning to Running, -- Running: the AppWrapper is running, -- Succeeded: the execution completed successfully, -- Failed: the execution failed and will not be retried, -- Resetting: a failure has been detected during the current execution and the - AppWrapper is preparing to retry, -- Suspending: the AppWrapper has been evicted by Kueue and is transitioning back - to Suspended. - -```mermaid ---- -title: AppWrapper Lifecycle ---- -stateDiagram-v2 - f : Failed - sp : Suspended - ad : Admitted - s : Succeeded - su: Suspending - - state ad { - [*] --> rs - rs --> rn - rn --> rt - rt --> rs - - rs : Resuming - rn : Running - rt : Resetting - } - - [*] --> sp - sp --> ad - rn --> s - ad --> su - su --> sp - ad --> f - - classDef admitted fill:lightblue - class rs admitted - class rn admitted - class rt admitted - - classDef failed fill:pink - class f failed - - classDef succeeded fill:lightgreen - class s succeeded -``` -In this diagram, the outer loop consisting of the `Suspended`, `Admitted`, and -`Suspending` states is managed by Kueue, while the inner loop consisting of the -`Resuming`, `Running`, and `Resetting` states is managed by the AppWrapper -controller. 
In particular, the AppWrapper controller handles workload retries -without releasing and reacquiring Kueue quotas, hence without moving retried -workloads to the back of the cluster queue. - -In addition, this AppWrapper table also reports: -- quota reserved: whether Kueue has reserved the quota requested by the - AppWrapper, -- resource deployed: whether the resources wrapped by the AppWrapper, such as -the `sample-pod` in this example have been created on the cluster, -- unhealthy: whether a failure has been detected during the current execution of - the AppWrapper. - -For example, a `Running` AppWrapper has both quota reserved and resource -deployed. A `Succeeded` AppWrapper will no longer reserve quota but the wrapped -resources such as terminated pods will be preserved on the cluster for a period -of time as discussed above to permit log collection. A `Failed` AppWrapper will -transiently continue to reserve quota until the wrapped resources have been -undeployed, so as to avoid oversubscribing GPUs during the cleanup of failed -jobs. - -More details about an AppWrapper condition may be obtained by describing the -AppWrapper: -```sh -kubectl describe appwrapper wrapped-pod -``` -Kueue creates and maintains a companion `Workload` object for each workload it -manages. Further details about the AppWrapper condition such as Kueue's -rationale for evicting the workload may be obtained by accessing this companion -object: -```sh -kubectl get workloads -``` -``` -NAME QUEUE RESERVED IN ADMITTED AGE -appwrapper-wrapped-pod-81d3e default-queue team1-cluster-queue True 161m -``` -```sh -kubectl describe workload appwrapper-wrapped-pod-81d3e -``` -Workload objects are automatically deleted by Kueue when the workload itself, -i.e., the AppWrapper is deleted. - -## Priorities and Preemption - -MLBatch supports the `high-priority`, `default-priority`, and `low-priority` -priority classes. - -If you are using the pytorch-generator tool, you can override the default -`default-priority` of a workload by setting the `priority` variable. If you -are generating your yaml by other means, simply add a `priorityClassName` -to the specification of the wrapped pod templates, for example: -```yaml -# appwrapper prefix -apiVersion: workload.codeflare.dev/v1beta2 -kind: AppWrapper -metadata: - name: wrapped-pod -spec: - components: - - template: - # indented pod specification - apiVersion: v1 - kind: Pod - metadata: - name: sample-pod - spec: - priorityClassName: high-priority # workload priority - restartPolicy: Never - containers: - - name: busybox - image: quay.io/project-codeflare/busybox:1.36 - command: ["sh", "-c", "sleep 5"] - resources: - requests: - cpu: 1 -``` - -Workloads of equal priority are considered for admission by their cluster queue in submission order. -Higher-priority workloads are considered for admission before lower-priority -workloads irrespective of their submission time. However, workloads that cannot be -admitted will not block the admission of newer and/or lower-priority workloads -(if they fit within the nominal quota of the cluster queue). - -To reduce workload churn, Kueue forbids workloads to -simultaneously utilize both preemption and borrowing to acquire the -necessary quota to be admitted. Therefore a workload that by itself -exceeds the nominal quota of its cluster queue will never trigger -preemption. 
Similarly, if the combined resources of (a) a pending -workload and (b) the sum of all already admitted workloads with equal -or higher priority to the pending workload exceeds the nominal quota -of their cluster queue, Kueue will not preempt already admitted lower -priority workloads of that cluster queue to admit the pending -workload. - -When a workload is pending on a cluster queue and admitting that -workload would still leave the cluster queue at or below its nominal -quota, Kueue may preempt one or more currently admitted workloads of -other cluster queues to reclaim the necessary borrowed quota. When such -preemption is necessary, the decision of which workload(s) to preempt -is based solely on considering the currently admitted workloads of -just those cluster queues that are exceeding their nominal -quota. Workloads admitted by cluster queues that are currently at or -below their nominal quota will not be preempted. - -## Allowed Kinds - -MLBatch allows users to directly create the following Kinds of compute -resources: - + AppWrapper - + PyTorchJob (allowed, but recommend to put inside an AppWrapper) - + RayJob (allowed, but recommend to put inside an AppWrapper) - + RayCluster (allowed, but recommend to put inside an AppWrapper) - -MLBatch also allows users to directly create the following Kinds of -non-compute resources: - + Service - + Secret - + ConfigMap - + PersistentVolumeClaim - + PodGroup (allowed, but recommend to put inside an AppWrapper) - -MLBatch allows users to wrap an arbitrary number of one or more of the -following Kinds inside of an AppWrapper: - + PyTorchJob - + RayJob - + RayCluster - + Deployment - + StatefulSet - + Pod - + Job - + ServiceAccount - + Service - + Secret - + ConfigMap - + PersistentVolumeClaim - + PodGroup diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 261eeb9..0000000 --- a/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. 
- - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/QUOTA_MAINTENANCE.md b/QUOTA_MAINTENANCE.md deleted file mode 100644 index 5b475d9..0000000 --- a/QUOTA_MAINTENANCE.md +++ /dev/null @@ -1,62 +0,0 @@ -# Quota Maintenance - -A *team* in MLBatch is a group of users that share a resource quota. - -In Kueue, the `ClusterQueue` is the abstraction used to define a pool -of resources (`cpu`, `memory`, `nvidia.com/gpu`, etc.) that is -available to a team. A `LocalQueue` is the abstraction used by -members of the team to submit workloads to a `ClusterQueue` for -execution using those resources. - -Kubernetes built-in `ResourceQuotas` should not be used for resources that -are being managed by `ClusterQueues`. The two quota systems are incompatible. - -We strongly recommend maintaining a simple relationship between -between teams, namespaces, `ClusterQueues` and `LocalQueues`. 
Each -team should assigned to their own namespace that contains a single -`LocalQueue` which is configured to be the only `LocalQueue` that -targets the team's `ClusterQueue`. - -The quotas assigned to a `ClusterQueue` can be dynamically adjusted by -a cluster admin at any time. Adjustments to quotas only impact queued -workloads; workloads already admitted for execution are not impacted -by quota adjustments. - -For Kueue quotas to be effective, the sum of all quotas for each managed -resource (`cpu`, `memory`, `nvidia.com/gpu`, `pods`) must be maintained to -remain less than or equal to the available cluster capacity for this resource. -Concretely, for cluster with 256 NVIDIA GPUs dedicated to MLBatch users, the -cumulative `nomimalQuota` for the `nvidia.com/gpu` resource should be 256 or -less. Quotas should be reduced when the available capacity is reduced whether -because of failures or due to the allocation of resources to non-batch -workloads. - -To facilitate the necessary quota adjustments, we recommend setting up -a dedicated `ClusterQueue` for slack capacity that other `ClusterQueues` -can borrow from. This queue should not be associated with any team, -project, namespace, or local queue. Its `lendingLimit` should be adjusted -dynamically to reflect changes in cluster capacity. If sized -appropriately, this queue will make adjustments to other cluster -queues unnecessary for small cluster capacity changes. The figure -below shows this recommended setup for an MLBatch cluster with three -teams. Beginning with RHOAI 2.12 (AppWrapper v0.23), the dynamic -adjustment of the Slack `ClusterQueue` `lendingLimit` can be -configured to be fully automated. -![Figure with ClusterQueues for three teams and slack](./figures/CohortWithSlackCQ.png) - -Every resource name occurring in the resource requests or limits of a workload -must be covered by a `ClusterQueue` intended to admit the workload, even if the -requested resource count is zero. For example. a `ClusterQueue` must cover -`nvidia.com/roce_gdr`, possibly with an empty quota, to admit a `PyTorchJob` -requesting: -```yaml - resources: - requests: - cpu: 1 - memory: 256Mi - nvidia.com/roce_gdr: 0 - limits: - cpu: 1 - memory: 256Mi - nvidia.com/roce_gdr: 0 -``` diff --git a/README.md b/README.md index bad8ac0..379026f 100644 --- a/README.md +++ b/README.md @@ -1,75 +1,3 @@ -# MLBatch +This project's GitHub pages are only used as a Helm repository. -This repository describes the [setup](SETUP.md) and [use](USAGE.md) of the -MLBatch queuing and quota management system on OpenShift and Kubernetes clusters. MLBatch -leverages [Kueue](https://kueue.sigs.k8s.io), the [Kubeflow Training -Operator](https://www.kubeflow.org/docs/components/training/), -[KubeRay](https://docs.ray.io/en/latest/cluster/kubernetes/index.html), and the -[Codeflare Operator](https://github.com/project-codeflare/codeflare-operator) -from [Red Hat OpenShift -AI](https://www.redhat.com/en/technologies/cloud-computing/openshift/openshift-ai). -MLBatch enables [AppWrappers](https://project-codeflare.github.io/appwrapper/) -and adds -[Coscheduler](https://github.com/kubernetes-sigs/scheduler-plugins/blob/master/pkg/coscheduling/README.md). -MLBatch includes a number of configuration steps to help these components work -in harmony and support large workloads on large clusters. - -MLBatch handles the queuing and dispatching of batch workloads on OpenShift and Kubernetes -clusters. It enforces team quotas at the namespace level. 
It automates the -borrowing and reclamation of unused quotas across teams. Teams can use -priorities within their namespaces without impact on other teams. Using -AppWrappers to submit workloads activates a number of fault detection and -recovery capabilities, including automatically detecting failed pods and -automatically retrying failed workloads. Coscheduler supports gang scheduling -and minimizes fragmentation by preferentially packing jobs requiring less than a -full node's worth of GPUs together. - -## Cluster Setup - -To learn how to setup MLBatch on a cluster and onboard teams see -[SETUP.md](SETUP.md). - -*Quota maintenance* is a key aspect of smoothly administering an MLBatch cluster. -Cluster admins should carefully read [QUOTA_MAINTENANCE.md](QUOTA_MAINTENANCE.md). - -## Running Workloads - -To learn how to run workloads on an MLBatch cluster see [USAGE.md](USAGE.md) or -[CODEFLARE.md](CODEFLARE.md) if you are already familiar with the -[CodeFlare](https://github.com/project-codeflare) stack for managing AI/ML -workloads on Kubernetes. - -### PyTorchJobs via the MLBatch Helm Chart - -Properly configuring a distributed `PyTorchJob` to make effective use of the -MLBatch system and hardware accelerators (GPUs, RoCE GDR) can be tedious. To -automate this process, we provide a Helm chart that captures best practices and -common configuration options. Using this Helm chart helps eliminate common -mistakes. Please see [pytorchjob-generator](tools/pytorchjob-generator) for -detailed usage instructions. - -## Development Setup - -If you will be contributing to the development of the MLBatch project, you must -setup precommit hooks for your local clone of the repository. Do the following -once, immediately after cloning this repo: -```shell -helm plugin install https://github.com/helm-unittest/helm-unittest.git -pre-commit install -``` - -## License - -Copyright 2024 IBM Corporation. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. +Please see the [GitHub project](https://github.com/project-codeflare/mlbatch) for all user-facing content. diff --git a/SETUP.md b/SETUP.md deleted file mode 100644 index 1b756e9..0000000 --- a/SETUP.md +++ /dev/null @@ -1,60 +0,0 @@ -# MLBatch Setup - -The MLBatch setup consists of a *cluster setup* to be done once -and a *team setup* to be repeated for each team that will -be using the cluster. - -Batch users should only be permitted to create AppWrappers or other -workload Kinds that are natively supported by Kueue. The cluster setup -defines a `mlbatch-edit` role which enforces these restrictions and -will be used in the setup process for each team of MLBatch users that -is onboarded. - -This setup has been developed on Red Hat OpenShift 4.14, Red Hat OpenShift 4.16, -and Kubernetes 1.29 and is intended to support Red Hat OpenShift 4.14 and up -and/or Kubernetes 1.29 and up. 
- -To start with, recursively clone and enter this repository: -```sh -git clone --recursive https://github.com/project-codeflare/mlbatch.git -cd mlbatch -``` - -Detailed instructions and configuration files can be found in subfolders, -one for each base platform. - -## Red Hat OpenShift AI - -We recommend using the most recent ***stable*** release of -Red Hat OpenShift AI as the base platform for MLBatch. Please see -[Red Hat OpenShift AI Self-Managed Life Cycle](https://access.redhat.com/support/policy/updates/rhoai-sm/lifecycle) -for the life cycle dates of currently supported ***stable*** and ***fast*** releases. - -Instructions are provided for the following Red Hat OpenShift AI ***stable*** releases: -+ Red Hat OpenShift AI 2.19 - + [RHOAI 2.19 Cluster Setup](./setup.RHOAI-v2.19/CLUSTER-SETUP.md) - + [RHOAI 2.19 Team Setup](./setup.RHOAI-v2.19/TEAM-SETUP.md) - + [UPGRADING from RHOAI 2.16](./setup.RHOAI-v2.19/UPGRADE-STABLE.md) - + [UPGRADING from RHOAI 2.18](./setup.RHOAI-v2.19/UPGRADE-FAST.md) - + [RHOAI 2.19 Uninstall](./setup.RHOAI-v2.19/UNINSTALL.md) -+ Red Hat OpenShift AI 2.16 - + [RHOAI 2.16 Cluster Setup](./setup.RHOAI-v2.16/CLUSTER-SETUP.md) - + [RHOAI 2.16 Team Setup](./setup.RHOAI-v2.16/TEAM-SETUP.md) - + [UPGRADING from RHOAI 2.13](./setup.RHOAI-v2.16/UPGRADE-STABLE.md) - + [UPGRADING from RHOAI 2.15](./setup.RHOAI-v2.16/UPGRADE-FAST.md) - + [RHOAI 2.16 Uninstall](./setup.RHOAI-v2.16/UNINSTALL.md) - -Instructions are provided for the following Red Hat OpenShift AI ***fast*** releases: -+ Red Hat OpenShift AI 2.19 - + [RHOAI 2.19 Cluster Setup](./setup.RHOAI-v2.19/CLUSTER-SETUP.md) - + [RHOAI 2.19 Team Setup](./setup.RHOAI-v2.19/TEAM-SETUP.md) - + [UPGRADING from RHOAI 2.18](./setup.RHOAI-v2.19/UPGRADE.md) - + [RHOAI 2.19 Uninstall](./setup.RHOAI-v2.19/UNINSTALL.md) - -## Kubernetes - -MLBatch can be installed on any Kubernetes cluster version 1.29 or later -by following these instructions: - + [Kubernetes Cluster Setup](./setup.k8s/CLUSTER-SETUP.md) - + [Kubternets Team Setup](./setup.k8s/TEAM-SETUP.md) - + [Kubernetes Uninstall](./setup.k8s/UNINSTALL.md) diff --git a/USAGE.md b/USAGE.md deleted file mode 100644 index 604697e..0000000 --- a/USAGE.md +++ /dev/null @@ -1,393 +0,0 @@ -# MLBatch Quick Start - -MLBatch supports `PyTorchJobs`, `RayJobs`, `RayClusters`, as well as -`AppWrappers`, which can wrap and bundle together resources such as `Pods`, -`Jobs`, `Deployments`, `StatefulSets`, `ConfigMaps`, or `Secrets`. - -This document first explains [queues](#queues) then discusses a few [examples -workloads](#example-workloads), [monitoring](#monitoring-workloads-and-queues), -[borrowing](#borrowing-and-reclamation), -[priorities](#priorities-and-preemption), and -[fault-tolerance](#fault-tolerance). - -It is not required to clone this repository to use an MLBatch system. However, -if you want local copies of the examples to enable you to easily try then, you -can recursively clone and enter this repository: -```sh -git clone --recursive https://github.com/project-codeflare/mlbatch.git -cd mlbatch -``` - -## PyTorchJobs via the MLBatch Helm Chart - -Properly configuring a distributed `PyTorchJob` to make effective use of the -MLBatch system and hardware accelerators (GPUs, RoCE GDR) can be tedious. To -automate this process, we provide a Helm chart that captures best practices and -common configuration options. Using this Helm chart helps eliminate common -mistakes. Please see [pytorchjob-generator](tools/pytorchjob-generator) for -detailed usage instructions. 
- -## Generating AppWrappers from Kubernetes YAML files - -If you have a Kubernetes YAML file containing one or more -non-AppWrapper resources (eg Deployments, Pods, Services, etc), -you can use the [appwrapper-packager](tools/appwrapper-packager) tool -to generate an AppWrapper yaml containing those resources. - -## Queues - -All workloads must target a local queue in their namespace. The local queue name -is specified as a label as follows: -```yaml -apiVersion: ??? -kind: ??? -metadata: - name: ??? - labels: - kueue.x-k8s.io/queue-name: default-queue # queue name -``` -In MLBatch, the default local queue name is `default-queue`. - -Workloads submitted as `AppWrappers` do not need to explicity specify the local -queue name as it will be automatically added if missing. However, other workload -types (`PyTorchJobs`, `RayJobs`, `RayClusters`) must specify the local queue -name as demonstrated above. - -Workloads missing a local queue name will not be admitted. If you forget to -label the workload, you must either delete and resubmit it or use `oc edit` to -add the missing label to the metadata section of your workload object. - -Submitted workloads are queued and dispatched when enough quota is available, -which eventually results in the creation of pods that are submitted to the -cluster's scheduler. By default, this scheduler will scheduler pods one at a -time and spread pods across nodes to even the load across the cluster. Both -behaviors are undesirable for large AI workloads such as pre-training jobs. -MLBatch includes and configures Coscheduler to enable gang scheduling and -packing. Concretely, Coscheduler as configured will strive to schedule all pods -in a job at once using a minimal number of nodes. - -## Example Workloads - -`PytorchJobs`, `RayJobs`, and `RayClusters` may be submitted directly to -MLBatch. Please note however that these workloads will not benefit from the -advanced logic provided by `AppWrappers` for instance pertaining to -[fault-tolerance](#fault-tolerance). Hence, wrapping objects into `AppWrappers` -is the recommended way of submitting workloads. - -### PyTorchJobs - -To submit an unwrapped `PyTorchJob` to MLBatch, simply include the queue name: -```yaml -apiVersion: kubeflow.org/v1 -kind: PyTorchJob -metadata: - name: sample-pytorchjob - labels: - kueue.x-k8s.io/queue-name: default-queue # queue name (required) -spec: - pytorchReplicaSpecs: - Master: - replicas: 1 - restartPolicy: OnFailure - template: - spec: - containers: - - name: pytorch - image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v1beta1-fc858d1 - command: - - "python3" - - "/opt/pytorch-mnist/mnist.py" - - "--epochs=1" - resources: - requests: - cpu: 1 - Worker: - replicas: 1 - restartPolicy: OnFailure - template: - spec: - containers: - - name: pytorch - image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v1beta1-fc858d1 - command: - - "python3" - - "/opt/pytorch-mnist/mnist.py" - - "--epochs=1" - resources: - requests: - cpu: 1 -``` -Try the above with: -```sh -oc apply -n team1 -f samples/pytorchjob.yaml -``` -MLBatch implicitly enables gang scheduling and packing for `PyTorchJobs` by -configuring the Kubeflow Training Operator to automatically inject the -necessary scheduling directives into all Pods it creates for `PyTorchJobs`. 
- -### AppWrappers - -A `Job`, a `Pod`, or a `Deployment` can be created using an `AppWrapper`, for -example: -```yaml -apiVersion: workload.codeflare.dev/v1beta2 -kind: AppWrapper -metadata: - name: sample-job -spec: - components: - - template: - # job specification - apiVersion: batch/v1 - kind: Job - metadata: - name: sample-job - spec: - template: - spec: - restartPolicy: Never - containers: - - name: busybox - image: quay.io/project-codeflare/busybox:1.36 - command: ["sh", "-c", "sleep 30"] - resources: - requests: - cpu: 1 -``` -Try the above with: -```sh -oc apply -n team1 -f samples/job.yaml -``` -Concretely, the `AppWrapper` adds a simple prefix to the `Job` specification. -See [AppWrappers](https://project-codeflare.github.io/appwrapper/) for more -information and use cases. - -MLBatch implicitly enables packing for `AppWrappers`. For workloads consisting -of multiple pods, add a `PodGroup` to enable gang scheduling, for instance: -```yaml -apiVersion: workload.codeflare.dev/v1beta2 -kind: AppWrapper -metadata: - name: sample-job -spec: - components: - - template: - # pod group specification - apiVersion: scheduling.x-k8s.io/v1alpha1 - kind: PodGroup - metadata: - name: sample-job - spec: - minMember: 2 # replica count - - template: - # job specification - apiVersion: batch/v1 - kind: Job - metadata: - name: sample-job - spec: - parallelism: 2 # replica count - completions: 2 # replica count - template: - metadata: - labels: - scheduling.x-k8s.io/pod-group: sample-job # pod group label - spec: - restartPolicy: Never - containers: - - name: busybox - image: quay.io/project-codeflare/busybox:1.36 - command: ["sh", "-c", "sleep 5"] - resources: - requests: - cpu: 1 -``` -Try the above with: -```sh -oc apply -n team1 -f samples/job-with-podgroup.yaml -``` - -## Monitoring Workloads and Queues - -Check the status of the local queue for the namespace with: -```sh -oc -n team1 get localqueue -``` -``` -NAME CLUSTERQUEUE PENDING WORKLOADS ADMITTED WORKLOADS -localqueue.kueue.x-k8s.io/default-queue team1-cluster-queue 0 1 -``` -Check the status of the workloads in the namespace with: -```sh -oc -n team1 get workloads -``` -``` -NAME QUEUE ADMITTED BY AGE -pytorchjob-sample-pytorchjob-9fc41 default-queue team1-cluster-queue 11m -``` -As usual, replace `get` with `describe` for more details on the local queue or -workloads. See [Kueue](https://kueue.sigs.k8s.io) for more information. - -## Borrowing and Reclamation - -A workload can borrow unused quotas from other namespaces if not enough quota is -available in the team namespace unless disallowed by the `ClusterQueue` of the -team namespace (`borrowingLimit`) or target namespace(s) (`lendingLimit`). - -Borrowed quotas are immediately returned to the target namespace(s) upon -request. In other words, the submission of a workload in a target namespace will -preempt borrowers if necessary to obtain the quota requested by the new -workload. 
- -## Priorities and Preemption - -A workload can specify a priority by means of pod priorities, for instance for a -wrapped job: -```yaml -apiVersion: workload.codeflare.dev/v1beta2 -kind: AppWrapper -metadata: - name: sample-job -spec: - components: - - template: - # job specification - apiVersion: batch/v1 - kind: Job - metadata: - name: sample-job - spec: - template: - spec: - restartPolicy: Never - priorityClassName: high-priority # workload priority - containers: - - name: busybox - image: quay.io/project-codeflare/busybox:1.36 - command: ["sh", "-c", "sleep 5"] - resources: - requests: - cpu: 1 -``` -Workloads of equal priority are considered for admission in submission order. -Higher-priority workloads are considered for admission before lower-priority -workloads irrespective of arrival time. However, workloads that cannot be -admitted will not block the admission of newer and possibly lower-priority -workloads (if they fit within the quota). - -A workload will preempt lower-priority workloads in the same namespace to meet -its quota if necessary. It may also preempt newer, equal-priority workloads in -the same namespace. - -Preemption across namespaces can only be triggered by the reclamation of -borrowed quota, which is independent from priorities. - -## Fault-tolerance - -AppWrappers are the mechanism used by the MLBatch system to automate fault -detection and retry/recovery of executing workloads. By adding automation, we -can achieve higher levels of system utilization by greatly reducing the reliance -on constant human monitoring of workload health. AppWrappers should be used to -submit all workloads that are intended to run without close human supervision of -their progress. - -```mermaid ---- -title: Overview of AppWrapper Fault Tolerance Phase Transitions ---- -stateDiagram-v2 - - rn : Running - s : Succeeded - f : Failed - rt : Resetting - rs : Resuming - - %% Happy Path - rn --> s - - %% Requeuing - rn --> f : Retries Exceeded - rn --> rt : Workload Unhealthy - rt --> rs : All Resources Removed - rs --> rn : All Resources Recreated - - classDef quota fill:lightblue - class rs quota - class rn quota - class rt quota - - classDef failed fill:pink - class f failed - - classDef succeeded fill:lightgreen - class s succeeded -``` - -Throughout the execution of the workload, the AppWrapper controller -monitors the number and health of the workload's Pods. It also watches -the top-level created resources and for selected resources types -understands how to interpret their status information. This information -is combined to determine if a workload is unhealthy. A workload can be -deemed *unhealthy* if any of the following conditions are true: - + There are a non-zero number of `Failed` Pods. - + It takes longer than `AdmissionGracePeriod` for the expected - number of Pods to reach the `Pending` state. - + It takes longer than the `WarmupGracePeriod` for the expected - number of Pods to reach the `Running` state. - + If a non-zero number of `Running` Pods are using resources - that Autopilot has tagged as `NoExecute`. - + The status information of a batch/v1 Job or PyTorchJob indicates - that it has failed. - + A top-level wrapped resource is externally deleted. - -If a workload is determined to be unhealthy by one of the first three -Pod-level conditions above, the AppWrapper controller first waits for -a `FailureGracePeriod` to allow the primary resource controller an -opportunity to react and return the workload to a healthy state. 
The -`FailureGracePeriod` is elided for the remaining conditions because the -primary resource controller is not expected to take any further -action. If the `FailureGracePeriod` passes and the workload is still -unhealthy, the AppWrapper controller will *reset* the workload by -deleting its resources, waiting for a `RetryPausePeriod`, and then -creating new instances of the resources. During this retry pause, the -AppWrapper **does not** release the workload's quota; this ensures -that when the resources are recreated they will still have sufficient -quota to execute. The number of times an AppWrapper is reset is -tracked as part of its status; if the number of resets exceeds the -`RetryLimit`, then the AppWrapper moves into a `Failed` state and its -resources are deleted (thus finally releasing its quota). External deletion -of a top-level wrapped resource will cause the AppWrapper to directly enter -the `Failed` state independent of the `RetryLimit`. - -To support debugging `Failed` workloads, an annotation can be added to an -AppWrapper that adds a `DeletionOnFailureGracePeriod` between the time the -AppWrapper enters the `Failed` state and when the process of deleting its -resources begins. Since the AppWrapper continues to consume quota during this -delayed deletion period, this annotation should be used sparingly and only when -interactive debugging of the failed workload is being actively pursued. - -All child resources for an AppWrapper that successfully completed will be -automatically deleted after a `SuccessTTLPeriod` after the AppWrapper entered -the `Succeeded` state. - -The parameters of the retry loop described about are configured at the system -level, but can be customized by the user on a per-AppWrapper basis by adding -annotations. The table below lists the parameters, gives their default, and the -annotation that can be used to customize them. The MLBatch Helm chart also -supports customization these values. - -| Parameter | Default Value | Annotation | -|------------------------------|---------------|------------------------------------------------------------------------| -| AdmissionGracePeriod | 1 Minute | workload.codeflare.dev.appwrapper/admissionGracePeriodDuration | -| WarmupGracePeriod | 5 Minutes | workload.codeflare.dev.appwrapper/warmupGracePeriodDuration | -| FailureGracePeriod | 1 Minute | workload.codeflare.dev.appwrapper/failureGracePeriodDuration | -| RetryPausePeriod | 90 Seconds | workload.codeflare.dev.appwrapper/retryPausePeriodDuration | -| RetryLimit | 3 | workload.codeflare.dev.appwrapper/retryLimit | -| DeletionOnFailureGracePeriod | 0 Seconds | workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration | -| SuccessTTL | 7 Days | workload.codeflare.dev.appwrapper/successTTLDuration | -| GracePeriodMaximum | 24 Hours | Not Applicable | - -The `GracePeriodMaximum` imposes a system-wide upper limit on all other grace -periods to limit the potential impact of user-added annotations on overall -system utilization. diff --git a/figures/CohortWithSlackCQ.png b/figures/CohortWithSlackCQ.png deleted file mode 100644 index 43b88f6..0000000 Binary files a/figures/CohortWithSlackCQ.png and /dev/null differ diff --git a/index.html b/index.html new file mode 100644 index 0000000..a360974 --- /dev/null +++ b/index.html @@ -0,0 +1,10 @@ + + + + + +

+    This project's GitHub pages are only used as a Helm repository.
+    Please see the GitHub Project for all user-facing content.
+ + diff --git a/index.yaml b/index.yaml new file mode 100644 index 0000000..85a5c93 --- /dev/null +++ b/index.yaml @@ -0,0 +1,125 @@ +apiVersion: v1 +entries: + pytorchjob-generator: + - apiVersion: v2 + appVersion: v1beta2 + created: "2025-04-17T15:21:49.560503007Z" + description: An AppWrapper generator for PyTorchJobs + digest: f337335a00711773647ad4badc94bf0fbd223d475f7c15c046da2b294dbea883 + name: pytorchjob-generator + type: application + urls: + - pytorchjob-generator-1.1.9.tgz + version: 1.1.9 + - apiVersion: v2 + appVersion: v1beta2 + created: "2025-03-11T15:02:32.514276254Z" + description: An AppWrapper generator for PyTorchJobs + digest: acc7b8406e7affb48ce042a26f95d22087e89731626eab6c69f95a9262778642 + name: pytorchjob-generator + type: application + urls: + - pytorchjob-generator-1.1.8.tgz + version: 1.1.8 + - apiVersion: v2 + appVersion: v1beta2 + created: "2025-01-17T16:39:09.676825318Z" + description: An AppWrapper generator for PyTorchJobs + digest: 11ffcfa4de8f8693555b589506176b6b5e2a853a095d5760b59e3f29499e1937 + name: pytorchjob-generator + type: application + urls: + - pytorchjob-generator-1.1.7.tgz + version: 1.1.7 + - apiVersion: v2 + appVersion: v1beta2 + created: "2024-12-19T18:08:41.657987953Z" + description: An AppWrapper generator for PyTorchJobs + digest: c234a289e554cfa242961aefe078d30f488acac4186e740acf877be4495c076b + name: pytorchjob-generator + type: application + urls: + - pytorchjob-generator-1.1.6.tgz + version: 1.1.6 + - apiVersion: v2 + appVersion: v1beta2 + created: "2024-11-20T22:13:02.06061729Z" + description: An AppWrapper generator for PyTorchJobs + digest: 43fb65fed7977e694561a966e6627f356e614c28099635deeea414e5be520041 + name: pytorchjob-generator + type: application + urls: + - pytorchjob-generator-1.1.5.tgz + version: 1.1.5 + - apiVersion: v2 + appVersion: v1beta2 + created: "2024-11-19T21:41:47.613139069Z" + description: An AppWrapper generator for PyTorchJobs + digest: 71b89d5be657f20bf73e2e84080f48f508f77f03bbb804f1ce05c32e449eb082 + name: pytorchjob-generator + type: application + urls: + - pytorchjob-generator-1.1.4.tgz + version: 1.1.4 + - apiVersion: v2 + appVersion: v1beta2 + created: "2024-11-11T21:54:33.70476505Z" + description: An AppWrapper generator for PyTorchJobs + digest: 842697095572d3fc52eeb730eb400f638bd8090b9f114692f0a1c94fe64dd6bf + name: pytorchjob-generator + type: application + urls: + - pytorchjob-generator-1.1.3.tgz + version: 1.1.3 + - apiVersion: v2 + appVersion: v1beta2 + created: "2024-10-22T21:33:02.486129435Z" + description: An AppWrapper generator for PyTorchJobs + digest: 4be744b1d7ecea211bec9c620dc78f9ccc5f6498d45ded22446989958d1634af + name: pytorchjob-generator + type: application + urls: + - pytorchjob-generator-1.1.2.tgz + version: 1.1.2 + - apiVersion: v2 + appVersion: v1beta2 + created: "2024-10-02T16:03:18.169906585Z" + description: An AppWrapper generator for PyTorchJobs + digest: 9ab17b12786dfef7e4ef80d7e276c912f19951fbacb10bc96b3323d3039c3164 + name: pytorchjob-generator + type: application + urls: + - pytorchjob-generator-1.1.1.tgz + version: 1.1.1 + - apiVersion: v2 + appVersion: v1beta2 + created: "2024-09-17T18:17:07.36957093Z" + description: An AppWrapper generator for PyTorchJobs + digest: 143775437649ae7354ec3845f8f5bdc328334cb34cb2ab22daeb2afc20534252 + name: pytorchjob-generator + type: application + urls: + - pytorchjob-generator-1.1.0.tgz + version: 1.1.0 + - apiVersion: v2 + appVersion: v1beta2 + created: "2024-06-28T20:59:24.791117701Z" + description: An AppWrapper generator for 
PyTorchJobs + digest: 09bfee511e20c00ebfbcd0da2b225aa298ddeed08c851963f2c0330672e59995 + name: pytorchjob-generator + type: application + urls: + - pytorchjob-generator-1.0.0.tgz + version: 1.0.0 + sakkara-scheduler: + - apiVersion: v2 + appVersion: v0.29.7 + created: "2025-03-13T00:36:06.475815135Z" + description: Deploy sakkara group and topology aware scheduler plugin in a cluster + digest: 8cc9e150054292d005e923cbc684be42496defb73e9819152113e07590e7a57c + name: sakkara-scheduler + type: application + urls: + - sakkara-scheduler-0.0.1.tgz + version: 0.0.1 +generated: "2025-04-17T15:21:49.560549655Z" diff --git a/pytorchjob-generator-1.0.0.tgz b/pytorchjob-generator-1.0.0.tgz new file mode 100644 index 0000000..839204d Binary files /dev/null and b/pytorchjob-generator-1.0.0.tgz differ diff --git a/pytorchjob-generator-1.1.0.tgz b/pytorchjob-generator-1.1.0.tgz new file mode 100644 index 0000000..ba17d8b Binary files /dev/null and b/pytorchjob-generator-1.1.0.tgz differ diff --git a/pytorchjob-generator-1.1.1.tgz b/pytorchjob-generator-1.1.1.tgz new file mode 100644 index 0000000..8e3b2ff Binary files /dev/null and b/pytorchjob-generator-1.1.1.tgz differ diff --git a/pytorchjob-generator-1.1.2.tgz b/pytorchjob-generator-1.1.2.tgz new file mode 100644 index 0000000..4d99578 Binary files /dev/null and b/pytorchjob-generator-1.1.2.tgz differ diff --git a/pytorchjob-generator-1.1.3.tgz b/pytorchjob-generator-1.1.3.tgz new file mode 100644 index 0000000..37b26c3 Binary files /dev/null and b/pytorchjob-generator-1.1.3.tgz differ diff --git a/pytorchjob-generator-1.1.4.tgz b/pytorchjob-generator-1.1.4.tgz new file mode 100644 index 0000000..0927475 Binary files /dev/null and b/pytorchjob-generator-1.1.4.tgz differ diff --git a/pytorchjob-generator-1.1.5.tgz b/pytorchjob-generator-1.1.5.tgz new file mode 100644 index 0000000..b9eab6b Binary files /dev/null and b/pytorchjob-generator-1.1.5.tgz differ diff --git a/pytorchjob-generator-1.1.6.tgz b/pytorchjob-generator-1.1.6.tgz new file mode 100644 index 0000000..8023190 Binary files /dev/null and b/pytorchjob-generator-1.1.6.tgz differ diff --git a/pytorchjob-generator-1.1.7.tgz b/pytorchjob-generator-1.1.7.tgz new file mode 100644 index 0000000..d0cd79d Binary files /dev/null and b/pytorchjob-generator-1.1.7.tgz differ diff --git a/pytorchjob-generator-1.1.8.tgz b/pytorchjob-generator-1.1.8.tgz new file mode 100644 index 0000000..4896149 Binary files /dev/null and b/pytorchjob-generator-1.1.8.tgz differ diff --git a/pytorchjob-generator-1.1.9.tgz b/pytorchjob-generator-1.1.9.tgz new file mode 100644 index 0000000..0c71eb1 Binary files /dev/null and b/pytorchjob-generator-1.1.9.tgz differ diff --git a/sakkara-scheduler-0.0.1.tgz b/sakkara-scheduler-0.0.1.tgz new file mode 100644 index 0000000..3e91b76 Binary files /dev/null and b/sakkara-scheduler-0.0.1.tgz differ diff --git a/samples/job-with-podgroup.yaml b/samples/job-with-podgroup.yaml deleted file mode 100644 index 653c2ad..0000000 --- a/samples/job-with-podgroup.yaml +++ /dev/null @@ -1,36 +0,0 @@ -apiVersion: workload.codeflare.dev/v1beta2 -kind: AppWrapper -metadata: - name: sample-job -spec: - components: - - template: - # pod group specification - apiVersion: scheduling.x-k8s.io/v1alpha1 - kind: PodGroup - metadata: - name: sample-job - spec: - minMember: 2 # replica count - - template: - # job specification - apiVersion: batch/v1 - kind: Job - metadata: - name: sample-job - spec: - parallelism: 2 # replica count - completions: 2 # replica count - template: - metadata: - labels: - 
scheduling.x-k8s.io/pod-group: sample-job # pod group label - spec: - restartPolicy: Never - containers: - - name: busybox - image: quay.io/project-codeflare/busybox:1.36 - command: ["sh", "-c", "sleep 5"] - resources: - requests: - cpu: 1 diff --git a/samples/job.yaml b/samples/job.yaml deleted file mode 100644 index 382c2aa..0000000 --- a/samples/job.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: workload.codeflare.dev/v1beta2 -kind: AppWrapper -metadata: - name: sample-job -spec: - components: - - template: - # job specification - apiVersion: batch/v1 - kind: Job - metadata: - name: sample-job - spec: - template: - spec: - restartPolicy: Never - containers: - - name: busybox - image: quay.io/project-codeflare/busybox:1.36 - command: ["sh", "-c", "sleep 5"] - resources: - requests: - cpu: 1 diff --git a/samples/pod.yaml b/samples/pod.yaml deleted file mode 100644 index 6406a93..0000000 --- a/samples/pod.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: workload.codeflare.dev/v1beta2 -kind: AppWrapper -metadata: - name: sample-pod -spec: - components: - - template: - # pod specification - apiVersion: v1 - kind: Pod - metadata: - name: sample-pod - spec: - restartPolicy: Never - containers: - - name: busybox - image: quay.io/project-codeflare/busybox:1.36 - command: ["sh", "-c", "sleep 5"] - resources: - requests: - cpu: 1 diff --git a/samples/pytorchjob-in-appwrapper.yaml b/samples/pytorchjob-in-appwrapper.yaml deleted file mode 100644 index 2ded053..0000000 --- a/samples/pytorchjob-in-appwrapper.yaml +++ /dev/null @@ -1,44 +0,0 @@ -apiVersion: workload.codeflare.dev/v1beta2 -kind: AppWrapper -metadata: - name: sample-aw-pytorchjob -spec: - components: - - template: - # job specification - apiVersion: kubeflow.org/v1 - kind: PyTorchJob - metadata: - name: sample-aw-pytorchjob - spec: - pytorchReplicaSpecs: - Master: - replicas: 1 - restartPolicy: OnFailure - template: - spec: - containers: - - name: pytorch - image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v1beta1-fc858d1 - command: - - "python3" - - "/opt/pytorch-mnist/mnist.py" - - "--epochs=1" - resources: - requests: - cpu: 1 - Worker: - replicas: 1 - restartPolicy: OnFailure - template: - spec: - containers: - - name: pytorch - image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v1beta1-fc858d1 - command: - - "python3" - - "/opt/pytorch-mnist/mnist.py" - - "--epochs=1" - resources: - requests: - cpu: 1 diff --git a/samples/pytorchjob.yaml b/samples/pytorchjob.yaml deleted file mode 100644 index 52f7bee..0000000 --- a/samples/pytorchjob.yaml +++ /dev/null @@ -1,38 +0,0 @@ -apiVersion: kubeflow.org/v1 -kind: PyTorchJob -metadata: - name: sample-pytorchjob - labels: - kueue.x-k8s.io/queue-name: default-queue # queue name (required) -spec: - pytorchReplicaSpecs: - Master: - replicas: 1 - restartPolicy: OnFailure - template: - spec: - containers: - - name: pytorch - image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v1beta1-fc858d1 - command: - - "python3" - - "/opt/pytorch-mnist/mnist.py" - - "--epochs=1" - resources: - requests: - cpu: 1 - Worker: - replicas: 1 - restartPolicy: OnFailure - template: - spec: - containers: - - name: pytorch - image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v1beta1-fc858d1 - command: - - "python3" - - "/opt/pytorch-mnist/mnist.py" - - "--epochs=1" - resources: - requests: - cpu: 1 diff --git a/scheduler-plugins b/scheduler-plugins deleted file mode 160000 index 96a3366..0000000 --- a/scheduler-plugins +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 96a33663d5e57edf09d4e2817f841f25caf60229 diff --git 
a/setup.KubeConEU25/README.md b/setup.KubeConEU25/README.md deleted file mode 100644 index 03bb1bf..0000000 --- a/setup.KubeConEU25/README.md +++ /dev/null @@ -1,1122 +0,0 @@ -# MLBatch Tutorial - -MLBatch is the software stack we developed in IBM Research to facilitate the -setup, administration, and use of Kubernetes clusters dedicated to batch AI/ML -workloads. It leverages a number of community projects such as -[Kueue](https://kueue.sigs.k8s.io), [Kubeflow -Trainer](https://www.kubeflow.org/docs/components/training/), -[KubeRay](https://docs.ray.io/en/latest/cluster/kubernetes/index.html), and -[vLLM](https://docs.vllm.ai/en/latest/). It complements them with several -open-source components born in IBM Research including -[AutoPilot](https://github.com/IBM/autopilot), -[AppWrapper](https://project-codeflare.github.io/appwrapper/), and -[Sakkara](https://github.com/atantawi/4986-kep-sakkara). MLBatch manages teams, -queues, quotas, and resource allocation. It monitors key cluster components, -detecting faults and, to a degree, automating fault recovery. - -In this tutorial, we walk through all the steps necessary to set up MLBatch on a -Kubernetes cluster and run a few example workloads. -- We configure persistent storage using -[NFS](https://en.wikipedia.org/wiki/Network_File_System). -- We deploy MLBatch following the - [CLUSTER-SETUP.md](../setup.k8s/CLUSTER-SETUP.md) instructions. -- We configure example teams following the - [TEAM-SETUP.md](../setup.k8s/TEAM-SETUP.md) instructions. -- We reconfigure Autopilot to periodically assess the storage class in addition - to running network and GPU tests. _This is optional._ -- We deploy [Prometheus](https://prometheus.io) and [Grafana -dashboards](https://grafana.com/grafana/dashboards/) to monitor the health of -the cluster and GPU utilization. _This is optional._ -- We demonstrate the queuing, quota management, and fault recovery capabilities - of MLBatch using synthetic workloads. -- We run example workloads using vLLM, PyTorch, and Ray. - -## Cluster Characteristics - -Our target cluster comprises three control plane nodes and three worker nodes -running Kubernetes 1.29, specifically [OpenShift -4.16](https://docs.openshift.com/container-platform/4.16/release_notes/ocp-4-16-release-notes.html). - -
- -```sh -kubectl get nodes -``` -``` -NAME STATUS ROLES AGE VERSION -pokprod-b93r38s3 Ready worker 5d13h v1.29.11+148a389 -pokprod-b93r39s2 Ready worker 5d12h v1.29.11+148a389 -pokprod-b93r44s0 Ready worker 5d13h v1.29.11+148a389 -pokprod002ctrl0 Ready control-plane,master 5d15h v1.29.11+148a389 -pokprod002ctrl1 Ready control-plane,master 5d15h v1.29.11+148a389 -pokprod002ctrl2 Ready control-plane,master 5d15h v1.29.11+148a389 -``` -Each worker node is equipped with eight [NVIDIA -H100](https://www.nvidia.com/en-us/data-center/h100/) GPUs. -```sh -oc debug node/pokprod-b93r38s3 -- chroot /host lspci -d 10de: -``` -``` -Starting pod/pokprod-b93r38s3-debug-4bv4j ... -To use host binaries, run `chroot /host` -05:00.0 Bridge: NVIDIA Corporation GH100 [H100 NVSwitch] (rev a1) -06:00.0 Bridge: NVIDIA Corporation GH100 [H100 NVSwitch] (rev a1) -07:00.0 Bridge: NVIDIA Corporation GH100 [H100 NVSwitch] (rev a1) -08:00.0 Bridge: NVIDIA Corporation GH100 [H100 NVSwitch] (rev a1) -18:00.0 3D controller: NVIDIA Corporation GH100 [H100 SXM5 80GB] (rev a1) -2a:00.0 3D controller: NVIDIA Corporation GH100 [H100 SXM5 80GB] (rev a1) -3a:00.0 3D controller: NVIDIA Corporation GH100 [H100 SXM5 80GB] (rev a1) -5d:00.0 3D controller: NVIDIA Corporation GH100 [H100 SXM5 80GB] (rev a1) -9a:00.0 3D controller: NVIDIA Corporation GH100 [H100 SXM5 80GB] (rev a1) -ab:00.0 3D controller: NVIDIA Corporation GH100 [H100 SXM5 80GB] (rev a1) -ba:00.0 3D controller: NVIDIA Corporation GH100 [H100 SXM5 80GB] (rev a1) -db:00.0 3D controller: NVIDIA Corporation GH100 [H100 SXM5 80GB] (rev a1) - -Removing debug pod ... -``` -For this tutorial, we assume the [NVIDIA GPU -operator](https://docs.nvidia.com/datacenter/cloud-native/GPU-operator/latest/index.html) -is already -[installed](https://docs.nvidia.com/datacenter/cloud-native/GPU-operator/latest/getting-started.html) -on the cluster. While this cluster is capable of [GPU-direct RDMA (GDR) with -ROCE (RDMA over Converged -Ethernet)](https://medium.com/@sunyanan.choochotkaew1/unlocking-GPUdirect-rdma-on-roce-in-kubernetes-based-cluster-on-cloud-through-multi-nic-cni-1e69ffb96296), -we will not cover or rely on advanced networking configurations in this -tutorial. -```sh -kubectl get operators -A -``` -``` -NAME AGE -gpu-operator-certified.nvidia-gpu-operator 18h -nfd.openshift-nfd 18h -``` -```sh -kubectl get node pokprod-b93r38s3 -o yaml | yq .status.capacity -``` -``` -cpu: "224" -ephemeral-storage: 1873933640Ki -hugepages-1Gi: "0" -hugepages-2Mi: "0" -memory: 2113411288Ki -nvidia.com/gpu: "8" -pods: "250" -``` - - -
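As a quick sanity check before sizing any quotas, we can total the GPUs advertised by the worker nodes; the quota numbers used later in this tutorial (8 GPUs per team plus 8 GPUs of slack, 24 in total) assume this figure. This is a minimal sketch; it assumes `jq` is installed and that worker nodes carry the `node-role.kubernetes.io/worker` label shown in the node listing above.
```sh
# Sum nvidia.com/gpu capacity across the worker nodes (expected: 3 x 8 = 24).
kubectl get nodes -l node-role.kubernetes.io/worker -o json \
  | jq '[.items[].status.capacity["nvidia.com/gpu"] | tonumber] | add'
```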
- -## Persistent Storage Setup - -We assume storage is available by means of a preexisting -[NFS](https://en.wikipedia.org/wiki/Network_File_System) server. We configure -one storage class using the [NFS Subdir External -Provisioner](https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner). - -
- -```sh -helm repo add nfs-subdir-external-provisioner https://kubernetes-sigs.github.io/nfs-subdir-external-provisioner -helm repo update - -helm install -n nfs-provisioner pokprod nfs-subdir-external-provisioner/nfs-subdir-external-provisioner \ - --create-namespace \ - --set nfs.server=192.168.98.96 \ - --set nfs.path=/gpfs/fs_ec/pokprod002 \ - --set storageClass.name=nfs-client-pokprod \ - --set storageClass.provisionerName=k8s-sigs.io/pokprod-nfs-subdir-external-provisioner -``` -Make sure to set the `nfs.server` and `nfs.path` values to the right values for -your environment. -```sh -kubectl get storageclasses -``` -``` -NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE -nfs-client-pokprod k8s-sigs.io/pokprod-nfs-subdir-external-provisioner Delete Immediate true 11s -``` -OpenShift clusters require an additional configuration step to permit the -provisioner pod to mount the storage volume. -```sh -oc adm policy add-scc-to-user hostmount-anyuid \ - system:serviceaccount:nfs-provisioner:pokprod-nfs-subdir-external-provisioner -``` - -
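Before moving on, it can be useful to confirm that the new storage class actually provisions volumes. The sketch below creates, inspects, and deletes a small throwaway claim; the `nfs-test` name and 1Gi size are arbitrary choices for this check.
```sh
# Create a small test claim against the new storage class.
kubectl apply -f- << EOF
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: nfs-test
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 1Gi
  storageClassName: nfs-client-pokprod
EOF

# With Immediate volume binding, the claim should reach the Bound phase within seconds.
kubectl get pvc nfs-test

# Clean up the test claim.
kubectl delete pvc nfs-test
```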
- -## MLBatch Cluster Setup - -We deploy MLBatch to the cluster following -[CLUSTER-SETUP.md](../setup.k8s/CLUSTER-SETUP.md). - -
- -```sh -# Clone MLBatch repository -git clone --recursive https://github.com/project-codeflare/mlbatch.git -cd mlbatch - -# Setup priority classes -kubectl apply -f setup.k8s/mlbatch-priorities.yaml - -# Deploy scheduler-plugins -helm install scheduler-plugins -n scheduler-plugins --create-namespace \ - scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ \ - --set-json pluginConfig='[{"args":{"scoringStrategy":{"resources":[{"name":"nvidia.com/gpu","weight":1}],"requestedToCapacityRatio":{"shape":[{"utilization":0,"score":0},{"utilization":100,"score":10}]},"type":"RequestedToCapacityRatio"}},"name":"NodeResourcesFit"},{"args":{"permitWaitingTimeSeconds":300},"name":"Coscheduling"}]' - -# Patch scheduler-plugins pod priorities -kubectl patch deployment -n scheduler-plugins --type=json \ - --patch-file setup.k8s/scheduler-priority-patch.yaml scheduler-plugins-controller -kubectl patch deployment -n scheduler-plugins --type=json \ - --patch-file setup.k8s/scheduler-priority-patch.yaml scheduler-plugins-scheduler - -# Wait for scheduler-plugins pods to be ready -kubectl -n scheduler-plugins wait --timeout=300s --for=condition=Available deployments --all - -# Create mlbatch-system namespace -kubectl create namespace mlbatch-system - -# Deploy Kubeflow training operator -kubectl apply --server-side -k setup.k8s/training-operator/coscheduling - -# Deploy KubeRay -kubectl apply --server-side -k setup.k8s/kuberay - -# Deploy Kueue -kubectl apply --server-side -k setup.k8s/kueue - -# Wait for Kueue to be ready -kubectl -n mlbatch-system wait --timeout=300s --for=condition=Available deployments kueue-controller-manager - -# Deploy AppWrapper -kubectl apply --server-side -k setup.k8s/appwrapper/coscheduling - -# Deploy Autopilot -helm repo add autopilot https://ibm.github.io/autopilot/ -helm repo update - -helm upgrade -i autopilot -n autopilot autopilot/autopilot --create-namespace - -# Create Kueue's default flavor -kubectl apply -f setup.k8s/default-flavor.yaml - -# Setup mlbatch-edit-role -kubectl apply -f setup.k8s/mlbatch-edit-role.yaml -``` -We reserve 8 GPUs out of 24 for MLBatch's slack queue. -```yaml -kubectl apply -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: slack-cluster-queue -spec: - namespaceSelector: {} - cohort: default-cohort - preemption: - withinClusterQueue: LowerOrNewerEqualPriority - reclaimWithinCohort: Any - borrowWithinCohort: - policy: Never - resourceGroups: - - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "pods"] - flavors: - - name: default-flavor - resources: - - name: "cpu" - nominalQuota: 224 - - name: "memory" - nominalQuota: 2000G - - name: "nvidia.com/gpu" - nominalQuota: 8 - - name: "pods" - nominalQuota: 100 -EOF -``` - -
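To confirm the installation before onboarding any teams, a quick check along these lines can be used; pod names will of course differ on your cluster.
```sh
# All MLBatch controllers should be Running.
kubectl -n mlbatch-system get pods

# The slack cluster queue should exist with an 8 GPU nominal quota.
kubectl get clusterqueue slack-cluster-queue -o yaml | yq .spec.resourceGroups
```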
- -## MLBatch Teams Setup - -We configure team `blue` with user `alice` and `red` with user `bob` following -[TEAM-SETUP.md](../setup.k8s/TEAM-SETUP.md). Each team has a nominal quota of 8 -GPUs. - -
- -For `alice` in team `blue`: -```yaml -# Create namespaces -kubectl create ns blue - -# Label namespace -kubectl label namespace blue mlbatch-team-namespace=true - -# Create cluster queue -kubectl -n blue apply -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: blue-cluster-queue -spec: - namespaceSelector: {} - cohort: default-cohort - preemption: - withinClusterQueue: LowerOrNewerEqualPriority - reclaimWithinCohort: Any - borrowWithinCohort: - policy: Never - resourceGroups: - - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "pods"] - flavors: - - name: default-flavor - resources: - - name: "cpu" - nominalQuota: 224 - - name: "memory" - nominalQuota: 2000G - - name: "nvidia.com/gpu" - nominalQuota: 8 - - name: "pods" - nominalQuota: 100 -EOF - -# Create default queue for namespace -kubectl apply -n blue -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: LocalQueue -metadata: - name: default-queue -spec: - clusterQueue: blue-cluster-queue -EOF - -# Authorize alice -kubectl -n blue apply -f- << EOF -kind: RoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: alice -subjects: - - apiGroup: rbac.authorization.k8s.io - kind: User - name: alice -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: mlbatch-edit -EOF -``` -For `bob` in team `red`: -```yaml -kubectl create ns red - -kubectl label namespace red mlbatch-team-namespace=true - -kubectl apply -n red -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: red-cluster-queue -spec: - namespaceSelector: {} - cohort: default-cohort - preemption: - withinClusterQueue: LowerOrNewerEqualPriority - reclaimWithinCohort: Any - borrowWithinCohort: - policy: Never - resourceGroups: - - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "pods"] - flavors: - - name: default-flavor - resources: - - name: "cpu" - nominalQuota: 224 - - name: "memory" - nominalQuota: 2000G - - name: "nvidia.com/gpu" - nominalQuota: 8 - - name: "pods" - nominalQuota: 100 -EOF - -kubectl apply -n red -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: LocalQueue -metadata: - name: default-queue -spec: - clusterQueue: red-cluster-queue -EOF - -kubectl -n red apply -f- << EOF -kind: RoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: bob -subjects: - - apiGroup: rbac.authorization.k8s.io - kind: User - name: bob -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: mlbatch-edit -EOF -``` -While we gave permissions to Kubernetes users `alice` and `bob`, we have not -tied these names to any identity provider as the details of this setup are not -portable. In this tutorial, we will rely on [user -impersonation](https://kubernetes.io/docs/reference/access-authn-authz/authentication/#user-impersonation) -with `kubectl` to run as a specific user. - -
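Before submitting any workloads, it is worth checking that the role bindings behave as intended. The commands below are a minimal sketch using the same `--as` impersonation mechanism; they assume the `mlbatch-edit` role grants create access to AppWrappers.
```sh
# Alice can create AppWrappers in her own namespace...
kubectl auth can-i create appwrappers.workload.codeflare.dev -n blue --as alice   # yes

# ...but not in the red team's namespace.
kubectl auth can-i create appwrappers.workload.codeflare.dev -n red --as alice    # no

# Likewise for Bob in his namespace.
kubectl auth can-i create appwrappers.workload.codeflare.dev -n red --as bob      # yes
```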
- -## Extended Autopilot Setup - -Optionally, we configure Autopilot to test PVC creation and deletion with the -`nfs-client-pokprod` storage class. - -
- -First, create the extended Autopilot configuration. -```sh -cat << EOF > autopilot-extended.yaml -env: - - name: "PERIODIC_CHECKS" - value: "pciebw,remapped,dcgm,ping,gpupower,pvc" - - name: "PVC_TEST_STORAGE_CLASS" - value: "nfs-client-pokprod" -EOF -``` -Then reapply the Helm chart; this will start a rolling update of the Autopilot pods. -```sh -helm upgrade -i autopilot autopilot/autopilot -n autopilot --create-namespace -f autopilot-extended.yaml -``` - -
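After the rollout completes, a rough way to verify that the Autopilot pods picked up the new configuration is to look for the updated environment variable; this assumes the chart exposes `PERIODIC_CHECKS` directly on the Autopilot containers as configured above.
```sh
# Wait for the Autopilot pods to roll over and settle.
kubectl -n autopilot get pods

# Confirm the periodic checks now include the pvc test.
kubectl -n autopilot describe pods | grep PERIODIC_CHECKS
```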
- -## Monitoring Setup - -Optionally, we deploy [Prometheus](https://prometheus.io) and [Grafana -dashboards](https://grafana.com/grafana/dashboards/) to the cluster. - -
- -We follow the setup provided by the `prometheus-community/kube-prometheus-stack` -Helm chart. - -```sh -helm repo add prometheus-community https://prometheus-community.github.io/helm-charts && helm repo update -``` - -The charts will install: Prometheus, Grafana, Alert Manager, Prometheus Node -Exporter and Kube State Metrics. We set up the chart with the following: - -- Persistent storage for Prometheus, Grafana and Alert Manager; -- Override the Prometheus Node Exporter port; -- Disable CRDs creation as they are already present. - -You may leave the CRDs creation on, along with the default Node Exporter pod. -These changes are needed when deploying a separate Prometheus instance in -OpenShift. - -```sh -cat << EOF > config.yaml -crds: - enabled: false - -prometheus-node-exporter: - service: - port: 9110 - -alertmanager: - alertmanagerSpec: - storage: - volumeClaimTemplate: - spec: - storageClassName: nfs-client-pokprod - accessModes: ["ReadWriteOnce"] - resources: - requests: - storage: 50Gi - -prometheus: - prometheusSpec: - storageSpec: - volumeClaimTemplate: - spec: - storageClassName: nfs-client-pokprod - accessModes: ["ReadWriteOnce"] - resources: - requests: - storage: 50Gi - emptyDir: - medium: Memory - -grafana: - persistence: - enabled: true - type: sts - storageClassName: "nfs-client-pokprod" - accessModes: - - ReadWriteOnce - size: 20Gi - finalizers: - - kubernetes.io/pvc-protection -EOF - -helm upgrade -i kube-prometheus-stack -n prometheus prometheus-community/kube-prometheus-stack --create-namespace -f config.yaml -``` - -If deploying on OpenShift based systems, you need to assign the privileged -security context to the service accounts that are created by the helm chart. - -```sh -oc adm policy add-scc-to-user privileged system:serviceaccount:prometheus:kube-prometheus-stack-admission system:serviceaccount:prometheus:kube-prometheus-stack-alertmanager system:serviceaccount:prometheus:kube-prometheus-stack-grafana system:serviceaccount:prometheus:kube-prometheus-stack-kube-state-metrics system:serviceaccount:prometheus:kube-prometheus-stack-operator system:serviceaccount:prometheus:kube-prometheus-stack-prometheus system:serviceaccount:prometheus:kube-prometheus-stack-prometheus-node-exporter -``` - -You should expect the following pods: - -```sh -kubectl get pods -``` -```sh -NAME READY STATUS RESTARTS AGE -alertmanager-kube-prometheus-stack-alertmanager-0 2/2 Running 0 16m -kube-prometheus-stack-grafana-0 3/3 Running 0 16m -kube-prometheus-stack-kube-state-metrics-6f76b98d89-pxs69 1/1 Running 0 16m -kube-prometheus-stack-operator-7fbfc985bb-mm9bk 1/1 Running 0 16m -kube-prometheus-stack-prometheus-node-exporter-44llp 1/1 Running 0 16m -kube-prometheus-stack-prometheus-node-exporter-95gp8 1/1 Running 0 16m -kube-prometheus-stack-prometheus-node-exporter-dxf5f 1/1 Running 0 16m -kube-prometheus-stack-prometheus-node-exporter-f45dx 1/1 Running 0 16m -kube-prometheus-stack-prometheus-node-exporter-pfrzk 1/1 Running 0 16m -kube-prometheus-stack-prometheus-node-exporter-zpfzb 1/1 Running 0 16m -prometheus-kube-prometheus-stack-prometheus-0 2/2 Running 0 16m -``` - -To access the Grafana dashboard on `localhost:3000`: - -```sh -kubectl -n prometheus get secrets kube-prometheus-stack-grafana -o jsonpath="{.data.admin-password}" | base64 -d ; echo -``` -```sh -export POD_NAME=$(kubectl -n prometheus get pod -l "app.kubernetes.io/name=grafana,app.kubernetes.io/instance=kube-prometheus-stack" -oname) - kubectl -n prometheus port-forward $POD_NAME 3000 -``` - -To import NVidia and 
Autopilot metrics, from the Grafana dashboard: - -- Select the `+` drop-down menu on the top right and select **Import dashboard** -- In the `Grafana.com dashboard URL or ID` box, add - [https://grafana.com/grafana/dashboards/23123-autopilot-metrics/](https://grafana.com/grafana/dashboards/23123-autopilot-metrics/) - and click Load, then repeat with the NVidia dashboard - [https://grafana.com/grafana/dashboards/12239-nvidia-dcgm-exporter-dashboard/](https://grafana.com/grafana/dashboards/12239-nvidia-dcgm-exporter-dashboard/) - -To visualize the metrics, we need to label the service monitor objects in both the -`autopilot` and `nvidia-gpu-operator` namespaces with the Prometheus release -name. - -```sh -kubectl label servicemonitors.monitoring.coreos.com -n autopilot autopilot-metrics-monitor release=kube-prometheus-stack --overwrite -``` -```sh -kubectl label servicemonitors.monitoring.coreos.com -n nvidia-gpu-operator nvidia-dcgm-exporter gpu-operator nvidia-node-status-exporter release=kube-prometheus-stack --overwrite -``` - -
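As a rough check that Prometheus will scrape both exporters, list the service monitors carrying the release label we just applied; both the Autopilot and the NVIDIA monitors should appear.
```sh
kubectl get servicemonitors.monitoring.coreos.com -A -l release=kube-prometheus-stack
```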
- -## Workload Management - -We will now demonstrate the queuing, quota management, and fault recovery capabilities of MLBatch -using synthetic workloads. - -
-For this portion of the tutorial, we will use variations on the simple batch/v1 Job shown below. -All variations will create multiple pods, each requesting some number of GPUs, and sleep for -a specified interval before completing successfully. - -```yaml -apiVersion: workload.codeflare.dev/v1beta2 -kind: AppWrapper -metadata: - generateName: - labels: - kueue.x-k8s.io/queue-name: default-queue -spec: - components: - - template: - apiVersion: batch/v1 - kind: Job - metadata: - generateName: - spec: - completions: - parallelism: - template: - spec: - restartPolicy: Never - terminationGracePeriodSeconds: 0 - priorityClassName: - containers: - - name: busybox - image: quay.io/project-codeflare/busybox:1.36 - command: ["sh", "-c", "sleep 600"] - resources: - limits: - nvidia.com/gpu: 4 -``` - -We will use four types of jobs: - -| Job Type | Priority | Duration | Number of Pods | GPU Usage | -|----------|----------|----------|----------------|------------| -| short | normal | 30s | 2 | 2 x 4 = 8 | -| normal | normal | 600s | 2 | 2 x 4 = 8 | -| important| high | 600s | 2 | 2 x 4 = 8 | -| large | normal | 600s | 4 | 4 x 4 = 16 | - -### Queuing - -First, Alice will submit a burst of short-running jobs that exceeds -the number of available GPUs in the cluster. The excess jobs will be -suspended by Kueue and admitted in turn as resources become available. - -```sh -kubectl create -f ./setup.KubeConEU25/sample-jobs/short.yaml -n blue --as alice -kubectl create -f ./setup.KubeConEU25/sample-jobs/short.yaml -n blue --as alice -kubectl create -f ./setup.KubeConEU25/sample-jobs/short.yaml -n blue --as alice -kubectl create -f ./setup.KubeConEU25/sample-jobs/short.yaml -n blue --as alice -kubectl create -f ./setup.KubeConEU25/sample-jobs/short.yaml -n blue --as alice -kubectl create -f ./setup.KubeConEU25/sample-jobs/short.yaml -n blue --as alice -kubectl create -f ./setup.KubeConEU25/sample-jobs/short.yaml -n blue --as alice -``` - -Since no one else is using the cluster, Alice is able to use -her blue team's quota of 8 GPUs and to borrow all 8 GPUs from the red team's quota -as well as the 8 GPUs allocated to the slack cluster queue. During this part of the demo, -we will start with 3 admitted jobs and 5 pending jobs on the blue cluster queue. Over -the next two minutes, the queue will drain as the short-running jobs complete and the -next pending job is admitted. - -### Borrowing and Preemption - -Alice will now submit 4 normal jobs. Again, with borrowing, three of these jobs -will be able to run immediately and the fourth job will be queued. - -```sh -kubectl create -f ./setup.KubeConEU25/sample-jobs/normal.yaml -n blue --as alice -kubectl create -f ./setup.KubeConEU25/sample-jobs/normal.yaml -n blue --as alice -kubectl create -f ./setup.KubeConEU25/sample-jobs/normal.yaml -n blue --as alice -kubectl create -f ./setup.KubeConEU25/sample-jobs/normal.yaml -n blue --as alice -``` - -Alice can use priorities to ensure her important jobs run quickly. - -```sh -kubectl create -f ./setup.KubeConEU25/sample-jobs/important.yaml -n blue --as alice -``` - -One of Alice's normal jobs is automatically suspended and put back on the queue of -waiting jobs to make its resources available for her high-priority job. - -Finally, Bob on the red team arrives at work and submits two jobs.
- -```sh -kubectl create -f ./setup.KubeConEU25/sample-jobs/normal.yaml -n red --as bob -kubectl create -f ./setup.KubeConEU25/sample-jobs/normal.yaml -n red --as bob -``` - -Kueue ensures that Bob has immediate access to his team's allocated quota -by evicting borrowing jobs. One of Alice's running -jobs is quickly suspended and returned to her team's queue of pending jobs. - -### Fault Tolerance - -In this scenario, we will start fresh with an empty cluster. Alice will submit -a single large job: - -```sh -kubectl create -f ./setup.KubeConEU25/sample-jobs/large.yaml -n blue --as alice -``` - -After the job is running, we will simulate Autopilot detecting a serious GPU failure -on a node by labeling it: - -```sh - kubectl label node autopilot.ibm.com/gpuhealth=EVICT --overwrite -``` - -MLBatch will automatically trigger a reset of all running jobs with Pods on -the impacted node. This reset first does a clean removal of all of the job's -Pods and then creates fresh versions of them. Since MLBatch automatically injects -the Kubernetes affinities shown below into all Pods it creates for user workloads, -the Kubernetes scheduler will avoid scheduling the new Pods on the impacted Node. -```yaml - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT -``` - -
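To watch the recovery from the command line, and to return the node to service once the (simulated) failure is resolved, something along these lines can be used; `NODE_NAME` is a placeholder for the node labeled above, and removing the label (note the trailing `-`) is one way to clear the simulated failure.
```sh
# Watch the AppWrapper go through its reset: the Pods are deleted and
# recreated away from the impacted node.
kubectl get appwrappers -n blue --as alice -w
kubectl get pods -n blue -o wide --as alice

# Once the node is healthy again, clear the simulated failure by removing the label.
kubectl label node NODE_NAME autopilot.ibm.com/gpuhealth-
```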
- -## Example Workloads - -We will now run some sample workloads that are representative of what is run -on an AI GPU cluster. - -### Batch Inference with vLLM - -In this example, `alice` runs a batch inference workload using -[vLLM](https://docs.vllm.ai/en/latest/) to serve IBM's -[granite-3.2-8b-instruct](https://huggingface.co/ibm-granite/granite-3.2-8b-instruct) -model. - -
- -First, `alice` creates a persistent volume claim to cache the model weights on -first invocation so that subsequent instantiations of the model will reuse the -cached model weights. -```yaml -kubectl apply --as alice -n blue -f- << EOF -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: granite-3.2-8b-instruct -spec: - accessModes: - - ReadWriteMany - resources: - requests: - storage: 50Gi - storageClassName: nfs-client-pokprod -EOF -``` -The workload wraps a Kubernetes Job in an AppWrapper. The Job consists of one -Pod with two containers. The `vllm` container runs the inference runtime using -an upstream `vllm-openai` image. The `load-generator` container submits a random -series of requests to the inference runtime and reports a number of metrics such -as _Time to First Token_ (TTFT) and _Time per Output Token_ (TPOT). -```yaml -kubectl apply --as alice -n blue -f- << EOF -apiVersion: workload.codeflare.dev/v1beta2 -kind: AppWrapper -metadata: - name: batch-inference -spec: - components: - - template: - apiVersion: batch/v1 - kind: Job - metadata: - name: batch-inference - spec: - template: - metadata: - labels: - app: batch-inference - spec: - restartPolicy: Never - containers: - - name: vllm - image: quay.io/tardieu/vllm-openai:v0.7.3 # vllm/vllm-openai:v0.7.3 - command: - # serve model and wait for halt signal - - sh - - -c - - | - vllm serve ibm-granite/granite-3.2-8b-instruct & - until [ -f /.config/halt ]; do sleep 1; done - ports: - - containerPort: 8000 - resources: - requests: - cpu: 4 - memory: 64Gi - nvidia.com/gpu: 1 - limits: - cpu: 4 - memory: 64Gi - nvidia.com/gpu: 1 - volumeMounts: - - name: cache - mountPath: /.cache - - name: config - mountPath: /.config - - name: load-generator - image: quay.io/tardieu/vllm-benchmarks:v0.7.3 - command: - # wait for vllm, submit batch of requests, send halt signal - - sh - - -c - - | - until nc -zv localhost 8000; do sleep 1; done; - python3 benchmark_serving.py \ - --model=ibm-granite/granite-3.2-8b-instruct \ - --backend=vllm \ - --dataset-name=random \ - --random-input-len=128 \ - --random-output-len=128 \ - --max-concurrency=16 \ - --num-prompts=512; - touch /.config/halt - volumeMounts: - - name: cache - mountPath: /.cache - - name: config - mountPath: /.config - volumes: - - name: cache - persistentVolumeClaim: - claimName: granite-3.2-8b-instruct - - name: config - emptyDir: {} -EOF -``` -The two containers are synchronized as follows: `load-generator` waits for -`vllm` to be ready to accept requests and, upon completion of the batch, signals -`vllm` to make it quit. - -Stream the logs of the `vllm` container with: -```sh -kubectl logs --as alice -n blue -l app=batch-inference -c vllm -f -``` -Stream the logs of the `load-generator` container with: -```sh -kubectl logs --as alice -n blue -l app=batch-inference -c load-generator -f -``` -Delete the complete workload with: -```sh -kubectl delete --as alice -n blue appwrapper batch-inference -``` - -
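While the batch is running, you can also poke at the inference server directly. The sketch below port-forwards to the Pod and sends a single request to vLLM's OpenAI-compatible completions endpoint; the Pod is looked up by the `app=batch-inference` label used above, and the prompt is just an example.
```sh
# Forward local port 8000 to the vllm container.
POD=$(kubectl get pod --as alice -n blue -l app=batch-inference -o name)
kubectl port-forward --as alice -n blue $POD 8000:8000 &

# Send a single completion request to the OpenAI-compatible API.
curl -s http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "ibm-granite/granite-3.2-8b-instruct", "prompt": "MLBatch is", "max_tokens": 32}'
```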
- -### Pre-Training with PyTorch - -In this example, `alice` uses the [Kubeflow Trainer](https://github.com/kubeflow/trainer) -to run a job that uses [PyTorch](https://pytorch.org) to train a machine learning model. - -
- -This example was constructed by converting a [PyTorch tutorial on FSDP](https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html) -into a Kubeflow Trainer [notebook](./sample-jobs/pytorch-training.ipynb) that we used to generate -the YAML for a `PyTorchJob`. The YAML generated by running the notebook was then put inside an -`AppWrapper` using MLBatch's [awpack tool](../tools/appwrapper-packager/awpack.py) to produce the final -[pytorch-training.yaml](sample-jobs/pytorch-training.yaml) that we will apply to run the workload. - -```sh -kubectl apply --as alice -n blue -f ./setup.KubeConEU25/sample-jobs/pytorch-training.yaml -``` - -This will create 2 Pods, each requesting 2 GPUs. On our cluster, it will take about 30 seconds -to execute this training workload. We can check on the status of the PyTorchJob by using the command: - -```sh -kubectl get pytorchjob -n blue --watch -``` - -After the job completes, we can get the log of the worker Pod with - -```sh -kubectl logs mnist-training-worker-0 -n blue -``` - -At the beginning of the log, we can see messages from each Python process -with its rank information: -```sh -... -FSDP Training for WORLD_SIZE: 4, RANK: 3, LOCAL_RANK: 1 -... -FSDP Training for WORLD_SIZE: 4, RANK: 2, LOCAL_RANK: 0 -``` -And at the end of the log, we can see the messages from the `LOCAL_RANK` `0` -process summarizing each epoch: -```sh -... - -Train Epoch: 1 Loss: 0.247396 -Test set: Average loss: 0.0498, Accuracy: 9824/10000 (98.24%) - -Train Epoch: 2 Loss: 0.070375 -Test set: Average loss: 0.0355, Accuracy: 9874/10000 (98.74%) - -Train Epoch: 3 Loss: 0.047944 -Test set: Average loss: 0.0291, Accuracy: 9900/10000 (99.00%) - -Train Epoch: 4 Loss: 0.038316 -Test set: Average loss: 0.0282, Accuracy: 9906/10000 (99.06%) - -Train Epoch: 5 Loss: 0.032751 -Test set: Average loss: 0.0276, Accuracy: 9906/10000 (99.06%) - -Train Epoch: 6 Loss: 0.028068 -Test set: Average loss: 0.0275, Accuracy: 9905/10000 (99.05%) - -Train Epoch: 7 Loss: 0.028161 -Test set: Average loss: 0.0254, Accuracy: 9916/10000 (99.16%) - -Train Epoch: 8 Loss: 0.025051 -Test set: Average loss: 0.0260, Accuracy: 9911/10000 (99.11%) - -Train Epoch: 9 Loss: 0.023851 -Test set: Average loss: 0.0264, Accuracy: 9916/10000 (99.16%) - -Train Epoch: 10 Loss: 0.023334 -Test set: Average loss: 0.0255, Accuracy: 9916/10000 (99.16%) -``` - -When we are all done, we can delete the completed `AppWrapper` with: - -```sh - kubectl delete appwrapper pytorch-mnist-training -n blue -``` -
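If you want to regenerate or adapt the wrapped YAML yourself, the awpack tool mentioned above takes the raw `PyTorchJob` manifest as input and emits the AppWrapper. A rough invocation is sketched below; the input file name `pytorchjob.yaml` is an assumption for wherever you saved the notebook's generated manifest.
```sh
# Wrap the generated PyTorchJob in an AppWrapper named pytorch-mnist-training.
cd tools/appwrapper-packager/
./awpack.py -o pytorch-training.yaml -n pytorch-mnist-training -i pytorchjob.yaml
```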
- -### Fine-Tuning with Ray - -In this example, `alice` uses [KubeRay](https://github.com/ray-project/kuberay) -to run a job that uses [Ray](https://github.com/ray-project/ray) to fine tune a -machine learning model. - -This workload is an adaptation from [this blog post by Red Hat](https://developers.redhat.com/articles/2024/09/30/fine-tune-llama-openshift-ai), in turn adapted from [an example on Ray documentation](https://github.com/ray-project/ray/tree/master/doc/source/templates/04_finetuning_llms_with_deepspeed). -The example is about fine tuning Llama 3.1 with Ray, with DeepSpeed and LoRA. - -
- -Let's set up the environment by installing Ray and cloning the repository - -```bash -uv venv myenv --python 3.12 --seed && source myenv/bin/activate && uv pip install ray datasets -``` - -We are going to impersonate Alice in this example. - -First, we create the PVC where we can download the model and save the checkpoints from the fine tuning job. We are calling this PVC `finetuning-pvc` and we need to add this to the Ray cluster YAML. If another name is used, please update the `claimName` entry in the Ray cluster definition. - -```bash -kubectl apply --as alice -n blue -f- << EOF -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: finetuning-pvc -spec: - accessModes: - - ReadWriteMany - resources: - requests: - storage: 100Gi - storageClassName: nfs-client-pokprod -EOF -``` - -Now, let's create an AppWrapper version of the Ray cluster. Notice that: - -- We are using the container image `quay.io/rhoai/ray:2.35.0-py311-cu121-torch24-fa26` from Red Hat, but you can use the images from DockerHub if preferred -- We are setting the number of worker replicas to `7`. Since we want to run on one GPU node, we are assigning one to the Ray Head pod, and one each to the 7 worker pods. - -```bash -cd tools/appwrapper-packager/ -cat << EOF > ray.yaml -apiVersion: ray.io/v1 -kind: RayCluster -metadata: - name: ray -spec: - headGroupSpec: - enableIngress: false - rayStartParams: - block: 'true' - dashboard-host: 0.0.0.0 - num-gpus: '1' - resources: '"{}"' - serviceType: ClusterIP - template: - metadata: {} - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - image: 'quay.io/rhoai/ray:2.35.0-py311-cu121-torch24-fa26' - imagePullPolicy: Always - lifecycle: - preStop: - exec: - command: - - /bin/sh - - '-c' - - ray stop - name: ray-head - ports: - - containerPort: 6379 - name: gcs - protocol: TCP - - containerPort: 8265 - name: dashboard - protocol: TCP - - containerPort: 10001 - name: client - protocol: TCP - resources: - limits: - cpu: '16' - memory: 256G - nvidia.com/gpu: '1' - requests: - cpu: '16' - memory: 128G - nvidia.com/gpu: '1' - volumeMounts: - - mountPath: /model - name: model - volumes: - - name: model - persistentVolumeClaim: - claimName: finetuning-pvc - rayVersion: 2.35.0 - workerGroupSpecs: - - groupName: small-group-ray - rayStartParams: - block: 'true' - num-gpus: '1' - resources: '"{}"' - replicas: 7 - scaleStrategy: {} - template: - metadata: {} - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - image: 'quay.io/rhoai/ray:2.35.0-py311-cu121-torch24-fa26' - imagePullPolicy: Always - lifecycle: - preStop: - exec: - command: - - /bin/sh - - '-c' - - ray stop - name: machine-learning - resources: - limits: - cpu: '16' - memory: 256G - nvidia.com/gpu: '1' - requests: - cpu: '16' - memory: 128G - nvidia.com/gpu: '1' - volumeMounts: - - mountPath: /model - name: model - volumes: - - name: model - persistentVolumeClaim: - claimName: finetuning-pvc -EOF -``` - -Now let's use the tool to create the appwrapper: - -```bash -./awpack.py -o ray-aw.yaml -n ray-appwrapper -i ray.yaml -``` - -Now we can submit the job while impersonating Alice - -```bash -kubectl create -f ray-aw.yaml -n blue --as alice -``` - -Now that the Ray cluster is set up, first we need to expose the `ray-head` service, as that is the entrypoint for all job submissions. 
In another terminal, type: - -```bash -kubectl port-forward svc/ray-head-svc 8265:8265 -n blue --as alice -``` - -Now we can download the git repository with the fine-tuning workload. - -```bash -git clone https://github.com/opendatahub-io/distributed-workloads -cd distributed-workloads/examples/ray-finetune-llm-deepspeed -``` - -We also create a Python program that launches the job in the Ray cluster using the Ray API. -Notice that: - -- We set `--num-devices=8` as it is the total number of accelerators being used by head and workers -- We set the `HF_HOME` to the shared PVC, so the model will be downloaded as a single instance and shared among all executors -- We set `epochs` to just one for a shorter run -- We use localhost as the entry point for submitting Ray jobs since we exposed the service earlier. - -```bash -cat << EOF > finetuning.py -import create_dataset -create_dataset.gsm8k_qa_no_tokens_template() - -from ray.job_submission import JobSubmissionClient - -client = JobSubmissionClient("http://127.0.0.1:8265") - -kick_off_pytorch_benchmark = ( - "git clone https://github.com/opendatahub-io/distributed-workloads || true;" - # Run the benchmark. - "python ray_finetune_llm_deepspeed.py" - " --model-name=meta-llama/Meta-Llama-3.1-8B --lora --num-devices=8 --num-epochs=1 --ds-config=./deepspeed_configs/zero_3_offload_optim_param.json --storage-path=/model/ --batch-size-per-device=32 --eval-batch-size-per-device=32" -) - - -submission_id = client.submit_job( - entrypoint=kick_off_pytorch_benchmark, - runtime_env={ - "env_vars": { - 'HF_HOME': "/model/ray_finetune_llm_deepspeed/cache/", - }, - 'pip': 'requirements.txt', - 'working_dir': './', - "excludes": ["/docs/", "*.ipynb", "*.md"] - }, -) - -print("Use the following command to follow this Job's logs:") -print(f"ray job logs '{submission_id}' --address http://127.0.0.1:8265 --follow") -EOF -python finetuning.py -``` -The expected output looks like the following: -```bash -2025-03-24 16:37:53,029 INFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_21ddaa8b13d30deb.zip. -2025-03-24 16:37:53,030 INFO packaging.py:575 -- Creating a file package for local module './'. -Use the following command to follow this Job's logs: -ray job logs 'raysubmit_C6hVCvdhpmapgQB8' --address http://127.0.0.1:8265 --follow -``` - -We can now either follow the logs in the terminal with the `ray job logs` command, or open the Ray dashboard at `http://localhost:8265` (available because we exposed the service earlier) and follow from there. - -Once the job is completed, the checkpoint with the fine-tuned model is saved in the folder -``` -/model/meta-llama/Meta-Llama-3.1-8B/TorchTrainer_/TorchTrainer_/checkpoint_ -``` -
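When the fine-tuning run is finished, the Ray cluster keeps holding its 8 GPUs until it is deleted, so clean up the AppWrapper and, if no longer needed, the PVC. A sketch, assuming the `-n ray-appwrapper` flag above set the AppWrapper name:
```sh
# Tear down the Ray cluster by deleting its AppWrapper.
kubectl delete appwrapper ray-appwrapper -n blue --as alice

# Optionally reclaim the storage used for the model and checkpoints.
kubectl delete pvc finetuning-pvc -n blue --as alice
```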
diff --git a/setup.KubeConEU25/UNINSTALL.md b/setup.KubeConEU25/UNINSTALL.md deleted file mode 100644 index 77f9858..0000000 --- a/setup.KubeConEU25/UNINSTALL.md +++ /dev/null @@ -1,49 +0,0 @@ -# Uninstall Procedure - -```sh -kubectl delete appwrappers --all -A -kubectl delete pvc -n blue -kubectl delete pvc -n red - -kubectl delete clusterqueues --all -A -kubectl delete localqueues --all -A -kubectl delete flavors --all -A - -kubectl delete rolebinding -n blue alice -kubectl delete rolebinding -n red bob -kubectl delete ns blue red - -kubectl delete -k setup.k8s/appwrapper/base -kubectl delete -k setup.k8s/kueue -kubectl delete -k setup.k8s/kuberay -kubectl delete -k setup.k8s/training-operator/base -kubectl delete ns mlbatch-system -kubectl delete clusterrole mlbatch-edit - -helm uninstall -n scheduler-plugins scheduler-plugins -kubectl delete ns scheduler-plugins - -helm uninstall -n autopilot autopilot -kubectl delete ns autopilot - -helm uninstall -n prometheus kube-prometheus-stack -kubectl delete pvc -n prometheus --all -kubectl delete ns prometheus - -helm uninstall -n nfs-provisioner pokprod -kubectl delete ns nfs-provisioner - -# OpenShift-specific steps - -oc adm policy remove-scc-from-user hostmount-anyuid \ - system:serviceaccount:nfs-provisioner:pokprod-nfs-subdir-external-provisioner - -oc adm policy remove-scc-from-user privileged \ - system:serviceaccount:prometheus:kube-prometheus-stack-admission \ - system:serviceaccount:prometheus:kube-prometheus-stack-alertmanager \ - system:serviceaccount:prometheus:kube-prometheus-stack-grafana \ - system:serviceaccount:prometheus:kube-prometheus-stack-kube-state-metrics \ - system:serviceaccount:prometheus:kube-prometheus-stack-operator \ - system:serviceaccount:prometheus:kube-prometheus-stack-prometheus \ - system:serviceaccount:prometheus:kube-prometheus-stack-prometheus-node-exporter -``` diff --git a/setup.KubeConEU25/sample-jobs/important.yaml b/setup.KubeConEU25/sample-jobs/important.yaml deleted file mode 100644 index 2e95d2f..0000000 --- a/setup.KubeConEU25/sample-jobs/important.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: workload.codeflare.dev/v1beta2 -kind: AppWrapper -metadata: - generateName: important - labels: - kueue.x-k8s.io/queue-name: default-queue -spec: - components: - - template: - apiVersion: batch/v1 - kind: Job - metadata: - generateName: important - spec: - completions: 2 - parallelism: 2 - template: - spec: - restartPolicy: Never - terminationGracePeriodSeconds: 0 - priorityClassName: high-priority - containers: - - name: busybox - image: quay.io/project-codeflare/busybox:1.36 - command: ["sh", "-c", "sleep 600"] - resources: - limits: - nvidia.com/gpu: 4 diff --git a/setup.KubeConEU25/sample-jobs/large.yaml b/setup.KubeConEU25/sample-jobs/large.yaml deleted file mode 100644 index bd5255a..0000000 --- a/setup.KubeConEU25/sample-jobs/large.yaml +++ /dev/null @@ -1,29 +0,0 @@ -apiVersion: workload.codeflare.dev/v1beta2 -kind: AppWrapper -metadata: - generateName: large - labels: - kueue.x-k8s.io/queue-name: default-queue - annotations: - workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 5s -spec: - components: - - template: - apiVersion: batch/v1 - kind: Job - metadata: - generateName: large - spec: - completions: 4 - parallelism: 4 - template: - spec: - restartPolicy: Never - terminationGracePeriodSeconds: 0 - containers: - - name: busybox - image: quay.io/project-codeflare/busybox:1.36 - command: ["sh", "-c", "sleep 600"] - resources: - limits: - nvidia.com/gpu: 4 diff --git 
a/setup.KubeConEU25/sample-jobs/normal.yaml b/setup.KubeConEU25/sample-jobs/normal.yaml deleted file mode 100644 index b02f64d..0000000 --- a/setup.KubeConEU25/sample-jobs/normal.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: workload.codeflare.dev/v1beta2 -kind: AppWrapper -metadata: - generateName: normal - labels: - kueue.x-k8s.io/queue-name: default-queue -spec: - components: - - template: - apiVersion: batch/v1 - kind: Job - metadata: - generateName: normal - spec: - completions: 2 - parallelism: 2 - template: - spec: - restartPolicy: Never - terminationGracePeriodSeconds: 0 - containers: - - name: busybox - image: quay.io/project-codeflare/busybox:1.36 - command: ["sh", "-c", "sleep 600"] - resources: - limits: - nvidia.com/gpu: 4 diff --git a/setup.KubeConEU25/sample-jobs/pytorch-training.ipynb b/setup.KubeConEU25/sample-jobs/pytorch-training.ipynb deleted file mode 100644 index 2fd86b6..0000000 --- a/setup.KubeConEU25/sample-jobs/pytorch-training.ipynb +++ /dev/null @@ -1,450 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Tune Model on MNIST dataset using PyTorchJob and FSDP" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This Notebook will tune a small model on the MNIST dataset using FSDP.\n", - "\n", - "This Notebook will use **4** GPUs to train the model on 2 Nodes. This example is based on [the official PyTorch FSDP tutorial](https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## FSDP with multi-node multi-worker training" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This Notebook demonstrates multi-node, multi-worker distributed training with Fully Sharded Data Parallel (FSDP) and PyTorchJob.\n", - "\n", - "When a model is trained with FSDP, the GPU memory footprint is smaller compare to Distributed Data Parallel (DDP),\n", - "as the model parameters are sharded across GPU devices.\n", - "\n", - "This enables training of very large models that would otherwise be impossible to fit on a single GPU device.\n", - "\n", - "Check this guide to learn more about PyTorch FSDP: https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "## Install the required packages\n", - "\n", - "Install the Kubeflow Training Python SDK." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# TODO (andreyvelich): Use the release version of SDK.\n", - "!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create script to train using MNIST using FSDP\n", - "\n", - "We need to wrap our fine-tuning script in a function to create Kubeflow PyTorchJob." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def train_function(parameters):\n", - " import os\n", - " import time\n", - " import functools\n", - "\n", - " import torch\n", - " import torch.nn as nn\n", - " import torch.nn.functional as F\n", - " import torch.optim as optim\n", - " from torchvision import datasets, transforms\n", - "\n", - " from torch.optim.lr_scheduler import StepLR\n", - "\n", - " import torch.distributed as dist\n", - " import torch.distributed as dist\n", - " import torch.multiprocessing as mp\n", - " from torch.nn.parallel import DistributedDataParallel as DDP\n", - " from torch.utils.data.distributed import DistributedSampler\n", - " from torch.distributed.fsdp import FullyShardedDataParallel as FSDP\n", - " from torch.distributed.fsdp.fully_sharded_data_parallel import (\n", - " CPUOffload,\n", - " BackwardPrefetch,\n", - " )\n", - " from torch.distributed.fsdp.wrap import (\n", - " size_based_auto_wrap_policy,\n", - " enable_wrap,\n", - " wrap,\n", - " )\n", - "\n", - " class Net(nn.Module):\n", - " def __init__(self):\n", - " super(Net, self).__init__()\n", - " self.conv1 = nn.Conv2d(1, 32, 3, 1)\n", - " self.conv2 = nn.Conv2d(32, 64, 3, 1)\n", - " self.dropout1 = nn.Dropout(0.25)\n", - " self.dropout2 = nn.Dropout(0.5)\n", - " self.fc1 = nn.Linear(9216, 128)\n", - " self.fc2 = nn.Linear(128, 10)\n", - "\n", - " def forward(self, x):\n", - "\n", - " x = self.conv1(x)\n", - " x = F.relu(x)\n", - " x = self.conv2(x)\n", - " x = F.relu(x)\n", - " x = F.max_pool2d(x, 2)\n", - " x = self.dropout1(x)\n", - " x = torch.flatten(x, 1)\n", - " x = self.fc1(x)\n", - " x = F.relu(x)\n", - " x = self.dropout2(x)\n", - " x = self.fc2(x)\n", - " output = F.log_softmax(x, dim=1)\n", - " return output\n", - " \n", - "\n", - " def train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=None):\n", - " model.train()\n", - " ddp_loss = torch.zeros(2).to(rank)\n", - " if sampler:\n", - " sampler.set_epoch(epoch)\n", - " for batch_idx, (data, target) in enumerate(train_loader):\n", - " data, target = data.to(rank), target.to(rank)\n", - " optimizer.zero_grad()\n", - " output = model(data)\n", - " loss = F.nll_loss(output, target, reduction='sum')\n", - " loss.backward()\n", - " optimizer.step()\n", - " ddp_loss[0] += loss.item()\n", - " ddp_loss[1] += len(data)\n", - "\n", - " dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM)\n", - " if rank == 0:\n", - " print('Train Epoch: {} \\tLoss: {:.6f}'.format(epoch, ddp_loss[0] / ddp_loss[1]))\n", - " \n", - " def test(model, rank, world_size, test_loader):\n", - " model.eval()\n", - " correct = 0\n", - " ddp_loss = torch.zeros(3).to(rank)\n", - " with torch.no_grad():\n", - " for data, target in test_loader:\n", - " data, target = data.to(rank), target.to(rank)\n", - " output = model(data)\n", - " ddp_loss[0] += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss\n", - " pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability\n", - " ddp_loss[1] += pred.eq(target.view_as(pred)).sum().item()\n", - " ddp_loss[2] += len(data)\n", - "\n", - " dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM)\n", - "\n", - " if rank == 0:\n", - " test_loss = ddp_loss[0] / ddp_loss[2]\n", - " print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\\n'.format(\n", - " test_loss, int(ddp_loss[1]), int(ddp_loss[2]),\n", - " 100. 
* ddp_loss[1] / ddp_loss[2]))\n", - "\n", - "\n", - " # [1] Setup PyTorch distributed and get the distributed parameters.\n", - " torch.manual_seed(parameters[\"seed\"])\n", - " dist.init_process_group(\"nccl\")\n", - " local_rank = int(os.environ[\"LOCAL_RANK\"])\n", - " rank = dist.get_rank()\n", - " world_size = dist.get_world_size()\n", - "\n", - " # Local rank identifies the GPU number inside the pod.\n", - " torch.cuda.set_device(local_rank)\n", - "\n", - " print(\n", - " f\"FSDP Training for WORLD_SIZE: {world_size}, RANK: {rank}, LOCAL_RANK: {local_rank}\"\n", - " )\n", - "\n", - " transform=transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.1307,), (0.3081,))\n", - " ])\n", - "\n", - " dataset1 = datasets.MNIST('../data', train=True, download=True,\n", - " transform=transform)\n", - " dataset2 = datasets.MNIST('../data', train=False,\n", - " transform=transform)\n", - "\n", - " sampler1 = DistributedSampler(dataset1, rank=rank, num_replicas=world_size, shuffle=True)\n", - " sampler2 = DistributedSampler(dataset2, rank=rank, num_replicas=world_size)\n", - "\n", - " train_kwargs = {'batch_size': parameters[\"batch-size\"], 'sampler': sampler1}\n", - " test_kwargs = {'batch_size': parameters[\"test-batch-size\"], 'sampler': sampler2}\n", - " cuda_kwargs = {'num_workers': 2,\n", - " 'pin_memory': True,\n", - " 'shuffle': False}\n", - " train_kwargs.update(cuda_kwargs)\n", - " test_kwargs.update(cuda_kwargs)\n", - "\n", - " train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)\n", - " test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)\n", - " my_auto_wrap_policy = functools.partial(\n", - " size_based_auto_wrap_policy, min_num_params=100\n", - " )\n", - "\n", - " init_start_event = torch.cuda.Event(enable_timing=True)\n", - " init_end_event = torch.cuda.Event(enable_timing=True)\n", - "\n", - " model = Net().to(local_rank)\n", - "\n", - " model = FSDP(model)\n", - "\n", - " optimizer = optim.Adadelta(model.parameters(), lr=parameters[\"lr\"])\n", - "\n", - " scheduler = StepLR(optimizer, step_size=1, gamma=parameters[\"gamma\"])\n", - " init_start_event.record()\n", - " for epoch in range(1, parameters[\"epochs\"] + 1):\n", - " train(parameters, model, local_rank, world_size, train_loader, optimizer, epoch, sampler=sampler1)\n", - " test(model, local_rank, world_size, test_loader)\n", - " scheduler.step()\n", - "\n", - " init_end_event.record()\n", - "\n", - " if rank == 0:\n", - " init_end_event.synchronize()\n", - " print(f\"CUDA event elapsed time: {init_start_event.elapsed_time(init_end_event) / 1000}sec\")\n", - " print(f\"{model}\")\n", - "\n", - " if parameters[\"save-model\"]:\n", - " # use a barrier to make sure training is done on all ranks\n", - " dist.barrier()\n", - " states = model.state_dict()\n", - " if rank == 0:\n", - " torch.save(states, \"mnist_cnn.pt\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Kubeflow PyTorchJob to train on MNIST with FSDP\n", - "\n", - "Use `TrainingClient()` to create PyTorchJob which will train on **2 workers** using **2 GPU** for each worker.\n", - "\n", - "If you don't have enough GPU resources, you can decrease number of workers or number of GPUs per worker." 
- ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from kubeflow.training import TrainingClient\n", - "\n", - "job_name = \"mnist-training\"\n", - "\n", - "parameters = {\n", - " \"batch-size\": 64,\n", - " \"test-batch-size\": 1000,\n", - " \"epochs\": 10,\n", - " \"lr\": 1.0,\n", - " \"gamma\": 0.7,\n", - " \"seed\": 1,\n", - " \"save-model\": False,\n", - "}\n" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Create the PyTorchJob.\n", - "TrainingClient().create_job(\n", - " name=job_name,\n", - " train_func=train_function,\n", - " parameters=parameters,\n", - " num_workers=2, # You can modify number of workers or number of GPUs.\n", - " num_procs_per_worker=2,\n", - " resources_per_worker={\"gpu\": 2},\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "### Check the PyTorchJob conditions\n", - "\n", - "Use `TrainingClient()` APIs to get information about created PyTorchJob." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "print(\"PyTorchJob Conditions\")\n", - "print(TrainingClient().get_job_conditions(job_name))\n", - "print(\"-\" * 40)\n", - "\n", - "# Wait until PyTorchJob has the Running condition.\n", - "job = TrainingClient().wait_for_job_conditions(\n", - " job_name,\n", - " expected_conditions={\"Running\"},\n", - ")\n", - "print(\"PyTorchJob is running\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get the PyTorchJob pod names\n", - "\n", - "Since we define 2 workers, PyTorchJob will create 1 master pod and 1 worker pod to run FSDP fine-tuning." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['mnist-training-master-0', 'mnist-training-worker-0']" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "TrainingClient().get_job_pod_names(job_name)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": { - "iopub.status.busy": "2022-09-01T20:10:25.759950Z", - "iopub.status.idle": "2022-09-01T20:10:25.760581Z", - "shell.execute_reply": "2022-09-01T20:10:25.760353Z", - "shell.execute_reply.started": "2022-09-01T20:10:25.760328Z" - }, - "tags": [] - }, - "source": [ - "### Get the PyTorchJob training logs\n", - "\n", - "Model parameters are sharded across all workers and GPU devices." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "logs, _ = TrainingClient().get_job_logs(job_name, follow=True)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": { - "iopub.execute_input": "2024-03-01T23:44:15.511173Z", - "iopub.status.busy": "2024-03-01T23:44:15.510932Z", - "iopub.status.idle": "2024-03-01T23:44:15.539921Z", - "shell.execute_reply": "2024-03-01T23:44:15.539352Z", - "shell.execute_reply.started": "2024-03-01T23:44:15.511155Z" - }, - "tags": [] - }, - "source": [ - "## Delete the PyTorchJob\n", - "\n", - "You can delete the created PyTorchJob." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "TrainingClient().delete_job(name=job_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pt-demo", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/setup.KubeConEU25/sample-jobs/pytorch-training.yaml b/setup.KubeConEU25/sample-jobs/pytorch-training.yaml deleted file mode 100644 index fc063e1..0000000 --- a/setup.KubeConEU25/sample-jobs/pytorch-training.yaml +++ /dev/null @@ -1,346 +0,0 @@ -apiVersion: workload.codeflare.dev/v1beta2 -kind: AppWrapper -metadata: - name: pytorch-mnist-training - labels: - kueue.x-k8s.io/queue-name: default-queue -spec: - components: - - template: - apiVersion: kubeflow.org/v1 - kind: PyTorchJob - metadata: - name: mnist-training - spec: - nprocPerNode: "2" - pytorchReplicaSpecs: - Master: - replicas: 1 - template: - metadata: - annotations: - sidecar.istio.io/inject: "false" - spec: - containers: - - args: - - |2- - program_path=$(mktemp -d) - read -r -d '' SCRIPT << EOM - def train_function(parameters): - import os - import time - import functools - import torch - import torch.nn as nn - import torch.nn.functional as F - import torch.optim as optim - from torchvision import datasets, transforms - from torch.optim.lr_scheduler import StepLR - import torch.distributed as dist - import torch.distributed as dist - import torch.multiprocessing as mp - from torch.nn.parallel import DistributedDataParallel as DDP - from torch.utils.data.distributed import DistributedSampler - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP - from torch.distributed.fsdp.fully_sharded_data_parallel import ( - CPUOffload, - BackwardPrefetch, - ) - from torch.distributed.fsdp.wrap import ( - size_based_auto_wrap_policy, - enable_wrap, - wrap, - ) - class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 32, 3, 1) - self.conv2 = nn.Conv2d(32, 64, 3, 1) - self.dropout1 = nn.Dropout(0.25) - self.dropout2 = nn.Dropout(0.5) - self.fc1 = nn.Linear(9216, 128) - self.fc2 = nn.Linear(128, 10) - def forward(self, x): - x = self.conv1(x) - x = F.relu(x) - x = self.conv2(x) - x = F.relu(x) - x = F.max_pool2d(x, 2) - x = self.dropout1(x) - x = torch.flatten(x, 1) - x = self.fc1(x) - x = F.relu(x) - x = self.dropout2(x) - x = self.fc2(x) - output = F.log_softmax(x, dim=1) - return output - def train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=None): - model.train() - ddp_loss = torch.zeros(2).to(rank) - if sampler: - sampler.set_epoch(epoch) - for batch_idx, (data, target) in enumerate(train_loader): - data, target = data.to(rank), target.to(rank) - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target, reduction='sum') - loss.backward() - optimizer.step() - ddp_loss[0] += loss.item() - ddp_loss[1] += len(data) - dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM) - if rank == 0: - print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch, ddp_loss[0] / ddp_loss[1])) - def test(model, rank, world_size, test_loader): - 
model.eval() - correct = 0 - ddp_loss = torch.zeros(3).to(rank) - with torch.no_grad(): - for data, target in test_loader: - data, target = data.to(rank), target.to(rank) - output = model(data) - ddp_loss[0] += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss - pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability - ddp_loss[1] += pred.eq(target.view_as(pred)).sum().item() - ddp_loss[2] += len(data) - dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM) - if rank == 0: - test_loss = ddp_loss[0] / ddp_loss[2] - print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format( - test_loss, int(ddp_loss[1]), int(ddp_loss[2]), - 100. * ddp_loss[1] / ddp_loss[2])) - # [1] Setup PyTorch distributed and get the distributed parameters. - torch.manual_seed(parameters["seed"]) - dist.init_process_group("nccl") - local_rank = int(os.environ["LOCAL_RANK"]) - rank = dist.get_rank() - world_size = dist.get_world_size() - # Local rank identifies the GPU number inside the pod. - torch.cuda.set_device(local_rank) - print( - f"FSDP Training for WORLD_SIZE: {world_size}, RANK: {rank}, LOCAL_RANK: {local_rank}" - ) - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ]) - dataset1 = datasets.MNIST('/tmp/data', train=True, download=True, - transform=transform) - dataset2 = datasets.MNIST('/tmp/data', train=False, - transform=transform) - sampler1 = DistributedSampler(dataset1, rank=rank, num_replicas=world_size, shuffle=True) - sampler2 = DistributedSampler(dataset2, rank=rank, num_replicas=world_size) - train_kwargs = {'batch_size': parameters["batch-size"], 'sampler': sampler1} - test_kwargs = {'batch_size': parameters["test-batch-size"], 'sampler': sampler2} - cuda_kwargs = {'num_workers': 2, - 'pin_memory': True, - 'shuffle': False} - train_kwargs.update(cuda_kwargs) - test_kwargs.update(cuda_kwargs) - train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) - test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) - my_auto_wrap_policy = functools.partial( - size_based_auto_wrap_policy, min_num_params=100 - ) - init_start_event = torch.cuda.Event(enable_timing=True) - init_end_event = torch.cuda.Event(enable_timing=True) - model = Net().to(local_rank) - model = FSDP(model) - optimizer = optim.Adadelta(model.parameters(), lr=parameters["lr"]) - scheduler = StepLR(optimizer, step_size=1, gamma=parameters["gamma"]) - init_start_event.record() - for epoch in range(1, parameters["epochs"] + 1): - train(parameters, model, local_rank, world_size, train_loader, optimizer, epoch, sampler=sampler1) - test(model, local_rank, world_size, test_loader) - scheduler.step() - init_end_event.record() - if rank == 0: - init_end_event.synchronize() - print(f"CUDA event elapsed time: {init_start_event.elapsed_time(init_end_event) / 1000}sec") - print(f"{model}") - if parameters["save-model"]: - # use a barrier to make sure training is done on all ranks - dist.barrier() - states = model.state_dict() - if rank == 0: - torch.save(states, "mnist_cnn.pt") - train_function({'batch-size': 64, 'test-batch-size': 1000, 'epochs': 10, 'lr': 1.0, 'gamma': 0.7, 'seed': 1, 'save-model': False}) - EOM - printf "%s" "$SCRIPT" > "$program_path/ephemeral_script.py" - torchrun "$program_path/ephemeral_script.py" - command: - - bash - - -c - image: docker.io/pytorch/pytorch:2.1.2-cuda11.8-cudnn8-runtime - name: pytorch - resources: - limits: - nvidia.com/gpu: "2" - requests: - nvidia.com/gpu: "2" - Worker: - 
replicas: 1 - template: - metadata: - annotations: - sidecar.istio.io/inject: "false" - spec: - containers: - - args: - - |2- - program_path=$(mktemp -d) - read -r -d '' SCRIPT << EOM - def train_function(parameters): - import os - import time - import functools - import torch - import torch.nn as nn - import torch.nn.functional as F - import torch.optim as optim - from torchvision import datasets, transforms - from torch.optim.lr_scheduler import StepLR - import torch.distributed as dist - import torch.distributed as dist - import torch.multiprocessing as mp - from torch.nn.parallel import DistributedDataParallel as DDP - from torch.utils.data.distributed import DistributedSampler - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP - from torch.distributed.fsdp.fully_sharded_data_parallel import ( - CPUOffload, - BackwardPrefetch, - ) - from torch.distributed.fsdp.wrap import ( - size_based_auto_wrap_policy, - enable_wrap, - wrap, - ) - class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 32, 3, 1) - self.conv2 = nn.Conv2d(32, 64, 3, 1) - self.dropout1 = nn.Dropout(0.25) - self.dropout2 = nn.Dropout(0.5) - self.fc1 = nn.Linear(9216, 128) - self.fc2 = nn.Linear(128, 10) - def forward(self, x): - x = self.conv1(x) - x = F.relu(x) - x = self.conv2(x) - x = F.relu(x) - x = F.max_pool2d(x, 2) - x = self.dropout1(x) - x = torch.flatten(x, 1) - x = self.fc1(x) - x = F.relu(x) - x = self.dropout2(x) - x = self.fc2(x) - output = F.log_softmax(x, dim=1) - return output - def train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=None): - model.train() - ddp_loss = torch.zeros(2).to(rank) - if sampler: - sampler.set_epoch(epoch) - for batch_idx, (data, target) in enumerate(train_loader): - data, target = data.to(rank), target.to(rank) - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target, reduction='sum') - loss.backward() - optimizer.step() - ddp_loss[0] += loss.item() - ddp_loss[1] += len(data) - dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM) - if rank == 0: - print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch, ddp_loss[0] / ddp_loss[1])) - def test(model, rank, world_size, test_loader): - model.eval() - correct = 0 - ddp_loss = torch.zeros(3).to(rank) - with torch.no_grad(): - for data, target in test_loader: - data, target = data.to(rank), target.to(rank) - output = model(data) - ddp_loss[0] += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss - pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability - ddp_loss[1] += pred.eq(target.view_as(pred)).sum().item() - ddp_loss[2] += len(data) - dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM) - if rank == 0: - test_loss = ddp_loss[0] / ddp_loss[2] - print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format( - test_loss, int(ddp_loss[1]), int(ddp_loss[2]), - 100. * ddp_loss[1] / ddp_loss[2])) - # [1] Setup PyTorch distributed and get the distributed parameters. - torch.manual_seed(parameters["seed"]) - dist.init_process_group("nccl") - local_rank = int(os.environ["LOCAL_RANK"]) - rank = dist.get_rank() - world_size = dist.get_world_size() - # Local rank identifies the GPU number inside the pod. 
- torch.cuda.set_device(local_rank) - print( - f"FSDP Training for WORLD_SIZE: {world_size}, RANK: {rank}, LOCAL_RANK: {local_rank}" - ) - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ]) - dataset1 = datasets.MNIST('/tmp/data', train=True, download=True, - transform=transform) - dataset2 = datasets.MNIST('/tmp/data', train=False, - transform=transform) - sampler1 = DistributedSampler(dataset1, rank=rank, num_replicas=world_size, shuffle=True) - sampler2 = DistributedSampler(dataset2, rank=rank, num_replicas=world_size) - train_kwargs = {'batch_size': parameters["batch-size"], 'sampler': sampler1} - test_kwargs = {'batch_size': parameters["test-batch-size"], 'sampler': sampler2} - cuda_kwargs = {'num_workers': 2, - 'pin_memory': True, - 'shuffle': False} - train_kwargs.update(cuda_kwargs) - test_kwargs.update(cuda_kwargs) - train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) - test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) - my_auto_wrap_policy = functools.partial( - size_based_auto_wrap_policy, min_num_params=100 - ) - init_start_event = torch.cuda.Event(enable_timing=True) - init_end_event = torch.cuda.Event(enable_timing=True) - model = Net().to(local_rank) - model = FSDP(model) - optimizer = optim.Adadelta(model.parameters(), lr=parameters["lr"]) - scheduler = StepLR(optimizer, step_size=1, gamma=parameters["gamma"]) - init_start_event.record() - for epoch in range(1, parameters["epochs"] + 1): - train(parameters, model, local_rank, world_size, train_loader, optimizer, epoch, sampler=sampler1) - test(model, local_rank, world_size, test_loader) - scheduler.step() - init_end_event.record() - if rank == 0: - init_end_event.synchronize() - print(f"CUDA event elapsed time: {init_start_event.elapsed_time(init_end_event) / 1000}sec") - print(f"{model}") - if parameters["save-model"]: - # use a barrier to make sure training is done on all ranks - dist.barrier() - states = model.state_dict() - if rank == 0: - torch.save(states, "mnist_cnn.pt") - train_function({'batch-size': 64, 'test-batch-size': 1000, 'epochs': 10, 'lr': 1.0, 'gamma': 0.7, 'seed': 1, 'save-model': False}) - EOM - printf "%s" "$SCRIPT" > "$program_path/ephemeral_script.py" - torchrun "$program_path/ephemeral_script.py" - command: - - bash - - -c - image: docker.io/pytorch/pytorch:2.1.2-cuda11.8-cudnn8-runtime - name: pytorch - resources: - limits: - nvidia.com/gpu: "2" - requests: - nvidia.com/gpu: "2" - runPolicy: - suspend: false diff --git a/setup.KubeConEU25/sample-jobs/short.yaml b/setup.KubeConEU25/sample-jobs/short.yaml deleted file mode 100644 index bef54fd..0000000 --- a/setup.KubeConEU25/sample-jobs/short.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: workload.codeflare.dev/v1beta2 -kind: AppWrapper -metadata: - generateName: short - labels: - kueue.x-k8s.io/queue-name: default-queue -spec: - components: - - template: - apiVersion: batch/v1 - kind: Job - metadata: - generateName: short - spec: - completions: 2 - parallelism: 2 - template: - spec: - restartPolicy: Never - terminationGracePeriodSeconds: 0 - containers: - - name: busybox - image: quay.io/project-codeflare/busybox:1.36 - command: ["sh", "-c", "sleep 30"] - resources: - limits: - nvidia.com/gpu: 4 diff --git a/setup.RHOAI-v2.16/CLUSTER-SETUP.md b/setup.RHOAI-v2.16/CLUSTER-SETUP.md deleted file mode 100644 index a4fcc0a..0000000 --- a/setup.RHOAI-v2.16/CLUSTER-SETUP.md +++ /dev/null @@ -1,171 +0,0 @@ -# Cluster Setup - -The cluster setup installs Red Hat 
OpenShift AI and configures Scheduler Plugins, Kueue, -cluster roles, and priority classes. - -## Priorities - -Create `default-priority`, `high-priority`, and `low-priority` priority classes: -```sh -oc apply -f setup.RHOAI-v2.16/mlbatch-priorities.yaml -``` - -## Scheduler Configuration - -MLBatch configures Kubernetes scheduling to accomplish two objectives: -+ Obtaining gang (all or nothing) scheduling for multi-Pod workloads. -+ Packing Pods whose GPU request is less than the number of GPUs on a Node to - maximize the number of Nodes available for Pods that request all the GPUs on a Node. - -This is done by installing the Coscheduling out-of-tree scheduler plugin and configuring -the default NodeResourcesFit scheduler plugin to pack in the GPU dimension. - - -```sh -helm install scheduler-plugins --namespace scheduler-plugins --create-namespace \ - scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ \ - --set-json pluginConfig='[{"args":{"scoringStrategy":{"resources":[{"name":"nvidia.com/gpu","weight":1}],"requestedToCapacityRatio":{"shape":[{"utilization":0,"score":0},{"utilization":100,"score":10}]},"type":"RequestedToCapacityRatio"}},"name":"NodeResourcesFit"},{"args":{"permitWaitingTimeSeconds":300},"name":"Coscheduling"}]' -``` -Patch scheduler-plugins pod priorities: -```sh -oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.16/scheduler-priority-patch.yaml scheduler-plugins-controller -oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.16/scheduler-priority-patch.yaml scheduler-plugins-scheduler -``` - - - -## Red Hat OpenShift AI - -Create the Red Hat OpenShift AI subscription: -```sh -oc apply -f setup.RHOAI-v2.16/mlbatch-subscription.yaml -```` -Identify install plan: -```sh -oc get ip -n redhat-ods-operator -``` -``` -NAMESPACE NAME CSV APPROVAL APPROVED -redhat-ods-operator install-kmh8w rhods-operator.2.16.0 Manual false -``` -Approve install plan replacing the generated plan name below with the actual -value: -```sh -oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-kmh8w -``` -Create DSC Initialization: -```sh -oc apply -f setup.RHOAI-v2.16/mlbatch-dsci.yaml -``` -Create Data Science Cluster: -```sh -oc apply -f setup.RHOAI-v2.16/mlbatch-dsc.yaml -``` -The provided DSCI and DSC are intended to install a minimal set of Red Hat OpenShift -AI managed components: `codeflare`, `kueue`, `ray`, and `trainingoperator`. The -remaining components such as `dashboard` can be optionally enabled. - -The configuration of the managed components differs from the default Red Hat OpenShift -AI configuration as follows: -- Kubeflow Training Operator: - - `gang-scheduler-name` is set to `scheduler-plugins-scheduler`, -- Kueue: - - `manageJobsWithoutQueueName` is enabled, - - `batch/job` integration is disabled, - - `waitForPodsReady` is disabled, - - `LendingLimit` feature gate is enabled, - - `fairSharing` is enabled, - - `enableClusterQueueResources` metrics is enabled, -- Codeflare operator: - - the AppWrapper controller is enabled and configured as follows: - - `userRBACAdmissionCheck` is disabled, - - `schedulerName` is set to `scheduler-plugins-scheduler`, - - `queueName` is set to `default-queue`, - - `slackQueueName` is set to `slack-cluster-queue` -- pod priorities, resource requests and limits have been adjusted. 
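Before moving on, it can help to confirm that the operator actually reconciled this configuration. A minimal check, reusing only resource names already defined above (`mlbatch-dsci`, `mlbatch-dsc`, and the `redhat-ods-applications` namespace), might look like:
```sh
# Confirm the DSCInitialization and DataScienceCluster were reconciled.
oc get dsci mlbatch-dsci
oc get dsc mlbatch-dsc

# The managed components (codeflare, kueue, ray, trainingoperator) run in
# the applications namespace declared by the DSCI once the DSC is ready.
oc get pods -n redhat-ods-applications
```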
- - - -## Autopilot - -Helm charts values and how-to for customization can be found [in the official documentation](https://github.com/IBM/autopilot/blob/main/helm-charts/autopilot/README.md). As-is, Autopilot will run on GPU nodes. - -- Add the Autopilot Helm repository - -```bash -helm repo add autopilot https://ibm.github.io/autopilot/ -helm repo update -``` - -- Install the chart (idempotent command). The config file is for customizing the helm values and it is optional. - -```bash -helm upgrade autopilot autopilot/autopilot --install --namespace=autopilot --create-namespace -f your-config.yml -``` - -### Enabling Prometheus metrics - -After completing the installation, manually label the namespace to enable metrics to be scraped by Prometheus with the following command: - -```bash -oc label ns autopilot openshift.io/cluster-monitoring=true -``` - -The `ServiceMonitor` labeling is not required. - -## Kueue Configuration - -Create Kueue's default flavor: -```sh -oc apply -f setup.RHOAI-v2.16/default-flavor.yaml -``` - -## Cluster Role - -Create `mlbatch-edit` role: -```sh -oc apply -f setup.RHOAI-v2.16/mlbatch-edit-role.yaml -``` - -## Slack Cluster Queue - -Create the designated slack `ClusterQueue` which will be used to automate -minor adjustments to cluster capacity caused by node failures and -scheduler maintanence. -```sh -oc apply -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: slack-cluster-queue -spec: - namespaceSelector: {} - cohort: default-cohort - preemption: - withinClusterQueue: LowerOrNewerEqualPriority - reclaimWithinCohort: Any - borrowWithinCohort: - policy: Never - resourceGroups: - - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"] - flavors: - - name: default-flavor - resources: - - name: "cpu" - nominalQuota: 8000m - - name: "memory" - nominalQuota: 128Gi - - name: "nvidia.com/gpu" - nominalQuota: 8 - - name: "nvidia.com/roce_gdr" - nominalQuota: 1 - - name: "pods" - nominalQuota: 100 -EOF -``` -Edit the above quantities to adjust the quota to the desired -values. Pod counts are optional and can be omitted from the list of -covered resources. The `lendingLimit` for each resource will be -dynamically adjusted by the MLBatch system to reflect reduced cluster -capacity. See [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md) for a -detailed discussion of the role of the slack `ClusterQueue`. diff --git a/setup.RHOAI-v2.16/TEAM-SETUP.md b/setup.RHOAI-v2.16/TEAM-SETUP.md deleted file mode 100644 index 85c9429..0000000 --- a/setup.RHOAI-v2.16/TEAM-SETUP.md +++ /dev/null @@ -1,91 +0,0 @@ -# Team Setup - -A *team* in MLBatch is a group of users that share a resource quota. - -Before setting up your teams and quotas, please read [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md) -for a discussion of our recommended best practices. - - -Setting up a new team requires the cluster admin to create a project, -a user group, a quota, a queue, and the required role bindings as described below. 
- -Create project: -```sh -oc new-project team1 -``` -Create user group: -```sh -oc adm groups new team1-edit-group -``` -Add users to group for example: -```sh -oc adm groups add-users team1-edit-group user1 -``` -Bind cluster role to group in namespace: -```sh -oc adm policy add-role-to-group mlbatch-edit team1-edit-group --role-namespace="" --namespace team1 -``` - -Specify the intended quota for the namespace by creating a `ClusterQueue`: -```sh -oc apply -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: team1-cluster-queue -spec: - namespaceSelector: {} - cohort: default-cohort - preemption: - withinClusterQueue: LowerOrNewerEqualPriority - reclaimWithinCohort: Any - borrowWithinCohort: - policy: Never - resourceGroups: - - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"] - flavors: - - name: default-flavor - resources: - - name: "cpu" - nominalQuota: 8000m - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "memory" - nominalQuota: 128Gi - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "nvidia.com/gpu" - nominalQuota: 16 - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "nvidia.com/roce_gdr" - nominalQuota: 4 - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "pods" - nominalQuota: 100 - # borrowingLimit: 0 - # lendingLimit: 0 -EOF -``` -Edit the above quantities to adjust the quota to the desired values. Pod counts -are optional and can be omitted from the list of covered resources. - -Uncomment all `borrowingLimit` lines to prevent this namespace from borrowing -quota from other namespaces. Uncomment all `lendingLimit` lines to prevent other -namespaces from borrowing quota from this namespace. - -Create a `LocalQueue` to bind the `ClusterQueue` to the namespace: -```sh -oc apply -n team1 -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: LocalQueue -metadata: - name: default-queue -spec: - clusterQueue: team1-cluster-queue -EOF -``` -We recommend naming the local queue `default-queue` as `AppWrappers` will -default to this queue name. 
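To sanity-check a newly created team namespace, one option is to submit a small `AppWrapper` to the `default-queue`. The sketch below is modeled on the `short.yaml` sample job removed earlier in this change; the name, sleep command, and cpu/memory figures are placeholders, not a recommended workload.
```sh
# A throwaway AppWrapper to verify quota admission in team1.
oc apply -n team1 -f- << EOF
apiVersion: workload.codeflare.dev/v1beta2
kind: AppWrapper
metadata:
  name: team1-smoke-test
  labels:
    kueue.x-k8s.io/queue-name: default-queue
spec:
  components:
  - template:
      apiVersion: batch/v1
      kind: Job
      metadata:
        name: team1-smoke-test
      spec:
        template:
          spec:
            restartPolicy: Never
            terminationGracePeriodSeconds: 0
            containers:
            - name: busybox
              image: quay.io/project-codeflare/busybox:1.36
              command: ["sh", "-c", "sleep 30"]
              resources:
                limits:
                  cpu: 100m
                  memory: 64Mi
EOF
```
Admission and completion can then be observed with `oc get appwrappers,workloads -n team1`.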
- diff --git a/setup.RHOAI-v2.16/UNINSTALL.md b/setup.RHOAI-v2.16/UNINSTALL.md deleted file mode 100644 index 776045d..0000000 --- a/setup.RHOAI-v2.16/UNINSTALL.md +++ /dev/null @@ -1,23 +0,0 @@ -# Uninstall - -***First, remove all team projects and corresponding cluster queues.*** - -Then to uninstall the MLBatch controllers and reclaim the corresponding -namespaces, run: -```sh -# OpenShift AI uninstall -oc delete dsc mlbatch-dsc -oc delete dsci mlbatch-dsci -oc delete subscription -n redhat-ods-operator rhods-operator -oc delete csv -n redhat-ods-operator -l operators.coreos.com/rhods-operator.redhat-ods-operator -oc delete crd featuretrackers.features.opendatahub.io \ - dscinitializations.dscinitialization.opendatahub.io \ - datascienceclusters.datasciencecluster.opendatahub.io -oc delete operators rhods-operator.redhat-ods-operator -oc delete operatorgroup -n redhat-ods-operator rhods-operator -oc delete namespace redhat-ods-applications redhat-ods-monitoring redhat-ods-operator - -# Coscheduler uninstall -helm uninstall -n scheduler-plugins scheduler-plugins -oc delete namespace scheduler-plugins -``` diff --git a/setup.RHOAI-v2.16/UPGRADE-FAST.md b/setup.RHOAI-v2.16/UPGRADE-FAST.md deleted file mode 100644 index eeb9bb3..0000000 --- a/setup.RHOAI-v2.16/UPGRADE-FAST.md +++ /dev/null @@ -1,34 +0,0 @@ -# Upgrading from RHOAI 2.15 - -These instructions assume you installed and configured RHOAI 2.15 following -the MLBatch [install instructions for RHOAI-v2.15](../setup.RHOAI-v2.15/CLUSTER-SETUP.md) -or the [upgrade instructions for RHOAI-V2.15](../setup.RHOAI-v2.15/UPGRADE.md) - -Your subscription will have automatically created an unapproved -install plan to upgrade to RHOAI 2.16. - -Before beginning, verify that the expected install plan exists: -```sh -oc get ip -n redhat-ods-operator -``` -Typical output would be: -```sh -NAME CSV APPROVAL APPROVED -install-kpzzl rhods-operator.2.16.0 Manual false -install-nqrbp rhods-operator.2.15.0 Manual true -``` - -Assuming the install plan exists you can begin the upgrade process. - -First, update the MLBatch modifications to the default RHOAI configuration maps. -```sh -oc apply -f setup.RHOAI-v2.16/mlbatch-upgrade-configmaps.yaml -``` - -There are no MLBatch modifications to the default RHOAI configuration maps -beyond those already made in previous installs. Therefore, you can simply -approve the install plan replacing the example plan name below with the actual -value on your cluster: -```sh -oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-kpzzl -``` diff --git a/setup.RHOAI-v2.16/UPGRADE-STABLE.md b/setup.RHOAI-v2.16/UPGRADE-STABLE.md deleted file mode 100644 index e17651e..0000000 --- a/setup.RHOAI-v2.16/UPGRADE-STABLE.md +++ /dev/null @@ -1,31 +0,0 @@ -# Upgrading from RHOAI 2.13 - -These instructions assume you installed and configured RHOAI 2.13 following -the MLBatch [install instructions for RHOAI-v2.13](../setup.RHOAI-v2.13/CLUSTER-SETUP.md). - -Your subscription will have automatically created an unapproved -install plan to upgrade to RHOAI 2.16. - -Before beginning, verify that the expected install plan exists: -```sh -oc get ip -n redhat-ods-operator -``` -Typical output would be: -```sh -NAME CSV APPROVAL APPROVED -install-kpzzl rhods-operator.2.16.0 Manual false -install-nqrbp rhods-operator.2.13.0 Manual true -``` - -Assuming the install plan exists you can begin the upgrade process. - -First, update the MLBatch modifications to the default RHOAI configuration maps. 
-```sh -oc apply -f setup.RHOAI-v2.16/mlbatch-upgrade-configmaps.yaml -``` - -Second, approve the install plan replacing the example plan name below with the actual -value on your cluster: -```sh -oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-kpzzl -``` diff --git a/setup.RHOAI-v2.16/default-flavor.yaml b/setup.RHOAI-v2.16/default-flavor.yaml deleted file mode 100644 index 6cbccf3..0000000 --- a/setup.RHOAI-v2.16/default-flavor.yaml +++ /dev/null @@ -1,4 +0,0 @@ -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ResourceFlavor -metadata: - name: default-flavor diff --git a/setup.RHOAI-v2.16/mlbatch-dsc.yaml b/setup.RHOAI-v2.16/mlbatch-dsc.yaml deleted file mode 100644 index 66336bc..0000000 --- a/setup.RHOAI-v2.16/mlbatch-dsc.yaml +++ /dev/null @@ -1,32 +0,0 @@ -apiVersion: datasciencecluster.opendatahub.io/v1 -kind: DataScienceCluster -metadata: - name: mlbatch-dsc -spec: - components: - codeflare: - managementState: Managed - dashboard: - managementState: Removed - datasciencepipelines: - managementState: Removed - kserve: - managementState: Removed - serving: - ingressGateway: - certificate: - type: SelfSigned - managementState: Removed - name: knative-serving - kueue: - managementState: Managed - modelmeshserving: - managementState: Removed - ray: - managementState: Managed - trainingoperator: - managementState: Managed - trustyai: - managementState: Removed - workbenches: - managementState: Removed diff --git a/setup.RHOAI-v2.16/mlbatch-dsci.yaml b/setup.RHOAI-v2.16/mlbatch-dsci.yaml deleted file mode 100644 index 77785c3..0000000 --- a/setup.RHOAI-v2.16/mlbatch-dsci.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: dscinitialization.opendatahub.io/v1 -kind: DSCInitialization -metadata: - name: mlbatch-dsci -spec: - applicationsNamespace: redhat-ods-applications - monitoring: - managementState: Managed - namespace: redhat-ods-monitoring - serviceMesh: - managementState: Removed - trustedCABundle: - customCABundle: "" - managementState: Managed diff --git a/setup.RHOAI-v2.16/mlbatch-edit-role.yaml b/setup.RHOAI-v2.16/mlbatch-edit-role.yaml deleted file mode 100644 index fd86cc6..0000000 --- a/setup.RHOAI-v2.16/mlbatch-edit-role.yaml +++ /dev/null @@ -1,151 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: mlbatch-edit -rules: -- apiGroups: - - "" - resources: - - pods - verbs: - - delete - - get - - list - - watch -- apiGroups: - - apps - resources: - - deployments - - statefulsets - verbs: - - delete - - get - - list - - watch -- apiGroups: - - "" - resources: - - services - - secrets - - configmaps - - persistentvolumeclaims - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - kueue.x-k8s.io - resources: - - "*" - verbs: - - get - - list - - watch -- apiGroups: - - kubeflow.org - resources: - - pytorchjobs - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - ray.io - resources: - - rayjobs - - rayclusters - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - batch - resources: - - jobs - verbs: - - delete - - get - - list - - watch -- apiGroups: - - workload.codeflare.dev - resources: - - appwrappers - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - scheduling.k8s.io - resources: - - priorityclasses - verbs: - - get - - list - - watch -- apiGroups: - - scheduling.x-k8s.io - resources: - - podgroups - verbs: - - create - - delete - - 
get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - events - verbs: - - get - - list - - watch -- apiGroups: - - "" - resources: - - namespaces - - pods/log - verbs: - - get -- apiGroups: - - "" - resources: - - pods/exec - - pods/portforward - verbs: - - create -- apiGroups: - - route.openshift.io - resources: - - routes - verbs: - - get - - list - - watch - - delete -- apiGroups: - - "" - - project.openshift.io - resources: - - projects - verbs: - - get diff --git a/setup.RHOAI-v2.16/mlbatch-priorities.yaml b/setup.RHOAI-v2.16/mlbatch-priorities.yaml deleted file mode 100644 index 77c8f3b..0000000 --- a/setup.RHOAI-v2.16/mlbatch-priorities.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: low-priority -value: 1 -preemptionPolicy: PreemptLowerPriority -globalDefault: false -description: "This is the priority class for all lower priority jobs." ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: default-priority -value: 5 -preemptionPolicy: PreemptLowerPriority -globalDefault: true -description: "This is the priority class for all jobs (default priority)." ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: high-priority -value: 10 -preemptionPolicy: PreemptLowerPriority -globalDefault: false -description: "This is the priority class defined for highly important jobs that would evict lower and default priority jobs." diff --git a/setup.RHOAI-v2.16/mlbatch-subscription.yaml b/setup.RHOAI-v2.16/mlbatch-subscription.yaml deleted file mode 100644 index 0f5eec1..0000000 --- a/setup.RHOAI-v2.16/mlbatch-subscription.yaml +++ /dev/null @@ -1,310 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: redhat-ods-operator ---- -apiVersion: v1 -kind: Namespace -metadata: - name: redhat-ods-applications ---- -apiVersion: operators.coreos.com/v1 -kind: OperatorGroup -metadata: - name: rhods-operator - namespace: redhat-ods-operator ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: mlbatch-codeflare - namespace: redhat-ods-operator -data: - manager.yaml: | - apiVersion: apps/v1 - kind: Deployment - metadata: - name: manager - namespace: system - spec: - selector: - matchLabels: - app.kubernetes.io/name: codeflare-operator - app.kubernetes.io/part-of: codeflare - replicas: 1 - template: - metadata: - annotations: - kubectl.kubernetes.io/default-container: manager - labels: - app.kubernetes.io/name: codeflare-operator - app.kubernetes.io/part-of: codeflare - spec: - priorityClassName: system-node-critical - securityContext: - runAsNonRoot: true - # TODO(user): For common cases that do not require escalating privileges - # it is recommended to ensure that all your Pods/Containers are restrictive. - # More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted - # Please uncomment the following code if your project does NOT have to work on old Kubernetes - # versions < 1.20 or on vendors versions which do NOT support this field by default (i.e. Openshift < 4.11 ). 
- # seccompProfile: - # type: RuntimeDefault - containers: - - command: - - /manager - image: $(codeflare_operator_controller_image) - imagePullPolicy: Always - name: manager - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - "ALL" - env: - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - ports: - - containerPort: 8080 - protocol: TCP - name: metrics - livenessProbe: - httpGet: - path: /healthz - port: 8081 - initialDelaySeconds: 15 - periodSeconds: 20 - readinessProbe: - httpGet: - path: /readyz - port: 8081 - initialDelaySeconds: 5 - periodSeconds: 10 - resources: - limits: - cpu: "1" - memory: 1Gi - requests: - cpu: "1" - memory: 1Gi - serviceAccountName: controller-manager - terminationGracePeriodSeconds: 10 ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: codeflare-operator-config - namespace: redhat-ods-applications -data: - config.yaml: | - appwrapper: - enabled: true - Config: - autopilot: - injectAntiAffinities: true - monitorNodes: true - resourceTaints: - nvidia.com/gpu: - - key: autopilot.ibm.com/gpuhealth - value: ERR - effect: NoSchedule - - key: autopilot.ibm.com/gpuhealth - value: TESTING - effect: NoSchedule - - key: autopilot.ibm.com/gpuhealth - value: EVICT - effect: NoExecute - defaultQueueName: default-queue - enableKueueIntegrations: true - kueueJobReconciller: - manageJobsWithoutQueueName: true - waitForPodsReady: - blockAdmission: false - enable: false - schedulerName: scheduler-plugins-scheduler - slackQueueName: slack-cluster-queue - userRBACAdmissionCheck: false ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: mlbatch-kuberay - namespace: redhat-ods-operator -data: - kuberay-operator-image-patch.yaml: | - apiVersion: apps/v1 - kind: Deployment - metadata: - name: kuberay-operator - spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: kuberay-operator - image: $(image) ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: mlbatch-kueue - namespace: redhat-ods-operator -data: - controller_manager_config.yaml: | - apiVersion: config.kueue.x-k8s.io/v1beta1 - kind: Configuration - health: - healthProbeBindAddress: :8081 - metrics: - bindAddress: :8080 - enableClusterQueueResources: true - webhook: - port: 9443 - leaderElection: - leaderElect: true - resourceName: c1f6bfd2.kueue.x-k8s.io - controller: - groupKindConcurrency: - Job.batch: 5 - Pod: 5 - Workload.kueue.x-k8s.io: 5 - LocalQueue.kueue.x-k8s.io: 1 - ClusterQueue.kueue.x-k8s.io: 1 - ResourceFlavor.kueue.x-k8s.io: 1 - clientConnection: - qps: 50 - burst: 100 - #pprofBindAddress: :8082 - waitForPodsReady: - enable: false - blockAdmission: false - manageJobsWithoutQueueName: true - #internalCertManagement: - # enable: false - # webhookServiceName: "" - # webhookSecretName: "" - integrations: - frameworks: - # - "batch/job" - - "kubeflow.org/mpijob" - - "ray.io/rayjob" - - "ray.io/raycluster" - - "jobset.x-k8s.io/jobset" - - "kubeflow.org/mxjob" - - "kubeflow.org/paddlejob" - - "kubeflow.org/pytorchjob" - - "kubeflow.org/tfjob" - - "kubeflow.org/xgboostjob" - # - "pod" - externalFrameworks: - - "AppWrapper.v1beta2.workload.codeflare.dev" - # podOptions: - # namespaceSelector: - # matchExpressions: - # - key: kubernetes.io/metadata.name - # operator: NotIn - # values: [ kube-system, kueue-system ] - fairSharing: - enable: true - preemptionStrategies: [LessThanOrEqualToFinalShare, LessThanInitialShare] - manager_config_patch.yaml: | - apiVersion: apps/v1 - kind: Deployment - metadata: - name: 
controller-manager - namespace: system - spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: manager - image: $(image) - args: - - "--config=/controller_manager_config.yaml" - - "--zap-log-level=2" - - "--feature-gates=LendingLimit=true" - volumeMounts: - - name: manager-config - mountPath: /controller_manager_config.yaml - subPath: controller_manager_config.yaml - volumes: - - name: manager-config - configMap: - name: manager-config ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: mlbatch-training-operator - namespace: redhat-ods-operator -data: - manager_config_patch.yaml: | - apiVersion: apps/v1 - kind: Deployment - metadata: - name: training-operator - spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: training-operator - image: $(image) - args: - - "--zap-log-level=2" - - "--gang-scheduler-name=scheduler-plugins-scheduler" - resources: - requests: - cpu: 100m - memory: 100Mi - limits: - cpu: 500m - memory: 1000Mi ---- -apiVersion: operators.coreos.com/v1alpha1 -kind: Subscription -metadata: - name: rhods-operator - namespace: redhat-ods-operator -spec: - channel: stable - installPlanApproval: Manual - name: rhods-operator - source: redhat-operators - sourceNamespace: openshift-marketplace - startingCSV: rhods-operator.2.16.0 - config: - env: - - name: "DISABLE_DSC_CONFIG" - volumeMounts: - - name: mlbatch-codeflare - mountPath: /opt/manifests/codeflare/manager/manager.yaml - subPath: manager.yaml - - name: mlbatch-kuberay - mountPath: /opt/manifests/ray/openshift/kuberay-operator-image-patch.yaml - subPath: kuberay-operator-image-patch.yaml - - name: mlbatch-kueue - mountPath: /opt/manifests/kueue/components/manager/controller_manager_config.yaml - subPath: controller_manager_config.yaml - - name: mlbatch-kueue - mountPath: /opt/manifests/kueue/rhoai/manager_config_patch.yaml - subPath: manager_config_patch.yaml - - name: mlbatch-training-operator - mountPath: /opt/manifests/trainingoperator/rhoai/manager_config_patch.yaml - subPath: manager_config_patch.yaml - volumes: - - name: mlbatch-codeflare - configMap: - name: mlbatch-codeflare - - name: mlbatch-kuberay - configMap: - name: mlbatch-kuberay - - name: mlbatch-kueue - configMap: - name: mlbatch-kueue - - name: mlbatch-training-operator - configMap: - name: mlbatch-training-operator diff --git a/setup.RHOAI-v2.16/mlbatch-upgrade-configmaps.yaml b/setup.RHOAI-v2.16/mlbatch-upgrade-configmaps.yaml deleted file mode 100644 index c111aa4..0000000 --- a/setup.RHOAI-v2.16/mlbatch-upgrade-configmaps.yaml +++ /dev/null @@ -1,125 +0,0 @@ ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: codeflare-operator-config - namespace: redhat-ods-applications -data: - config.yaml: | - appwrapper: - enabled: true - Config: - autopilot: - injectAntiAffinities: true - monitorNodes: true - resourceTaints: - nvidia.com/gpu: - - key: autopilot.ibm.com/gpuhealth - value: ERR - effect: NoSchedule - - key: autopilot.ibm.com/gpuhealth - value: TESTING - effect: NoSchedule - - key: autopilot.ibm.com/gpuhealth - value: EVICT - effect: NoExecute - defaultQueueName: default-queue - enableKueueIntegrations: true - kueueJobReconciller: - manageJobsWithoutQueueName: true - waitForPodsReady: - blockAdmission: false - enable: false - schedulerName: scheduler-plugins-scheduler - slackQueueName: slack-cluster-queue - userRBACAdmissionCheck: false ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: mlbatch-kueue - namespace: redhat-ods-operator -data: - 
controller_manager_config.yaml: | - apiVersion: config.kueue.x-k8s.io/v1beta1 - kind: Configuration - health: - healthProbeBindAddress: :8081 - metrics: - bindAddress: :8080 - enableClusterQueueResources: true - webhook: - port: 9443 - leaderElection: - leaderElect: true - resourceName: c1f6bfd2.kueue.x-k8s.io - controller: - groupKindConcurrency: - Job.batch: 5 - Pod: 5 - Workload.kueue.x-k8s.io: 5 - LocalQueue.kueue.x-k8s.io: 1 - ClusterQueue.kueue.x-k8s.io: 1 - ResourceFlavor.kueue.x-k8s.io: 1 - clientConnection: - qps: 50 - burst: 100 - #pprofBindAddress: :8082 - waitForPodsReady: - enable: false - blockAdmission: false - manageJobsWithoutQueueName: true - #internalCertManagement: - # enable: false - # webhookServiceName: "" - # webhookSecretName: "" - integrations: - frameworks: - # - "batch/job" - - "kubeflow.org/mpijob" - - "ray.io/rayjob" - - "ray.io/raycluster" - - "jobset.x-k8s.io/jobset" - - "kubeflow.org/mxjob" - - "kubeflow.org/paddlejob" - - "kubeflow.org/pytorchjob" - - "kubeflow.org/tfjob" - - "kubeflow.org/xgboostjob" - # - "pod" - externalFrameworks: - - "AppWrapper.v1beta2.workload.codeflare.dev" - # podOptions: - # namespaceSelector: - # matchExpressions: - # - key: kubernetes.io/metadata.name - # operator: NotIn - # values: [ kube-system, kueue-system ] - fairSharing: - enable: true - preemptionStrategies: [LessThanOrEqualToFinalShare, LessThanInitialShare] - manager_config_patch.yaml: | - apiVersion: apps/v1 - kind: Deployment - metadata: - name: controller-manager - namespace: system - spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: manager - image: $(image) - args: - - "--config=/controller_manager_config.yaml" - - "--zap-log-level=2" - - "--feature-gates=LendingLimit=true" - volumeMounts: - - name: manager-config - mountPath: /controller_manager_config.yaml - subPath: controller_manager_config.yaml - volumes: - - name: manager-config - configMap: - name: manager-config ---- diff --git a/setup.RHOAI-v2.16/scheduler-priority-patch.yaml b/setup.RHOAI-v2.16/scheduler-priority-patch.yaml deleted file mode 100644 index 278802f..0000000 --- a/setup.RHOAI-v2.16/scheduler-priority-patch.yaml +++ /dev/null @@ -1,3 +0,0 @@ -- op: add - path: /spec/template/spec/priorityClassName - value: system-node-critical diff --git a/setup.RHOAI-v2.19/CLUSTER-SETUP.md b/setup.RHOAI-v2.19/CLUSTER-SETUP.md deleted file mode 100644 index 87046a6..0000000 --- a/setup.RHOAI-v2.19/CLUSTER-SETUP.md +++ /dev/null @@ -1,171 +0,0 @@ -# Cluster Setup - -The cluster setup installs Red Hat OpenShift AI and configures Scheduler Plugins, Kueue, -cluster roles, and priority classes. - -## Priorities - -Create `default-priority`, `high-priority`, and `low-priority` priority classes: -```sh -oc apply -f setup.RHOAI-v2.19/mlbatch-priorities.yaml -``` - -## Scheduler Configuration - -MLBatch configures Kubernetes scheduling to accomplish two objectives: -+ Obtaining gang (all or nothing) scheduling for multi-Pod workloads. -+ Packing Pods whose GPU request is less than the number of GPUs on a Node to - maximize the number of Nodes available for Pods that request all the GPUs on a Node. - -This is done by installing the Coscheduling out-of-tree scheduler plugin and configuring -the default NodeResourcesFit scheduler plugin to pack in the GPU dimension. 
- - -```sh -helm install scheduler-plugins --namespace scheduler-plugins --create-namespace \ - scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ \ - --set-json pluginConfig='[{"args":{"scoringStrategy":{"resources":[{"name":"nvidia.com/gpu","weight":1}],"requestedToCapacityRatio":{"shape":[{"utilization":0,"score":0},{"utilization":100,"score":10}]},"type":"RequestedToCapacityRatio"}},"name":"NodeResourcesFit"},{"args":{"permitWaitingTimeSeconds":300},"name":"Coscheduling"}]' -``` -Patch scheduler-plugins pod priorities: -```sh -oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.19/scheduler-priority-patch.yaml scheduler-plugins-controller -oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.19/scheduler-priority-patch.yaml scheduler-plugins-scheduler -``` - - - -## Red Hat OpenShift AI - -Create the Red Hat OpenShift AI subscription: -```sh -oc apply -f setup.RHOAI-v2.19/mlbatch-subscription.yaml -```` -Identify install plan: -```sh -oc get ip -n redhat-ods-operator -``` -``` -NAMESPACE NAME CSV APPROVAL APPROVED -redhat-ods-operator install-kmh8w rhods-operator.2.16.0 Manual false -``` -Approve install plan replacing the generated plan name below with the actual -value: -```sh -oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-kmh8w -``` -Create DSC Initialization: -```sh -oc apply -f setup.RHOAI-v2.19/mlbatch-dsci.yaml -``` -Create Data Science Cluster: -```sh -oc apply -f setup.RHOAI-v2.19/mlbatch-dsc.yaml -``` -The provided DSCI and DSC are intended to install a minimal set of Red Hat OpenShift -AI managed components: `codeflare`, `kueue`, `ray`, and `trainingoperator`. The -remaining components such as `dashboard` can be optionally enabled. - -The configuration of the managed components differs from the default Red Hat OpenShift -AI configuration as follows: -- Kubeflow Training Operator: - - `gang-scheduler-name` is set to `scheduler-plugins-scheduler`, -- Kueue: - - `manageJobsWithoutQueueName` is enabled, - - `batch/job` integration is disabled, - - `waitForPodsReady` is disabled, - - `LendingLimit` feature gate is enabled, - - `fairSharing` is enabled, - - `enableClusterQueueResources` metrics is enabled, -- Codeflare operator: - - the AppWrapper controller is enabled and configured as follows: - - `userRBACAdmissionCheck` is disabled, - - `schedulerName` is set to `scheduler-plugins-scheduler`, - - `queueName` is set to `default-queue`, - - `slackQueueName` is set to `slack-cluster-queue` -- pod priorities, resource requests and limits have been adjusted. - - - -## Autopilot - -Helm charts values and how-to for customization can be found [in the official documentation](https://github.com/IBM/autopilot/blob/main/helm-charts/autopilot/README.md). As-is, Autopilot will run on GPU nodes. - -- Add the Autopilot Helm repository - -```bash -helm repo add autopilot https://ibm.github.io/autopilot/ -helm repo update -``` - -- Install the chart (idempotent command). The config file is for customizing the helm values and it is optional. - -```bash -helm upgrade autopilot autopilot/autopilot --install --namespace=autopilot --create-namespace -f your-config.yml -``` - -### Enabling Prometheus metrics - -After completing the installation, manually label the namespace to enable metrics to be scraped by Prometheus with the following command: - -```bash -oc label ns autopilot openshift.io/cluster-monitoring=true -``` - -The `ServiceMonitor` labeling is not required. 
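To verify the Autopilot deployment, the health-check pods and per-node results can be inspected. The `autopilot.ibm.com/gpuhealth` node label below is an assumption based on the taint keys used elsewhere in this setup, so treat this as a sketch rather than a guaranteed interface.
```bash
# Autopilot's health-check pods should be running on the GPU nodes.
oc get pods -n autopilot -o wide

# If per-node GPU health is surfaced via the autopilot.ibm.com/gpuhealth
# label (the same key as the taints configured for the AppWrapper
# controller), it can be listed per node like this:
oc get nodes -L autopilot.ibm.com/gpuhealth
```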
- -## Kueue Configuration - -Create Kueue's default flavor: -```sh -oc apply -f setup.RHOAI-v2.19/default-flavor.yaml -``` - -## Cluster Role - -Create `mlbatch-edit` role: -```sh -oc apply -f setup.RHOAI-v2.19/mlbatch-edit-role.yaml -``` - -## Slack Cluster Queue - -Create the designated slack `ClusterQueue` which will be used to automate -minor adjustments to cluster capacity caused by node failures and -scheduler maintanence. -```sh -oc apply -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: slack-cluster-queue -spec: - namespaceSelector: {} - cohort: default-cohort - preemption: - withinClusterQueue: LowerOrNewerEqualPriority - reclaimWithinCohort: Any - borrowWithinCohort: - policy: Never - resourceGroups: - - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"] - flavors: - - name: default-flavor - resources: - - name: "cpu" - nominalQuota: 8000m - - name: "memory" - nominalQuota: 128Gi - - name: "nvidia.com/gpu" - nominalQuota: 8 - - name: "nvidia.com/roce_gdr" - nominalQuota: 1 - - name: "pods" - nominalQuota: 100 -EOF -``` -Edit the above quantities to adjust the quota to the desired -values. Pod counts are optional and can be omitted from the list of -covered resources. The `lendingLimit` for each resource will be -dynamically adjusted by the MLBatch system to reflect reduced cluster -capacity. See [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md) for a -detailed discussion of the role of the slack `ClusterQueue`. diff --git a/setup.RHOAI-v2.19/TEAM-SETUP.md b/setup.RHOAI-v2.19/TEAM-SETUP.md deleted file mode 100644 index 85c9429..0000000 --- a/setup.RHOAI-v2.19/TEAM-SETUP.md +++ /dev/null @@ -1,91 +0,0 @@ -# Team Setup - -A *team* in MLBatch is a group of users that share a resource quota. - -Before setting up your teams and quotas, please read [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md) -for a discussion of our recommended best practices. - - -Setting up a new team requires the cluster admin to create a project, -a user group, a quota, a queue, and the required role bindings as described below. - -Create project: -```sh -oc new-project team1 -``` -Create user group: -```sh -oc adm groups new team1-edit-group -``` -Add users to group for example: -```sh -oc adm groups add-users team1-edit-group user1 -``` -Bind cluster role to group in namespace: -```sh -oc adm policy add-role-to-group mlbatch-edit team1-edit-group --role-namespace="" --namespace team1 -``` - -Specify the intended quota for the namespace by creating a `ClusterQueue`: -```sh -oc apply -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: team1-cluster-queue -spec: - namespaceSelector: {} - cohort: default-cohort - preemption: - withinClusterQueue: LowerOrNewerEqualPriority - reclaimWithinCohort: Any - borrowWithinCohort: - policy: Never - resourceGroups: - - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"] - flavors: - - name: default-flavor - resources: - - name: "cpu" - nominalQuota: 8000m - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "memory" - nominalQuota: 128Gi - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "nvidia.com/gpu" - nominalQuota: 16 - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "nvidia.com/roce_gdr" - nominalQuota: 4 - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "pods" - nominalQuota: 100 - # borrowingLimit: 0 - # lendingLimit: 0 -EOF -``` -Edit the above quantities to adjust the quota to the desired values. 
Pod counts -are optional and can be omitted from the list of covered resources. - -Uncomment all `borrowingLimit` lines to prevent this namespace from borrowing -quota from other namespaces. Uncomment all `lendingLimit` lines to prevent other -namespaces from borrowing quota from this namespace. - -Create a `LocalQueue` to bind the `ClusterQueue` to the namespace: -```sh -oc apply -n team1 -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: LocalQueue -metadata: - name: default-queue -spec: - clusterQueue: team1-cluster-queue -EOF -``` -We recommend naming the local queue `default-queue` as `AppWrappers` will -default to this queue name. - diff --git a/setup.RHOAI-v2.19/UNINSTALL.md b/setup.RHOAI-v2.19/UNINSTALL.md deleted file mode 100644 index 776045d..0000000 --- a/setup.RHOAI-v2.19/UNINSTALL.md +++ /dev/null @@ -1,23 +0,0 @@ -# Uninstall - -***First, remove all team projects and corresponding cluster queues.*** - -Then to uninstall the MLBatch controllers and reclaim the corresponding -namespaces, run: -```sh -# OpenShift AI uninstall -oc delete dsc mlbatch-dsc -oc delete dsci mlbatch-dsci -oc delete subscription -n redhat-ods-operator rhods-operator -oc delete csv -n redhat-ods-operator -l operators.coreos.com/rhods-operator.redhat-ods-operator -oc delete crd featuretrackers.features.opendatahub.io \ - dscinitializations.dscinitialization.opendatahub.io \ - datascienceclusters.datasciencecluster.opendatahub.io -oc delete operators rhods-operator.redhat-ods-operator -oc delete operatorgroup -n redhat-ods-operator rhods-operator -oc delete namespace redhat-ods-applications redhat-ods-monitoring redhat-ods-operator - -# Coscheduler uninstall -helm uninstall -n scheduler-plugins scheduler-plugins -oc delete namespace scheduler-plugins -``` diff --git a/setup.RHOAI-v2.19/UPGRADE-FAST.md b/setup.RHOAI-v2.19/UPGRADE-FAST.md deleted file mode 100644 index 06db6ab..0000000 --- a/setup.RHOAI-v2.19/UPGRADE-FAST.md +++ /dev/null @@ -1,29 +0,0 @@ -# Upgrading from RHOAI 2.19 - -These instructions assume you installed and configured RHOAI 2.18 following -the MLBatch [install instructions for RHOAI-v2.18](../setup.RHOAI-v2.18/CLUSTER-SETUP.md) -or the [upgrade instructions for RHOAI-V2.18](../setup.RHOAI-v2.18/UPGRADE.md) - -Your subscription will have automatically created an unapproved -install plan to upgrade to RHOAI 2.19. - -Before beginning, verify that the expected install plan exists: -```sh -oc get ip -n redhat-ods-operator -``` -Typical output would be: -```sh -NAME CSV APPROVAL APPROVED -install-kpzzl rhods-operator.2.18.0 Manual false -install-nqrbp rhods-operator.2.19.0 Manual true -``` - -Assuming the install plan exists you can begin the upgrade process. - -There are no MLBatch modifications to the default RHOAI configuration maps -beyond those already made in previous installs. 
Therefore, you can simply -approve the install plan replacing the example plan name below with the actual -value on your cluster: -```sh -oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-kpzzl -``` diff --git a/setup.RHOAI-v2.19/UPGRADE-STABLE.md b/setup.RHOAI-v2.19/UPGRADE-STABLE.md deleted file mode 100644 index 10a4cf5..0000000 --- a/setup.RHOAI-v2.19/UPGRADE-STABLE.md +++ /dev/null @@ -1,30 +0,0 @@ -# Upgrading from RHOAI 2.16 - -These instructions assume you installed and configured RHOAI 2.16 following -the MLBatch [install instructions for RHOAI-v2.16](../setup.RHOAI-v2.16/CLUSTER-SETUP.md) -or the [stable stream upgrade instructions for RHOAI-V2.16](../setup.RHOAI-v2.16/UPGRADE-STABLE.md) -and are subscribed to the stable channel. - -Your subscription will have automatically created an unapproved -install plan to upgrade to RHOAI 2.19. - -Before beginning, verify that the expected install plan exists: -```sh -oc get ip -n redhat-ods-operator -``` -Typical output would be: -```sh -NAME CSV APPROVAL APPROVED -install-kpzzl rhods-operator.2.16.0 Manual false -install-nqrbp rhods-operator.2.19.0 Manual true -``` - -Assuming the install plan exists you can begin the upgrade process. - -There are no MLBatch modifications to the default RHOAI configuration maps -beyond those already made in previous installs. Therefore, you can simply -approve the install plan replacing the example plan name below with the actual -value on your cluster: -```sh -oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-kpzzl -``` diff --git a/setup.RHOAI-v2.19/default-flavor.yaml b/setup.RHOAI-v2.19/default-flavor.yaml deleted file mode 100644 index 6cbccf3..0000000 --- a/setup.RHOAI-v2.19/default-flavor.yaml +++ /dev/null @@ -1,4 +0,0 @@ -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ResourceFlavor -metadata: - name: default-flavor diff --git a/setup.RHOAI-v2.19/mlbatch-dsc.yaml b/setup.RHOAI-v2.19/mlbatch-dsc.yaml deleted file mode 100644 index 66336bc..0000000 --- a/setup.RHOAI-v2.19/mlbatch-dsc.yaml +++ /dev/null @@ -1,32 +0,0 @@ -apiVersion: datasciencecluster.opendatahub.io/v1 -kind: DataScienceCluster -metadata: - name: mlbatch-dsc -spec: - components: - codeflare: - managementState: Managed - dashboard: - managementState: Removed - datasciencepipelines: - managementState: Removed - kserve: - managementState: Removed - serving: - ingressGateway: - certificate: - type: SelfSigned - managementState: Removed - name: knative-serving - kueue: - managementState: Managed - modelmeshserving: - managementState: Removed - ray: - managementState: Managed - trainingoperator: - managementState: Managed - trustyai: - managementState: Removed - workbenches: - managementState: Removed diff --git a/setup.RHOAI-v2.19/mlbatch-dsci.yaml b/setup.RHOAI-v2.19/mlbatch-dsci.yaml deleted file mode 100644 index 77785c3..0000000 --- a/setup.RHOAI-v2.19/mlbatch-dsci.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: dscinitialization.opendatahub.io/v1 -kind: DSCInitialization -metadata: - name: mlbatch-dsci -spec: - applicationsNamespace: redhat-ods-applications - monitoring: - managementState: Managed - namespace: redhat-ods-monitoring - serviceMesh: - managementState: Removed - trustedCABundle: - customCABundle: "" - managementState: Managed diff --git a/setup.RHOAI-v2.19/mlbatch-edit-role.yaml b/setup.RHOAI-v2.19/mlbatch-edit-role.yaml deleted file mode 100644 index fd86cc6..0000000 --- a/setup.RHOAI-v2.19/mlbatch-edit-role.yaml +++ /dev/null @@ -1,151 
+0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: mlbatch-edit -rules: -- apiGroups: - - "" - resources: - - pods - verbs: - - delete - - get - - list - - watch -- apiGroups: - - apps - resources: - - deployments - - statefulsets - verbs: - - delete - - get - - list - - watch -- apiGroups: - - "" - resources: - - services - - secrets - - configmaps - - persistentvolumeclaims - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - kueue.x-k8s.io - resources: - - "*" - verbs: - - get - - list - - watch -- apiGroups: - - kubeflow.org - resources: - - pytorchjobs - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - ray.io - resources: - - rayjobs - - rayclusters - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - batch - resources: - - jobs - verbs: - - delete - - get - - list - - watch -- apiGroups: - - workload.codeflare.dev - resources: - - appwrappers - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - scheduling.k8s.io - resources: - - priorityclasses - verbs: - - get - - list - - watch -- apiGroups: - - scheduling.x-k8s.io - resources: - - podgroups - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - events - verbs: - - get - - list - - watch -- apiGroups: - - "" - resources: - - namespaces - - pods/log - verbs: - - get -- apiGroups: - - "" - resources: - - pods/exec - - pods/portforward - verbs: - - create -- apiGroups: - - route.openshift.io - resources: - - routes - verbs: - - get - - list - - watch - - delete -- apiGroups: - - "" - - project.openshift.io - resources: - - projects - verbs: - - get diff --git a/setup.RHOAI-v2.19/mlbatch-priorities.yaml b/setup.RHOAI-v2.19/mlbatch-priorities.yaml deleted file mode 100644 index 77c8f3b..0000000 --- a/setup.RHOAI-v2.19/mlbatch-priorities.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: low-priority -value: 1 -preemptionPolicy: PreemptLowerPriority -globalDefault: false -description: "This is the priority class for all lower priority jobs." ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: default-priority -value: 5 -preemptionPolicy: PreemptLowerPriority -globalDefault: true -description: "This is the priority class for all jobs (default priority)." ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: high-priority -value: 10 -preemptionPolicy: PreemptLowerPriority -globalDefault: false -description: "This is the priority class defined for highly important jobs that would evict lower and default priority jobs." 
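For context on how these classes are used: a workload opts in through the standard `priorityClassName` field of its pod template, and pods that set nothing inherit `default-priority` because it is marked `globalDefault`. The fragment below is purely illustrative; in this setup the enclosing workload would normally be submitted wrapped in an `AppWrapper`.
```yaml
# Illustrative fragment, not a complete manifest: the pod template of any
# wrapped workload (Job, PyTorchJob, RayJob, ...) selects a class this way.
spec:
  template:
    spec:
      priorityClassName: high-priority   # or low-priority / default-priority
      containers:
      - name: trainer                    # hypothetical container
        image: quay.io/project-codeflare/busybox:1.36
        command: ["sh", "-c", "echo high priority work"]
```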
diff --git a/setup.RHOAI-v2.19/mlbatch-subscription.yaml b/setup.RHOAI-v2.19/mlbatch-subscription.yaml deleted file mode 100644 index e667279..0000000 --- a/setup.RHOAI-v2.19/mlbatch-subscription.yaml +++ /dev/null @@ -1,310 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: redhat-ods-operator ---- -apiVersion: v1 -kind: Namespace -metadata: - name: redhat-ods-applications ---- -apiVersion: operators.coreos.com/v1 -kind: OperatorGroup -metadata: - name: rhods-operator - namespace: redhat-ods-operator ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: mlbatch-codeflare - namespace: redhat-ods-operator -data: - manager.yaml: | - apiVersion: apps/v1 - kind: Deployment - metadata: - name: manager - namespace: system - spec: - selector: - matchLabels: - app.kubernetes.io/name: codeflare-operator - app.kubernetes.io/part-of: codeflare - replicas: 1 - template: - metadata: - annotations: - kubectl.kubernetes.io/default-container: manager - labels: - app.kubernetes.io/name: codeflare-operator - app.kubernetes.io/part-of: codeflare - spec: - priorityClassName: system-node-critical - securityContext: - runAsNonRoot: true - # TODO(user): For common cases that do not require escalating privileges - # it is recommended to ensure that all your Pods/Containers are restrictive. - # More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted - # Please uncomment the following code if your project does NOT have to work on old Kubernetes - # versions < 1.20 or on vendors versions which do NOT support this field by default (i.e. Openshift < 4.11 ). - # seccompProfile: - # type: RuntimeDefault - containers: - - command: - - /manager - image: $(codeflare_operator_controller_image) - imagePullPolicy: Always - name: manager - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - "ALL" - env: - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - ports: - - containerPort: 8080 - protocol: TCP - name: metrics - livenessProbe: - httpGet: - path: /healthz - port: 8081 - initialDelaySeconds: 15 - periodSeconds: 20 - readinessProbe: - httpGet: - path: /readyz - port: 8081 - initialDelaySeconds: 5 - periodSeconds: 10 - resources: - limits: - cpu: "1" - memory: 1Gi - requests: - cpu: "1" - memory: 1Gi - serviceAccountName: controller-manager - terminationGracePeriodSeconds: 10 ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: codeflare-operator-config - namespace: redhat-ods-applications -data: - config.yaml: | - appwrapper: - enabled: true - Config: - autopilot: - injectAntiAffinities: true - monitorNodes: true - resourceTaints: - nvidia.com/gpu: - - key: autopilot.ibm.com/gpuhealth - value: ERR - effect: NoSchedule - - key: autopilot.ibm.com/gpuhealth - value: TESTING - effect: NoSchedule - - key: autopilot.ibm.com/gpuhealth - value: EVICT - effect: NoExecute - defaultQueueName: default-queue - enableKueueIntegrations: true - kueueJobReconciller: - manageJobsWithoutQueueName: true - waitForPodsReady: - blockAdmission: false - enable: false - schedulerName: scheduler-plugins-scheduler - slackQueueName: slack-cluster-queue - userRBACAdmissionCheck: false ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: mlbatch-kuberay - namespace: redhat-ods-operator -data: - kuberay-operator-image-patch.yaml: | - apiVersion: apps/v1 - kind: Deployment - metadata: - name: kuberay-operator - spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: kuberay-operator - image: $(image) ---- 
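The `resourceTaints` entries above reference node taints keyed on `autopilot.ibm.com/gpuhealth`: the `ERR` and `TESTING` values use `NoSchedule`, while `EVICT` uses `NoExecute` and also evicts running pods. A hedged spot check for nodes currently carrying such taints (plain `oc`, nothing MLBatch-specific assumed):
```sh
# Print each node name followed by its taint keys, then keep only Autopilot GPU-health hits.
oc get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.taints[*].key}{"\n"}{end}' \
  | grep autopilot.ibm.com/gpuhealth || echo "no Autopilot GPU-health taints found"
```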
-apiVersion: v1 -kind: ConfigMap -metadata: - name: mlbatch-kueue - namespace: redhat-ods-operator -data: - controller_manager_config.yaml: | - apiVersion: config.kueue.x-k8s.io/v1beta1 - kind: Configuration - health: - healthProbeBindAddress: :8081 - metrics: - bindAddress: :8080 - enableClusterQueueResources: true - webhook: - port: 9443 - leaderElection: - leaderElect: true - resourceName: c1f6bfd2.kueue.x-k8s.io - controller: - groupKindConcurrency: - Job.batch: 5 - Pod: 5 - Workload.kueue.x-k8s.io: 5 - LocalQueue.kueue.x-k8s.io: 1 - ClusterQueue.kueue.x-k8s.io: 1 - ResourceFlavor.kueue.x-k8s.io: 1 - clientConnection: - qps: 50 - burst: 100 - #pprofBindAddress: :8082 - waitForPodsReady: - enable: false - blockAdmission: false - manageJobsWithoutQueueName: true - #internalCertManagement: - # enable: false - # webhookServiceName: "" - # webhookSecretName: "" - integrations: - frameworks: - # - "batch/job" - - "kubeflow.org/mpijob" - - "ray.io/rayjob" - - "ray.io/raycluster" - - "jobset.x-k8s.io/jobset" - - "kubeflow.org/mxjob" - - "kubeflow.org/paddlejob" - - "kubeflow.org/pytorchjob" - - "kubeflow.org/tfjob" - - "kubeflow.org/xgboostjob" - # - "pod" - externalFrameworks: - - "AppWrapper.v1beta2.workload.codeflare.dev" - # podOptions: - # namespaceSelector: - # matchExpressions: - # - key: kubernetes.io/metadata.name - # operator: NotIn - # values: [ kube-system, kueue-system ] - fairSharing: - enable: true - preemptionStrategies: [LessThanOrEqualToFinalShare, LessThanInitialShare] - manager_config_patch.yaml: | - apiVersion: apps/v1 - kind: Deployment - metadata: - name: controller-manager - namespace: system - spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: manager - image: $(image) - args: - - "--config=/controller_manager_config.yaml" - - "--zap-log-level=2" - - "--feature-gates=LendingLimit=true" - volumeMounts: - - name: manager-config - mountPath: /controller_manager_config.yaml - subPath: controller_manager_config.yaml - volumes: - - name: manager-config - configMap: - name: manager-config ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: mlbatch-training-operator - namespace: redhat-ods-operator -data: - manager_config_patch.yaml: | - apiVersion: apps/v1 - kind: Deployment - metadata: - name: training-operator - spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: training-operator - image: $(image) - args: - - "--zap-log-level=2" - - "--gang-scheduler-name=scheduler-plugins-scheduler" - resources: - requests: - cpu: 100m - memory: 100Mi - limits: - cpu: 500m - memory: 1000Mi ---- -apiVersion: operators.coreos.com/v1alpha1 -kind: Subscription -metadata: - name: rhods-operator - namespace: redhat-ods-operator -spec: - channel: stable - installPlanApproval: Manual - name: rhods-operator - source: redhat-operators - sourceNamespace: openshift-marketplace - startingCSV: rhods-operator.2.19.0 - config: - env: - - name: "DISABLE_DSC_CONFIG" - volumeMounts: - - name: mlbatch-codeflare - mountPath: /opt/manifests/codeflare/manager/manager.yaml - subPath: manager.yaml - - name: mlbatch-kuberay - mountPath: /opt/manifests/ray/openshift/kuberay-operator-image-patch.yaml - subPath: kuberay-operator-image-patch.yaml - - name: mlbatch-kueue - mountPath: /opt/manifests/kueue/components/manager/controller_manager_config.yaml - subPath: controller_manager_config.yaml - - name: mlbatch-kueue - mountPath: /opt/manifests/kueue/rhoai/manager_config_patch.yaml - subPath: manager_config_patch.yaml - - name: 
mlbatch-training-operator - mountPath: /opt/manifests/trainingoperator/rhoai/manager_config_patch.yaml - subPath: manager_config_patch.yaml - volumes: - - name: mlbatch-codeflare - configMap: - name: mlbatch-codeflare - - name: mlbatch-kuberay - configMap: - name: mlbatch-kuberay - - name: mlbatch-kueue - configMap: - name: mlbatch-kueue - - name: mlbatch-training-operator - configMap: - name: mlbatch-training-operator diff --git a/setup.RHOAI-v2.19/scheduler-priority-patch.yaml b/setup.RHOAI-v2.19/scheduler-priority-patch.yaml deleted file mode 100644 index 278802f..0000000 --- a/setup.RHOAI-v2.19/scheduler-priority-patch.yaml +++ /dev/null @@ -1,3 +0,0 @@ -- op: add - path: /spec/template/spec/priorityClassName - value: system-node-critical diff --git a/setup.k8s/CLUSTER-SETUP.md b/setup.k8s/CLUSTER-SETUP.md deleted file mode 100644 index 9ce72c1..0000000 --- a/setup.k8s/CLUSTER-SETUP.md +++ /dev/null @@ -1,193 +0,0 @@ -# Cluster Setup - -The cluster setup installs and configures the following components: -+ Scheduler Plugins -+ Kubeflow Training Operator -+ KubeRay -+ Kueue -+ AppWrappers -+ Cluster roles and priority classes -+ Autopilot - -## Priorities - -Create `default-priority`, `high-priority`, and `low-priority` priority classes: -```sh -kubectl apply -f setup.k8s/mlbatch-priorities.yaml -``` - -## Scheduler Configuration - -MLBatch configures Kubernetes scheduling to accomplish two objectives: -+ Obtaining gang (all or nothing) scheduling for multi-Pod workloads. -+ Packing Pods whose GPU request is less than the number of GPUs on a Node to - maximize the number of Nodes available for Pods that request all the GPUs on a Node. - -The currently recommend way to do this is by installing the Coscheduling out-of-tree scheduler -plugin and configuring the default NodeResourcesFit scheduler plugin to pack in the GPU dimension. -Alternatively, you can skip the helm install and patch commands shown below and instead install -the experimental Sakkara scheduler plugin (described next). - - -```sh -helm install scheduler-plugins --namespace scheduler-plugins --create-namespace \ - scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ \ - --set-json pluginConfig='[{"args":{"scoringStrategy":{"resources":[{"name":"nvidia.com/gpu","weight":1}],"requestedToCapacityRatio":{"shape":[{"utilization":0,"score":0},{"utilization":100,"score":10}]},"type":"RequestedToCapacityRatio"}},"name":"NodeResourcesFit"},{"args":{"permitWaitingTimeSeconds":300},"name":"Coscheduling"}]' -``` -Patch scheduler-plugins pod priorities: -```sh -kubectl patch deployment -n scheduler-plugins --type=json --patch-file setup.k8s/scheduler-priority-patch.yaml scheduler-plugins-controller -kubectl patch deployment -n scheduler-plugins --type=json --patch-file setup.k8s/scheduler-priority-patch.yaml scheduler-plugins-scheduler -``` - -### Sakkara - -[Sakkara](https://github.com/atantawi/scheduler-plugins/tree/sakkara) is an experimental -new scheduler plugin with advanced support for topology-aware scheduling. - -Install Sakkara as a secondary scheduler: -```sh -helm install sakkara-scheduler --namespace sakkara-scheduler --create-namespace mlbatch/sakkara-scheduler -``` -Optionally, create a config map capturing your cluster's topology as described in the [Sakkara documentation](https://github.com/atantawi/sakkara-deploy/tree/main?tab=readme-ov-file#cluster-topology). This step is optional but recommended for production clusters. 
If the config map is not present Sakkara will default to a single-level hierarchy containing the Nodes of the cluster. - -## Install Operators - -Create the mlbatch-system namespace -```sh -kubectl create namespace mlbatch-system -``` - -Install the Kubeflow Training Operator - -If you are using Coscheduling do: -```sh -kubectl apply --server-side -k setup.k8s/training-operator/coscheduling -``` -If you are using Sakkara do: -```sh -kubectl apply --server-side -k setup.k8s/training-operator/sakkara -``` - -Install the KubeRay Operator -```sh -kubectl apply --server-side -k setup.k8s/kuberay -``` - -Install Kueue -```sh -kubectl apply --server-side -k setup.k8s/kueue -``` - -Install the AppWrapper Operator -If you are using Coscheduling do: -```sh -kubectl apply --server-side -k setup.k8s/appwrapper/coscheduling -``` -If you are using Sakkara do: -```sh -kubectl apply --server-side -k setup.k8s/appwrapper/sakkara -``` - -The provided configuration differs from the default configuration of the -operators as follows: -- Kubeflow Training Operator: - - `gang-scheduler-name` is set to either `scheduler-plugins-scheduler` or `sakkara-scheduler`, -- Kueue: - - `batch/job` integration is disabled, - - `manageJobsWithoutQueueName` is enabled and configured via `managedJobsNamespaceSelector` to be - scoped to only namespaces that are labeled with `mlbatch-team-namespace=true`. - - `waitForPodsReady` is disabled, - - `LendingLimit` feature gate is enabled, - - `fairSharing` is enabled, - - `enableClusterQueueResources` metrics is enabled, -- AppWrapper operator: - - `userRBACAdmissionCheck` is disabled, - - `schedulerName` is set to `scheduler-plugins-scheduler` or `sakkara-scheduler`, - - `queueName` is set to `default-queue`, -- pod priorities, resource requests and limits have been adjusted. - -## Autopilot - -Helm charts values and how-to for customization can be found [in the official documentation](https://github.com/IBM/autopilot/blob/main/helm-charts/autopilot/README.md). As-is, Autopilot will run on GPU nodes. - -- Add the Autopilot Helm repository - -```bash -helm repo add autopilot https://ibm.github.io/autopilot/ -helm repo update -``` - -- Install the chart (idempotent command). The config file is for customizing the helm values and it is optional. - -```bash -helm upgrade autopilot autopilot/autopilot --install --namespace=autopilot --create-namespace -f your-config.yml -``` - -### Enabling Prometheus metrics - -The `ServiceMonitor` object is the one that enables Prometheus to scrape the metrics produced by Autopilot. -In order for Prometheus to find the right objects, the `ServiceMonitor` needs to be annotated with the Prometheus' release name. It is usually `prometheus`, and that's the default added in the Autopilot release. -If that is not the case in your cluster, the correct release label can be found by checking in the `ServiceMonitor` of Prometheus itself, or the name of Prometheus helm chart. 
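One way to look up that release label (a sketch; it assumes your user can list `ServiceMonitor` objects across namespaces):
```bash
# Inspect existing ServiceMonitors and look for the release=<name> label carried by your Prometheus.
kubectl get servicemonitors.monitoring.coreos.com -A --show-labels | grep -i 'release='
```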
-Then, Autopilot's `ServiceMonitor` can be labeled with the following command - -```bash -kubectl label servicemonitors.monitoring.coreos.com -n autopilot autopilot-metrics-monitor release= --overwrite -``` - -## Kueue Configuration - -Create Kueue's default flavor: -```sh -kubectl apply -f setup.k8s/default-flavor.yaml -``` - -## Cluster Role - -Create `mlbatch-edit` role: -```sh -kubectl apply -f setup.k8s/mlbatch-edit-role.yaml -``` - -## Slack Cluster Queue - -Create the designated slack `ClusterQueue` which will be used to automate -minor adjustments to cluster capacity caused by node failures and -scheduler maintanence. -```sh -kubectl apply -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: slack-cluster-queue -spec: - namespaceSelector: {} - cohort: default-cohort - preemption: - withinClusterQueue: LowerOrNewerEqualPriority - reclaimWithinCohort: Any - borrowWithinCohort: - policy: Never - resourceGroups: - - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"] - flavors: - - name: default-flavor - resources: - - name: "cpu" - nominalQuota: 8000m - - name: "memory" - nominalQuota: 128Gi - - name: "nvidia.com/gpu" - nominalQuota: 8 - - name: "nvidia.com/roce_gdr" - nominalQuota: 1 - - name: "pods" - nominalQuota: 100 -EOF -``` -Edit the above quantities to adjust the quota to the desired -values. Pod counts are optional and can be omitted from the list of -covered resources. The `lendingLimit` for each resource will be -dynamically adjusted by the MLBatch system to reflect reduced cluster -capacity. See [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md) for a -detailed discussion of the role of the slack `ClusterQueue`. diff --git a/setup.k8s/TEAM-SETUP.md b/setup.k8s/TEAM-SETUP.md deleted file mode 100644 index 3f1fc38..0000000 --- a/setup.k8s/TEAM-SETUP.md +++ /dev/null @@ -1,97 +0,0 @@ -# Team Setup - -A *team* in MLBatch is a group of users that share a resource quota. - -Before setting up your teams and quotas, please read [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md) -for a discussion of our recommended best practices. - -Setting up a new team requires the cluster admin to create a namespace, -a quota, a queue, and the required role bindings as described below. 
- -Create and label the namespace: -```sh -kubectl create namespace team1 -kubectl label namespace team1 'mlbatch-team-namespace=true' -``` - -For each user on the team, create a RoleBinding: -```sh -kubectl -n team1 apply -f- << EOF -kind: RoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: user-one -subjects: - - kind: User - apiGroup: rbac.authorization.k8s.io - name: user-one -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: mlbatch-edit -EOF -``` - -Specify the intended quota for the namespace by creating a `ClusterQueue`: -```sh -kubectl apply -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: team1-cluster-queue -spec: - namespaceSelector: {} - cohort: default-cohort - preemption: - withinClusterQueue: LowerOrNewerEqualPriority - reclaimWithinCohort: Any - borrowWithinCohort: - policy: Never - resourceGroups: - - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"] - flavors: - - name: default-flavor - resources: - - name: "cpu" - nominalQuota: 8000m - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "memory" - nominalQuota: 128Gi - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "nvidia.com/gpu" - nominalQuota: 16 - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "nvidia.com/roce_gdr" - nominalQuota: 4 - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "pods" - nominalQuota: 100 - # borrowingLimit: 0 - # lendingLimit: 0 -EOF -``` -Edit the above quantities to adjust the quota to the desired values. Pod counts -are optional and can be omitted from the list of covered resources. - -Uncomment all `borrowingLimit` lines to prevent this namespace from borrowing -quota from other namespaces. Uncomment all `lendingLimit` lines to prevent other -namespaces from borrowing quota from this namespace. - -Create a `LocalQueue` to bind the `ClusterQueue` to the namespace: -```sh -kubectl apply -n team1 -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: LocalQueue -metadata: - name: default-queue -spec: - clusterQueue: team1-cluster-queue -EOF -``` -We recommend naming the local queue `default-queue` as `AppWrappers` will -default to this queue name. 
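With the namespace, `ClusterQueue`, and `LocalQueue` in place, team members submit work through the queue. A minimal sketch, assuming the `team1` namespace and `default-queue` created above (the AppWrapper name, wrapped Pod, and image are illustrative only):
```sh
kubectl -n team1 apply -f- << EOF
apiVersion: workload.codeflare.dev/v1beta2
kind: AppWrapper
metadata:
  name: hello-appwrapper
  labels:
    kueue.x-k8s.io/queue-name: default-queue
spec:
  components:
  - template:
      apiVersion: v1
      kind: Pod
      metadata:
        name: hello
      spec:
        restartPolicy: Never
        containers:
        - name: hello
          image: busybox
          command: ["echo", "hello from team1"]
          resources:
            requests:
              cpu: 100m
              memory: 64Mi
            limits:
              cpu: 100m
              memory: 64Mi
EOF
```
Admission is then governed by the team's quota: `kubectl -n team1 get workloads.kueue.x-k8s.io` shows whether the wrapped Pod has been admitted or is still queued.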
- diff --git a/setup.k8s/UNINSTALL.md b/setup.k8s/UNINSTALL.md deleted file mode 100644 index 70ec929..0000000 --- a/setup.k8s/UNINSTALL.md +++ /dev/null @@ -1,27 +0,0 @@ -# Uninstall - -***First, remove all team namespaces and corresponding cluster queues.*** - -Then to uninstall the MLBatch controllers and reclaim the corresponding -namespaces, do the following: -```sh -# Delete operators and CRDs -kubectl delete -k setup.k8s/appwrapper/base -kubectl delete -k setup.k8s/kueue -kubectl delete -k setup.k8s/kuberay -kubectl delete -k setup.k8s/training-operator/base - -# Delete namespace -kubectl delete namespace mlbatch-system - -# Delete clusterole -kubectl delete clusterrole mlbatch-edit - -# Coscheduler uninstall -helm uninstall -n scheduler-plugins scheduler-plugins -kubectl delete namespace scheduler-plugins - -# Sakkara uninstall -helm uninstall -n sakkara-scheduler sakkara-scheduler -kubectl delete namespace sakkara-scheduler -``` diff --git a/setup.k8s/appwrapper/base/kustomization.yaml b/setup.k8s/appwrapper/base/kustomization.yaml deleted file mode 100644 index 440f25b..0000000 --- a/setup.k8s/appwrapper/base/kustomization.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -namespace: mlbatch-system - -resources: -- "https://github.com/project-codeflare/appwrapper/config/default?ref=v0.30.0" - -labels: -- pairs: - app.kubernetes.io/name: appwrapper - app.kubernetes.io/component: controller - includeSelectors: true - -images: -- name: quay.io/ibm/appwrapper - newTag: v0.30.0 - -patches: -- path: manager_resources_patch.yaml -- path: remove_default_namespace.yaml diff --git a/setup.k8s/appwrapper/base/manager_resources_patch.yaml b/setup.k8s/appwrapper/base/manager_resources_patch.yaml deleted file mode 100644 index 1b26c3c..0000000 --- a/setup.k8s/appwrapper/base/manager_resources_patch.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: controller-manager - namespace: system -spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: manager - resources: - requests: - cpu: 250m - memory: 250Mi - limits: - cpu: 1000m - memory: 1000Mi diff --git a/setup.k8s/appwrapper/base/remove_default_namespace.yaml b/setup.k8s/appwrapper/base/remove_default_namespace.yaml deleted file mode 100644 index b63fb95..0000000 --- a/setup.k8s/appwrapper/base/remove_default_namespace.yaml +++ /dev/null @@ -1,5 +0,0 @@ -$patch: delete -apiVersion: v1 -kind: Namespace -metadata: - name: appwrapper-system diff --git a/setup.k8s/appwrapper/coscheduling/config_patch.yaml b/setup.k8s/appwrapper/coscheduling/config_patch.yaml deleted file mode 100644 index 6e1b592..0000000 --- a/setup.k8s/appwrapper/coscheduling/config_patch.yaml +++ /dev/null @@ -1,23 +0,0 @@ -kind: ConfigMap -apiVersion: v1 -metadata: - name: appwrapper-operator-config - namespace: appwrapper-system -data: - config.yaml: | - appwrapper: - enableKueueIntegrations: true - kueueJobReconciller: - manageJobsWithoutQueueName: true - waitForPodsReady: - enable: false - defaultQueueName: default-queue - schedulerName: scheduler-plugins-scheduler - slackQueueName: slack-cluster-queue - userRBACAdmissionCheck: false - controllerManager: - health: - bindAddress: ":8081" - metrics: - bindAddress: "127.0.0.1:8080" - leaderElection: true diff --git a/setup.k8s/appwrapper/coscheduling/kustomization.yaml b/setup.k8s/appwrapper/coscheduling/kustomization.yaml deleted file mode 100644 index c651d8a..0000000 --- 
a/setup.k8s/appwrapper/coscheduling/kustomization.yaml +++ /dev/null @@ -1,10 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -namespace: mlbatch-system - -resources: -- ../base - -patches: -patches: -- path: config_patch.yaml diff --git a/setup.k8s/appwrapper/sakkara/config_patch.yaml b/setup.k8s/appwrapper/sakkara/config_patch.yaml deleted file mode 100644 index f657b58..0000000 --- a/setup.k8s/appwrapper/sakkara/config_patch.yaml +++ /dev/null @@ -1,23 +0,0 @@ -kind: ConfigMap -apiVersion: v1 -metadata: - name: appwrapper-operator-config - namespace: appwrapper-system -data: - config.yaml: | - appwrapper: - enableKueueIntegrations: true - kueueJobReconciller: - manageJobsWithoutQueueName: true - waitForPodsReady: - enable: false - defaultQueueName: default-queue - schedulerName: sakkara-scheduler - slackQueueName: slack-cluster-queue - userRBACAdmissionCheck: false - controllerManager: - health: - bindAddress: ":8081" - metrics: - bindAddress: "127.0.0.1:8080" - leaderElection: true diff --git a/setup.k8s/appwrapper/sakkara/kustomization.yaml b/setup.k8s/appwrapper/sakkara/kustomization.yaml deleted file mode 100644 index c651d8a..0000000 --- a/setup.k8s/appwrapper/sakkara/kustomization.yaml +++ /dev/null @@ -1,10 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -namespace: mlbatch-system - -resources: -- ../base - -patches: -patches: -- path: config_patch.yaml diff --git a/setup.k8s/default-flavor.yaml b/setup.k8s/default-flavor.yaml deleted file mode 100644 index 6cbccf3..0000000 --- a/setup.k8s/default-flavor.yaml +++ /dev/null @@ -1,4 +0,0 @@ -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ResourceFlavor -metadata: - name: default-flavor diff --git a/setup.k8s/kind/kind-config.yaml b/setup.k8s/kind/kind-config.yaml deleted file mode 100644 index f5d7a9e..0000000 --- a/setup.k8s/kind/kind-config.yaml +++ /dev/null @@ -1,13 +0,0 @@ -# this config file contains all config fields with comments -kind: Cluster -apiVersion: kind.x-k8s.io/v1alpha4 -# 1 control plane node and 1 worker node -nodes: -# the control plane node config -- role: control-plane - # kubernetes version 1.27.17 from kind v0.24.0 - image: kindest/node:v1.27.17@sha256:3fd82731af34efe19cd54ea5c25e882985bafa2c9baefe14f8deab1737d9fabe -# the worker -- role: worker - # kubernetes version 1.27.17 from kind v0.24.0 - image: kindest/node:v1.27.17@sha256:3fd82731af34efe19cd54ea5c25e882985bafa2c9baefe14f8deab1737d9fabe diff --git a/setup.k8s/kuberay/kustomization.yaml b/setup.k8s/kuberay/kustomization.yaml deleted file mode 100644 index 0161395..0000000 --- a/setup.k8s/kuberay/kustomization.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -namespace: mlbatch-system - -resources: -- "https://github.com/ray-project/kuberay/ray-operator/config/default?ref=v1.1.0" - -labels: -- pairs: - app.kubernetes.io/name: kuberay - app.kubernetes.io/component: controller - includeSelectors: true - -patches: -- path: remove_default_namespace.yaml -- path: manager_resources_patch.yaml diff --git a/setup.k8s/kuberay/manager_resources_patch.yaml b/setup.k8s/kuberay/manager_resources_patch.yaml deleted file mode 100644 index 7bb80d9..0000000 --- a/setup.k8s/kuberay/manager_resources_patch.yaml +++ /dev/null @@ -1,20 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: kuberay-operator - namespace: system -spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: kuberay-operator - args: - - 
"--zap-log-level=2" - resources: - requests: - cpu: 100m - memory: 100Mi - limits: - cpu: 500m - memory: 1000Mi diff --git a/setup.k8s/kuberay/remove_default_namespace.yaml b/setup.k8s/kuberay/remove_default_namespace.yaml deleted file mode 100644 index b5977cc..0000000 --- a/setup.k8s/kuberay/remove_default_namespace.yaml +++ /dev/null @@ -1,5 +0,0 @@ -$patch: delete -apiVersion: v1 -kind: Namespace -metadata: - name: ray-system diff --git a/setup.k8s/kueue/controller_manager_config.yaml b/setup.k8s/kueue/controller_manager_config.yaml deleted file mode 100644 index 0f395ac..0000000 --- a/setup.k8s/kueue/controller_manager_config.yaml +++ /dev/null @@ -1,76 +0,0 @@ -apiVersion: config.kueue.x-k8s.io/v1beta1 -kind: Configuration -health: - healthProbeBindAddress: :8081 -metrics: - bindAddress: :8080 - enableClusterQueueResources: true -webhook: - port: 9443 -leaderElection: - leaderElect: true - resourceName: c1f6bfd2.kueue.x-k8s.io -controller: - groupKindConcurrency: -# Job.batch: 5 - Pod: 5 - Workload.kueue.x-k8s.io: 5 - LocalQueue.kueue.x-k8s.io: 1 - Cohort.kueue.x-k8s.io: 1 - ClusterQueue.kueue.x-k8s.io: 1 - ResourceFlavor.kueue.x-k8s.io: 1 -clientConnection: - qps: 50 - burst: 100 -#pprofBindAddress: :8083 -waitForPodsReady: - enable: false -# timeout: 5m -# blockAdmission: false -# requeuingStrategy: -# timestamp: Eviction -# backoffLimitCount: null # null indicates infinite requeuing -# backoffBaseSeconds: 60 -# backoffMaxSeconds: 3600 -manageJobsWithoutQueueName: true -managedJobsNamespaceSelector: - matchLabels: - mlbatch-team-namespace: "true" -#internalCertManagement: -# enable: false -# webhookServiceName: "" -# webhookSecretName: "" -integrations: - frameworks: -# - "batch/job" - - "kubeflow.org/mpijob" - - "ray.io/rayjob" - - "ray.io/raycluster" - - "jobset.x-k8s.io/jobset" - - "kubeflow.org/mxjob" - - "kubeflow.org/paddlejob" - - "kubeflow.org/pytorchjob" - - "kubeflow.org/tfjob" - - "kubeflow.org/xgboostjob" -# - "pod" -# - "deployment" # requires enabling pod integration -# - "statefulset" # requires enabling pod integration - externalFrameworks: - - "AppWrapper.v1beta2.workload.codeflare.dev" -# podOptions: -# namespaceSelector: -# matchExpressions: -# - key: kubernetes.io/metadata.name -# operator: NotIn -# values: [ kube-system, kueue-system ] -fairSharing: - enable: true - preemptionStrategies: [LessThanOrEqualToFinalShare, LessThanInitialShare] -#resources: -# excludeResourcePrefixes: [] -# transformations: -# - input: nvidia.com/mig-4g.5gb -# strategy: Replace | Retain -# outputs: -# example.com/accelerator-memory: 5Gi -# example.com/accelerator-gpc: 4 diff --git a/setup.k8s/kueue/kustomization.yaml b/setup.k8s/kueue/kustomization.yaml deleted file mode 100644 index 5b7004c..0000000 --- a/setup.k8s/kueue/kustomization.yaml +++ /dev/null @@ -1,53 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -namespace: mlbatch-system - -resources: -- "https://github.com/kubernetes-sigs/kueue/config/default?ref=v0.10.2" - -labels: -- pairs: - app.kubernetes.io/name: kueue - app.kubernetes.io/component: controller - includeSelectors: true - -configMapGenerator: -- name: manager-config - namespace: kueue-system - behavior: replace - files: - - controller_manager_config.yaml - -images: -- name: us-central1-docker.pkg.dev/k8s-staging-images/kueue/kueue - newName: registry.k8s.io/kueue/kueue - newTag: v0.10.2 - -patches: -- path: manager_resources_patch.yaml -- path: mutating_webhook_patch.yaml -- path: remove_default_namespace.yaml -- path: 
validating_webhook_patch.yaml -- target: - kind: ClusterRole - name: manager-role - patch: | - - op: add - path: /rules/- - value: - apiGroups: - - workload.codeflare.dev - resources: - - appwrappers - verbs: - - get - - list - - watch -- target: - kind: Deployment - name: controller-manager - patch: | - - op: add - path: /spec/template/spec/containers/0/args/- - value: "--feature-gates=LendingLimit=true" diff --git a/setup.k8s/kueue/manager_resources_patch.yaml b/setup.k8s/kueue/manager_resources_patch.yaml deleted file mode 100644 index 5dc7501..0000000 --- a/setup.k8s/kueue/manager_resources_patch.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: controller-manager - namespace: system -spec: - template: - spec: - priorityClassName: system-node-critical diff --git a/setup.k8s/kueue/mutating_webhook_patch.yaml b/setup.k8s/kueue/mutating_webhook_patch.yaml deleted file mode 100644 index 61d0e1d..0000000 --- a/setup.k8s/kueue/mutating_webhook_patch.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: admissionregistration.k8s.io/v1 -kind: MutatingWebhookConfiguration -metadata: - name: mutating-webhook-configuration -webhooks: - - $patch: delete - name: mpod.kb.io - - $patch: delete - name: mjob.kb.io diff --git a/setup.k8s/kueue/remove_default_namespace.yaml b/setup.k8s/kueue/remove_default_namespace.yaml deleted file mode 100644 index 787ee88..0000000 --- a/setup.k8s/kueue/remove_default_namespace.yaml +++ /dev/null @@ -1,5 +0,0 @@ -$patch: delete -apiVersion: v1 -kind: Namespace -metadata: - name: kueue-system diff --git a/setup.k8s/kueue/validating_webhook_patch.yaml b/setup.k8s/kueue/validating_webhook_patch.yaml deleted file mode 100644 index 3fe0342..0000000 --- a/setup.k8s/kueue/validating_webhook_patch.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: admissionregistration.k8s.io/v1 -kind: ValidatingWebhookConfiguration -metadata: - name: validating-webhook-configuration -webhooks: - - $patch: delete - name: vpod.kb.io - - $patch: delete - name: vjob.kb.io diff --git a/setup.k8s/mlbatch-edit-role.yaml b/setup.k8s/mlbatch-edit-role.yaml deleted file mode 100644 index a3db811..0000000 --- a/setup.k8s/mlbatch-edit-role.yaml +++ /dev/null @@ -1,135 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: mlbatch-edit -rules: -- apiGroups: - - "" - resources: - - pods - verbs: - - delete - - get - - list - - watch -- apiGroups: - - apps - resources: - - deployments - - statefulsets - verbs: - - delete - - get - - list - - watch -- apiGroups: - - "" - resources: - - services - - secrets - - configmaps - - persistentvolumeclaims - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - kueue.x-k8s.io - resources: - - "*" - verbs: - - get - - list - - watch -- apiGroups: - - kubeflow.org - resources: - - pytorchjobs - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - ray.io - resources: - - rayjobs - - rayclusters - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - batch - resources: - - jobs - verbs: - - delete - - get - - list - - watch -- apiGroups: - - workload.codeflare.dev - resources: - - appwrappers - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - scheduling.k8s.io - resources: - - priorityclasses - verbs: - - get - - list - - watch -- apiGroups: - - scheduling.x-k8s.io - resources: - - podgroups - verbs: - - create - - delete - - get - - 
list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - events - verbs: - - get - - list - - watch -- apiGroups: - - "" - resources: - - namespaces - - pods/log - verbs: - - get -- apiGroups: - - "" - resources: - - pods/exec - - pods/portforward - verbs: - - create diff --git a/setup.k8s/mlbatch-priorities.yaml b/setup.k8s/mlbatch-priorities.yaml deleted file mode 100644 index 77c8f3b..0000000 --- a/setup.k8s/mlbatch-priorities.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: low-priority -value: 1 -preemptionPolicy: PreemptLowerPriority -globalDefault: false -description: "This is the priority class for all lower priority jobs." ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: default-priority -value: 5 -preemptionPolicy: PreemptLowerPriority -globalDefault: true -description: "This is the priority class for all jobs (default priority)." ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: high-priority -value: 10 -preemptionPolicy: PreemptLowerPriority -globalDefault: false -description: "This is the priority class defined for highly important jobs that would evict lower and default priority jobs." diff --git a/setup.k8s/scheduler-priority-patch.yaml b/setup.k8s/scheduler-priority-patch.yaml deleted file mode 100644 index 278802f..0000000 --- a/setup.k8s/scheduler-priority-patch.yaml +++ /dev/null @@ -1,3 +0,0 @@ -- op: add - path: /spec/template/spec/priorityClassName - value: system-node-critical diff --git a/setup.k8s/training-operator/base/kustomization.yaml b/setup.k8s/training-operator/base/kustomization.yaml deleted file mode 100644 index 6aa6dc2..0000000 --- a/setup.k8s/training-operator/base/kustomization.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -namespace: mlbatch-system - -resources: -- "https://github.com/kubeflow/training-operator/manifests/base?ref=v1.7.0" - -labels: -- pairs: - app.kubernetes.io/name: training-operator - app.kubernetes.io/component: controller - includeSelectors: true - -images: -- name: kubeflow/training-operator - newTag: "v1-855e096" - -patches: -- path: manager_resources_patch.yaml diff --git a/setup.k8s/training-operator/base/manager_resources_patch.yaml b/setup.k8s/training-operator/base/manager_resources_patch.yaml deleted file mode 100644 index 93f052b..0000000 --- a/setup.k8s/training-operator/base/manager_resources_patch.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: training-operator -spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: training-operator - args: - - "--zap-log-level=2" - resources: - requests: - cpu: 100m - memory: 100Mi - limits: - cpu: 500m - memory: 1000Mi diff --git a/setup.k8s/training-operator/coscheduling/kustomization.yaml b/setup.k8s/training-operator/coscheduling/kustomization.yaml deleted file mode 100644 index dc8e5ba..0000000 --- a/setup.k8s/training-operator/coscheduling/kustomization.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -namespace: mlbatch-system - -resources: -- ../base - -patches: -- target: - kind: Deployment - name: training-operator - patch: | - - op: add - path: /spec/template/spec/containers/0/args/- - value: "--gang-scheduler-name=scheduler-plugins-scheduler" diff --git a/setup.k8s/training-operator/sakkara/kustomization.yaml 
b/setup.k8s/training-operator/sakkara/kustomization.yaml deleted file mode 100644 index 4b40383..0000000 --- a/setup.k8s/training-operator/sakkara/kustomization.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -namespace: mlbatch-system - -resources: -- ../base - -patches: -- target: - kind: Deployment - name: training-operator - patch: | - - op: add - path: /spec/template/spec/containers/0/args/- - value: "--gang-scheduler-name=sakkara-scheduler" diff --git a/setup.tmpl/CLUSTER-SETUP.md.tmpl b/setup.tmpl/CLUSTER-SETUP.md.tmpl deleted file mode 100644 index 1cb3f8d..0000000 --- a/setup.tmpl/CLUSTER-SETUP.md.tmpl +++ /dev/null @@ -1,291 +0,0 @@ -# Cluster Setup - -{{ if .RHOAI -}} -The cluster setup installs Red Hat OpenShift AI and configures Scheduler Plugins, Kueue, -cluster roles, and priority classes. - -{{- else -}} -The cluster setup installs and configures the following components: -+ Scheduler Plugins -+ Kubeflow Training Operator -+ KubeRay -+ Kueue -+ AppWrappers -+ Cluster roles and priority classes -+ Autopilot - -{{- end }} - -## Priorities - -Create `default-priority`, `high-priority`, and `low-priority` priority classes: -```sh -{{ .KUBECTL }} apply -f setup.{{ .VERSION }}/mlbatch-priorities.yaml -``` - -## Scheduler Configuration - -MLBatch configures Kubernetes scheduling to accomplish two objectives: -+ Obtaining gang (all or nothing) scheduling for multi-Pod workloads. -+ Packing Pods whose GPU request is less than the number of GPUs on a Node to - maximize the number of Nodes available for Pods that request all the GPUs on a Node. - -{{ if .RHOAI -}} -This is done by installing the Coscheduling out-of-tree scheduler plugin and configuring -the default NodeResourcesFit scheduler plugin to pack in the GPU dimension. -{{- else -}} -The currently recommend way to do this is by installing the Coscheduling out-of-tree scheduler -plugin and configuring the default NodeResourcesFit scheduler plugin to pack in the GPU dimension. -Alternatively, you can skip the helm install and patch commands shown below and instead install -the experimental Sakkara scheduler plugin (described next). -{{- end }} - - -```sh -helm install scheduler-plugins --namespace scheduler-plugins --create-namespace \ - scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ \ - --set-json pluginConfig='[{"args":{"scoringStrategy":{"resources":[{"name":"nvidia.com/gpu","weight":1}],"requestedToCapacityRatio":{"shape":[{"utilization":0,"score":0},{"utilization":100,"score":10}]},"type":"RequestedToCapacityRatio"}},"name":"NodeResourcesFit"},{"args":{"permitWaitingTimeSeconds":300},"name":"Coscheduling"}]' -``` -Patch scheduler-plugins pod priorities: -```sh -{{ .KUBECTL }} patch deployment -n scheduler-plugins --type=json --patch-file setup.{{ .VERSION }}/scheduler-priority-patch.yaml scheduler-plugins-controller -{{ .KUBECTL }} patch deployment -n scheduler-plugins --type=json --patch-file setup.{{ .VERSION }}/scheduler-priority-patch.yaml scheduler-plugins-scheduler -``` - -{{ if not .RHOAI -}} -### Sakkara - -[Sakkara](https://github.com/atantawi/scheduler-plugins/tree/sakkara) is an experimental -new scheduler plugin with advanced support for topology-aware scheduling. 
- -Install Sakkara as a secondary scheduler: -```sh -helm install sakkara-scheduler --namespace sakkara-scheduler --create-namespace mlbatch/sakkara-scheduler -``` -Optionally, create a config map capturing your cluster's topology as described in the [Sakkara documentation](https://github.com/atantawi/sakkara-deploy/tree/main?tab=readme-ov-file#cluster-topology). This step is optional but recommended for production clusters. If the config map is not present Sakkara will default to a single-level hierarchy containing the Nodes of the cluster. -{{- end }} - -{{ if .RHOAI -}} -## Red Hat OpenShift AI - -Create the Red Hat OpenShift AI subscription: -```sh -{{ .KUBECTL }} apply -f setup.{{ .VERSION }}/mlbatch-subscription.yaml -```` -Identify install plan: -```sh -{{ .KUBECTL }} get ip -n redhat-ods-operator -``` -``` -NAMESPACE NAME CSV APPROVAL APPROVED -redhat-ods-operator install-kmh8w rhods-operator.2.16.0 Manual false -``` -Approve install plan replacing the generated plan name below with the actual -value: -```sh -{{ .KUBECTL }} patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-kmh8w -``` -Create DSC Initialization: -```sh -{{ .KUBECTL }} apply -f setup.{{ .VERSION }}/mlbatch-dsci.yaml -``` -Create Data Science Cluster: -```sh -{{ .KUBECTL }} apply -f setup.{{ .VERSION }}/mlbatch-dsc.yaml -``` -The provided DSCI and DSC are intended to install a minimal set of Red Hat OpenShift -AI managed components: `codeflare`, `kueue`, `ray`, and `trainingoperator`. The -remaining components such as `dashboard` can be optionally enabled. - -The configuration of the managed components differs from the default Red Hat OpenShift -AI configuration as follows: -- Kubeflow Training Operator: - - `gang-scheduler-name` is set to `scheduler-plugins-scheduler`, -- Kueue: - - `manageJobsWithoutQueueName` is enabled, - - `batch/job` integration is disabled, - - `waitForPodsReady` is disabled, - - `LendingLimit` feature gate is enabled, -{{- if .FAIRSHARE }} - - `fairSharing` is enabled, -{{- end }} - - `enableClusterQueueResources` metrics is enabled, -- Codeflare operator: - - the AppWrapper controller is enabled and configured as follows: - - `userRBACAdmissionCheck` is disabled, - - `schedulerName` is set to `scheduler-plugins-scheduler`, - - `queueName` is set to `default-queue`, - - `slackQueueName` is set to `slack-cluster-queue` -- pod priorities, resource requests and limits have been adjusted. - -{{ if (eq .VERSION "RHOAI-v2.13") -}} -To work around https://issues.redhat.com/browse/RHOAIENG-7887 (a race condition -in Red Hat OpenShift AI installation), do a rolling restart of the Kueue manager. 
-```sh -{{ .KUBECTL }} rollout restart deployment/kueue-controller-manager -n redhat-ods-applications -``` - -After doing the restart, verify that you see the following lines in the -kueue-controller-manager's log: -```sh -{"level":"info","ts":"2024-06-25T20:17:25.689638786Z","logger":"controller-runtime.builder","caller":"builder/webhook.go:189","msg":"Registering a validating webhook","GVK":"kubeflow.org/v1, Kind=PyTorchJob","path":"/validate-kubeflow-org-v1-pytorchjob"} -{"level":"info","ts":"2024-06-25T20:17:25.689698615Z","logger":"controller-runtime.webhook","caller":"webhook/server.go:183","msg":"Registering webhook","path":"/validate-kubeflow-org-v1-pytorchjob"} -{"level":"info","ts":"2024-06-25T20:17:25.689743757Z","logger":"setup","caller":"jobframework/setup.go:81","msg":"Set up controller and webhook for job framework","jobFrameworkName":"kubeflow.org/pytorchjob"} - -``` -{{- end }} -{{- else -}} -## Install Operators - -Create the mlbatch-system namespace -```sh -{{ .KUBECTL }} create namespace mlbatch-system -``` - -Install the Kubeflow Training Operator - -If you are using Coscheduling do: -```sh -{{ .KUBECTL }} apply --server-side -k setup.{{ .VERSION }}/training-operator/coscheduling -``` -If you are using Sakkara do: -```sh -{{ .KUBECTL }} apply --server-side -k setup.{{ .VERSION }}/training-operator/sakkara -``` - -Install the KubeRay Operator -```sh -{{ .KUBECTL }} apply --server-side -k setup.{{ .VERSION }}/kuberay -``` - -Install Kueue -```sh -{{ .KUBECTL }} apply --server-side -k setup.{{ .VERSION }}/kueue -``` - -Install the AppWrapper Operator -If you are using Coscheduling do: -```sh -{{ .KUBECTL }} apply --server-side -k setup.{{ .VERSION }}/appwrapper/coscheduling -``` -If you are using Sakkara do: -```sh -{{ .KUBECTL }} apply --server-side -k setup.{{ .VERSION }}/appwrapper/sakkara -``` - -The provided configuration differs from the default configuration of the -operators as follows: -- Kubeflow Training Operator: - - `gang-scheduler-name` is set to either `scheduler-plugins-scheduler` or `sakkara-scheduler`, -- Kueue: - - `batch/job` integration is disabled, - - `manageJobsWithoutQueueName` is enabled and configured via `managedJobsNamespaceSelector` to be - scoped to only namespaces that are labeled with `mlbatch-team-namespace=true`. - - `waitForPodsReady` is disabled, - - `LendingLimit` feature gate is enabled, -{{- if .FAIRSHARE }} - - `fairSharing` is enabled, -{{- end }} - - `enableClusterQueueResources` metrics is enabled, -- AppWrapper operator: - - `userRBACAdmissionCheck` is disabled, - - `schedulerName` is set to `scheduler-plugins-scheduler` or `sakkara-scheduler`, - - `queueName` is set to `default-queue`, -- pod priorities, resource requests and limits have been adjusted. - -{{- end }} - -## Autopilot - -Helm charts values and how-to for customization can be found [in the official documentation](https://github.com/IBM/autopilot/blob/main/helm-charts/autopilot/README.md). As-is, Autopilot will run on GPU nodes. - -- Add the Autopilot Helm repository - -```bash -helm repo add autopilot https://ibm.github.io/autopilot/ -helm repo update -``` - -- Install the chart (idempotent command). The config file is for customizing the helm values and it is optional. 
- -```bash -helm upgrade autopilot autopilot/autopilot --install --namespace=autopilot --create-namespace -f your-config.yml -``` - -### Enabling Prometheus metrics - -{{ if .RHOAI -}} -After completing the installation, manually label the namespace to enable metrics to be scraped by Prometheus with the following command: - -```bash -{{ .KUBECTL }} label ns autopilot openshift.io/cluster-monitoring=true -``` - -The `ServiceMonitor` labeling is not required. -{{- else -}} -The `ServiceMonitor` object is the one that enables Prometheus to scrape the metrics produced by Autopilot. -In order for Prometheus to find the right objects, the `ServiceMonitor` needs to be annotated with the Prometheus' release name. It is usually `prometheus`, and that's the default added in the Autopilot release. -If that is not the case in your cluster, the correct release label can be found by checking in the `ServiceMonitor` of Prometheus itself, or the name of Prometheus helm chart. -Then, Autopilot's `ServiceMonitor` can be labeled with the following command - -```bash -{{ .KUBECTL }} label servicemonitors.monitoring.coreos.com -n autopilot autopilot-metrics-monitor release= --overwrite -``` -{{- end }} - -## Kueue Configuration - -Create Kueue's default flavor: -```sh -{{ .KUBECTL }} apply -f setup.{{ .VERSION }}/default-flavor.yaml -``` - -## Cluster Role - -Create `mlbatch-edit` role: -```sh -{{ .KUBECTL }} apply -f setup.{{ .VERSION }}/mlbatch-edit-role.yaml -``` - -## Slack Cluster Queue - -Create the designated slack `ClusterQueue` which will be used to automate -minor adjustments to cluster capacity caused by node failures and -scheduler maintanence. -```sh -{{ .KUBECTL }} apply -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: slack-cluster-queue -spec: - namespaceSelector: {} - cohort: default-cohort - preemption: - withinClusterQueue: LowerOrNewerEqualPriority - reclaimWithinCohort: Any - borrowWithinCohort: - policy: Never - resourceGroups: - - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"] - flavors: - - name: default-flavor - resources: - - name: "cpu" - nominalQuota: 8000m - - name: "memory" - nominalQuota: 128Gi - - name: "nvidia.com/gpu" - nominalQuota: 8 - - name: "nvidia.com/roce_gdr" - nominalQuota: 1 - - name: "pods" - nominalQuota: 100 -EOF -``` -Edit the above quantities to adjust the quota to the desired -values. Pod counts are optional and can be omitted from the list of -covered resources. The `lendingLimit` for each resource will be -dynamically adjusted by the MLBatch system to reflect reduced cluster -capacity. See [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md) for a -detailed discussion of the role of the slack `ClusterQueue`. diff --git a/setup.tmpl/Kubernetes.yaml b/setup.tmpl/Kubernetes.yaml deleted file mode 100644 index d498bdb..0000000 --- a/setup.tmpl/Kubernetes.yaml +++ /dev/null @@ -1,6 +0,0 @@ -# Values for Kubernetes v1.29+ - -RHOAI: false -VERSION: k8s -KUBECTL: kubectl -FAIRSHARE: true diff --git a/setup.tmpl/Makefile b/setup.tmpl/Makefile deleted file mode 100644 index a7fe221..0000000 --- a/setup.tmpl/Makefile +++ /dev/null @@ -1,36 +0,0 @@ -.PHONY: all -all: docs - -##@ General - -# The help target prints out all targets with their descriptions organized -# beneath their categories. The categories are represented by '##@' and the -# target descriptions by '##'. 
The awk command is responsible for reading the -# entire set of makefiles included in this invocation, looking for lines of the -# file as xyz: ## something, and then pretty-format the target and help. Then, -# if there's a line with ##@ something, that gets pretty-printed as a category. -# More info on the usage of ANSI control characters for terminal formatting: -# https://en.wikipedia.org/wiki/ANSI_escape_code#SGR_parameters -# More info on the awk command: -# http://linuxcommand.org/lc3_adv_awk.php - -.PHONY: help -help: ## Display this help. - @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) - - -##@ Generate Documentation -docs: gotmpl - ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.RHOAI-v2.16/CLUSTER-SETUP.md -values RHOAI-v2.16.yaml - ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.RHOAI-v2.16/TEAM-SETUP.md -values RHOAI-v2.16.yaml - ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.RHOAI-v2.19/CLUSTER-SETUP.md -values RHOAI-v2.19.yaml - ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.RHOAI-v2.19/TEAM-SETUP.md -values RHOAI-v2.19.yaml - ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.k8s/CLUSTER-SETUP.md -values Kubernetes.yaml - ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.k8s/TEAM-SETUP.md -values Kubernetes.yaml - - -##@ Dependencies - -.PHONY: gotmpl -gotmpl: ## Build gotmpl tool - cd ../tools/gotmpl && go build ./... diff --git a/setup.tmpl/RHOAI-v2.16.yaml b/setup.tmpl/RHOAI-v2.16.yaml deleted file mode 100644 index 17cff67..0000000 --- a/setup.tmpl/RHOAI-v2.16.yaml +++ /dev/null @@ -1,6 +0,0 @@ -# Values for RHOAI 2.16 - -RHOAI: true -VERSION: RHOAI-v2.16 -KUBECTL: oc -FAIRSHARE: true diff --git a/setup.tmpl/RHOAI-v2.17.yaml b/setup.tmpl/RHOAI-v2.17.yaml deleted file mode 100644 index c243c3c..0000000 --- a/setup.tmpl/RHOAI-v2.17.yaml +++ /dev/null @@ -1,6 +0,0 @@ -# Values for RHOAI 2.17 - -RHOAI: true -VERSION: RHOAI-v2.17 -KUBECTL: oc -FAIRSHARE: true diff --git a/setup.tmpl/RHOAI-v2.19.yaml b/setup.tmpl/RHOAI-v2.19.yaml deleted file mode 100644 index 0b54073..0000000 --- a/setup.tmpl/RHOAI-v2.19.yaml +++ /dev/null @@ -1,6 +0,0 @@ -# Values for RHOAI 2.19 - -RHOAI: true -VERSION: RHOAI-v2.19 -KUBECTL: oc -FAIRSHARE: true diff --git a/setup.tmpl/TEAM-SETUP.md.tmpl b/setup.tmpl/TEAM-SETUP.md.tmpl deleted file mode 100644 index 59476b0..0000000 --- a/setup.tmpl/TEAM-SETUP.md.tmpl +++ /dev/null @@ -1,126 +0,0 @@ -# Team Setup - -A *team* in MLBatch is a group of users that share a resource quota. - -Before setting up your teams and quotas, please read [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md) -for a discussion of our recommended best practices. - -{{ if .RHOAI }} -Setting up a new team requires the cluster admin to create a project, -a user group, a quota, a queue, and the required role bindings as described below. 
- -Create project: -```sh -{{ .KUBECTL }} new-project team1 -``` -Create user group: -```sh -{{ .KUBECTL }} adm groups new team1-edit-group -``` -Add users to group for example: -```sh -{{ .KUBECTL }} adm groups add-users team1-edit-group user1 -``` -Bind cluster role to group in namespace: -```sh -{{ .KUBECTL }} adm policy add-role-to-group mlbatch-edit team1-edit-group --role-namespace="" --namespace team1 -``` -{{- else -}} -Setting up a new team requires the cluster admin to create a namespace, -a quota, a queue, and the required role bindings as described below. - -{{ if .RHOAI }} -Create the namespace: -```sh -{{ .KUBECTL }} create namespace team1 -``` -{{- else -}} -Create and label the namespace: -```sh -{{ .KUBECTL }} create namespace team1 -{{ .KUBECTL }} label namespace team1 'mlbatch-team-namespace=true' -``` -{{- end }} - -For each user on the team, create a RoleBinding: -```sh -{{ .KUBECTL }} -n team1 apply -f- << EOF -kind: RoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: user-one -subjects: - - kind: User - apiGroup: rbac.authorization.k8s.io - name: user-one -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: mlbatch-edit -EOF -``` -{{- end }} - -Specify the intended quota for the namespace by creating a `ClusterQueue`: -```sh -{{ .KUBECTL }} apply -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: team1-cluster-queue -spec: - namespaceSelector: {} - cohort: default-cohort - preemption: - withinClusterQueue: LowerOrNewerEqualPriority - reclaimWithinCohort: Any - borrowWithinCohort: - policy: Never - resourceGroups: - - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"] - flavors: - - name: default-flavor - resources: - - name: "cpu" - nominalQuota: 8000m - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "memory" - nominalQuota: 128Gi - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "nvidia.com/gpu" - nominalQuota: 16 - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "nvidia.com/roce_gdr" - nominalQuota: 4 - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "pods" - nominalQuota: 100 - # borrowingLimit: 0 - # lendingLimit: 0 -EOF -``` -Edit the above quantities to adjust the quota to the desired values. Pod counts -are optional and can be omitted from the list of covered resources. - -Uncomment all `borrowingLimit` lines to prevent this namespace from borrowing -quota from other namespaces. Uncomment all `lendingLimit` lines to prevent other -namespaces from borrowing quota from this namespace. - -Create a `LocalQueue` to bind the `ClusterQueue` to the namespace: -```sh -{{ .KUBECTL }} apply -n team1 -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: LocalQueue -metadata: - name: default-queue -spec: - clusterQueue: team1-cluster-queue -EOF -``` -We recommend naming the local queue `default-queue` as `AppWrappers` will -default to this queue name. - diff --git a/tools/appwrapper-packager/README.md b/tools/appwrapper-packager/README.md deleted file mode 100644 index 02fce9e..0000000 --- a/tools/appwrapper-packager/README.md +++ /dev/null @@ -1,26 +0,0 @@ -# AppWrapper Packager - -The Python script in this directory takes as input a YAML file -containing one or more Kubernetes resources and generates -an output YAML file with an AppWrapper containing the input -resources. 
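The wrapping is purely structural: each `---`-separated document in the input becomes one entry under `spec.components`, and the generated AppWrapper is labeled for the `default-queue` local queue. A hedged end-to-end sketch (file and resource names are illustrative):
```sh
# Create a one-resource input file, wrap it, and note what ends up in the output.
cat > input.yaml << EOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: demo-config
data:
  greeting: hello
EOF
./awpack.py -i input.yaml -o aw.yaml -n demo-appwrapper
# aw.yaml now contains an AppWrapper named demo-appwrapper, labeled
# kueue.x-k8s.io/queue-name: default-queue, with the ConfigMap nested
# under spec.components[0].template.
```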
- -Example invocation: -```sh -./awpack.py -i input.yaml -o aw.yaml -n my-appwrapper -``` - -Usage information: -```sh -usage: awpack.py [-h] -i INPUT [-n NAME] [-o OUTPUT] - -Wrap Resources in an AppWrapper - -options: - -h, --help show this help message and exit - -i INPUT, --input INPUT - input YAML file - -n NAME, --name NAME name of AppWrapper - -o OUTPUT, --output OUTPUT - output file -``` diff --git a/tools/appwrapper-packager/awpack.py b/tools/appwrapper-packager/awpack.py deleted file mode 100755 index 0aaea4d..0000000 --- a/tools/appwrapper-packager/awpack.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python - -import os -import string -import argparse -from pathlib import Path - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Wrap Resources in an AppWrapper" - ) - parser.add_argument("-i", "--input", type=str, help="input YAML file", required=True) - parser.add_argument("-n", "--name", type=str, help="name of AppWrapper", default="sample-appwrapper") - parser.add_argument("-o", "--output", type=str, help="output file", default="aw.yaml") - args = parser.parse_args() - - new_object = True - - with open(args.output, mode="w") as output_file: - with open(args.input) as input_file: - output_file.write("apiVersion: workload.codeflare.dev/v1beta2\n") - output_file.write("kind: AppWrapper\n") - output_file.write("metadata:\n") - output_file.write(f" name: {args.name}\n") - output_file.write(" labels:\n") - output_file.write(" kueue.x-k8s.io/queue-name: default-queue\n") - output_file.write("spec:\n") - output_file.write(" components:\n") - while True: - line = input_file.readline() - if not line: - break - if line.startswith("---"): - new_object = True - continue - if line == "\n": - continue - if new_object: - output_file.write(" - template:\n") - new_object = False - output_file.write(" "+line) diff --git a/tools/cluster-checker/README.md b/tools/cluster-checker/README.md deleted file mode 100644 index 4527fd1..0000000 --- a/tools/cluster-checker/README.md +++ /dev/null @@ -1,37 +0,0 @@ -# Cluster Checker - -The tool in this directory produces a summary view on GPU quotas and utilization -on the cluster. It also diagnoses the state of a cluster looking for common -issues. - -The tool is implemented in JavaScript and intended to run with Node.js. - -Install [Node.js](https://nodejs.org/) with the npm package manager. 
- -Install dependencies with: -```sh -npm install -``` - -Run the tool against the current Kubernetes context with: -```sh -node checker.js -``` -``` -CLUSTER QUEUE GPU QUOTA GPU USAGE ADMITTED WORKLOADS PENDING WORKLOADS -team1-cluster-queue 8 16 1 0 -team2-cluster-queue 8 4 4 0 - -Total GPU count in cluster: 24 -Unschedulable GPU count: - 0 -Schedulable GPU count: = 24 - -Nominal GPU quota: 16 -Slack GPU quota: + 8 -Total GPU quota: = 24 - -GPU usage by admitted workloads: 20 -Borrowed GPU count: 8 - -WARNING: workload "default/pytorchjob-job-e6381" refers to a non-existent local queue "test-queue" -``` diff --git a/tools/cluster-checker/checker.js b/tools/cluster-checker/checker.js deleted file mode 100644 index 85ac6cf..0000000 --- a/tools/cluster-checker/checker.js +++ /dev/null @@ -1,442 +0,0 @@ -'use strict' - -const k8s = require('@kubernetes/client-node') -const k8srp = require('kubernetes-resource-parser') - -const nodeResources = { - 'nvidia.com/gpu': 8, - 'nvidia.com/roce_gdr': 2, - cpu: 80, - memory: '1100Gi' -} - -class Client { - constructor () { - const config = new k8s.KubeConfig() - config.loadFromDefault() - config.getCurrentCluster().skipTLSVerify = true - this.core = config.makeApiClient(k8s.CoreV1Api) - this.custom = config.makeApiClient(k8s.CustomObjectsApi) - } - - async nodes () { - const res = await this.core.listNode() - return res.body.items - } - - async namespaces () { - const res = await this.core.listNamespace() - return res.body.items - } - - async pods (namespace) { - const res = await this.core.listNamespacedPod(namespace) - return res.body.items - } - - async readConfigMap (name, namespace) { - const res = await this.core.readNamespacedConfigMap(name, namespace) - return res.body - } - - async readOperatorConfig () { - const options = [ - { ns: 'redhat-ods-applications', cm: 'codeflare-operator-config', key: 'config.yaml', f: cm => cm.appwrapper?.Config }, - { ns: 'mlbatch-system', cm: 'appwrapper-operator-config', key: 'config.yaml', f: cm => cm.appwrapper }, - { ns: 'appwrapper-system', cm: 'appwrapper-operator-config', key: 'config.yaml', f: cm => cm.appwrapper } - ] - for (const opt of options) { - try { - const configMap = await this.readConfigMap(opt.cm, opt.ns) - const cm = k8s.loadYaml(configMap.data[opt.key]) - return opt.f(cm) - } catch (error) { - } - } - console.log('WARNING: Failed to read operator config') - return {} - } - - async clusterQueues () { - const res = await this.custom.listClusterCustomObject( - 'kueue.x-k8s.io', - 'v1beta1', - 'clusterqueues') - return res.body.items - } - - async localQueues (namespace) { - const res = await this.custom.listNamespacedCustomObject( - 'kueue.x-k8s.io', - 'v1beta1', - namespace, - 'localqueues') - return res.body.items - } - - async workloads (namespace) { - const res = await this.custom.listNamespacedCustomObject( - 'kueue.x-k8s.io', - 'v1beta1', - namespace, - 'workloads') - return res.body.items - } -} - -// pad value with spaces to the left -function pad (v, n) { - return String(v ?? '').padStart(n) -} - -// format and print table -function printTable (table, kind, ...columns) { - const widths = { name: kind.length } // column widths - const names = Object.keys(table).sort() // object names - - // compute column widths - for (const name of names) { - widths.name = Math.max(widths.name, name.length) - for (const column of columns) { - widths[column[1]] = Math.max(widths[column[1]] ?? column[0].length, String(table[name][column[0]] ?? 
'').length) - } - } - - // print table header - let header = kind.toUpperCase().padEnd(widths.name, ' ') - for (const column of columns) { - header += ' ' + pad(column[0].toUpperCase(), widths[column[1]]) - } - console.log(header) - - // print table rows - for (const name of names) { - let row = name.padEnd(widths.name, ' ') - for (const column of columns) { - row += ' ' + pad(table[name][column[1]], widths[column[1]]) - } - console.log(row) - } -} - -// return the number of GPUs reserved by the pod -function reservation (pod) { - if (pod.status?.phase === 'Succeeded' || pod.status?.phase === 'Failed') { - return 0 // pod has already completed - } - let scheduled = false - for (const condition of pod.status?.conditions ?? []) { - if (condition.type === 'PodScheduled') { - if (condition.status === 'True') { - scheduled = true - } - break // PodScheduled condition may only appear once - } - } - if (!scheduled) { - return 0 // pod has not been scheduled yet - } - let gpus = 0 - // compute sum of container gpu limits - for (const container of pod.spec.containers) { - gpus += parseInt(container.resources?.limits?.['nvidia.com/gpu'] ?? '0') - } - // compute max with init container gpu limits - for (const container of pod.spec.initContainers ?? []) { - gpus = Math.max(gpus, parseInt(container.resources?.limits?.['nvidia.com/gpu'] ?? '0')) - } - return gpus -} - -// check container resource requests against node_resources -function checkContainerResources (namespace, workload, workloadReplicas, container) { - // selectively merge limits into requests - const resources = {} - for (const k in container.resources?.requests ?? []) { - resources[k] = container.resources.requests[k] - } - for (const k in container.resources?.limits ?? []) { - if (!(k in resources)) { - resources[k] = container.resources.limits[k] - } - } - - const gpus = parseInt(resources['nvidia.com/gpu'] ?? '0') - const gdr = parseInt(resources['nvidia.com/roce_gdr'] ?? '0') - const cpus = k8srp.cpuParser(resources.cpu ?? '0') - const mem = k8srp.memoryParser(resources.memory ?? 
'0') - - // warn if the resource requests cannot be satisfied by a Node - if (gpus > nodeResources['nvidia.com/gpu']) { - console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting "${gpus} GPUs"`) - } - if (gdr > nodeResources.gdrPerNode) { - console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${gdr} roce_gdr interfaces"`) - } - if (cpus > nodeResources.cpu) { - console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting "${cpus} CPUs"`) - } - if (mem > k8srp.memoryParser(nodeResources.memory)) { - console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${resources.memory} memory`) - } - - // warn if the resource:GPU ratio is not proportional to Node resources - if (gdr > 0 && ((gpus === 0) || (gpus / gdr < nodeResources['nvidia.com/gpu'] / nodeResources['nvidia.com/roce_gdr']))) { - console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${gdr} roce_gdr but only ${gpus} GPUs`) - } - if (gpus > 0 && (cpus > 0) && (cpus / gpus > nodeResources.cpu / nodeResources['nvidia.com/gpu'])) { - console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${cpus} cpus but only ${gpus} GPUs`) - } - if (gpus > 0 && (mem > 0) && (mem / gpus > k8srp.memoryParser(nodeResources.memory) / nodeResources['nvidia.com/gpu'])) { - console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${resources.memory} memory but only ${gpus} GPUs`) - } - - // warn if other resource constraints are violated - if (gdr > 0 && workloadReplicas < 2) { - console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" is a single pod workload that is requesting ${gdr} roce_gdr`) - } -} - -// check user namespace -async function checkUserNamespace (client, namespace, queues) { - const workloads = await client.workloads(namespace.metadata.name) - - for (const workload of workloads) { - // report invalid queue names - const queueName = workload.spec.queueName - if (queueName) { - if (!queues.find(queue => queue.metadata.name === queueName)) { - console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" refers to a non-existent local queue "${queueName}"`) - } - } else { - console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" is missing a local queue name`) - } - - // report high-priority workloads - if (workload.spec.priorityClassName !== 'default-priority' && workload.spec.priorityClassName !== 'low-priority') { - console.log(`NOTE: workload "${namespace.metadata.name}/${workload.metadata.name}" has priority "${workload.spec.priorityClassName}"`) - } - - // report unusual conditions - const conditions = {} - for (const condition of workload.status?.conditions ?? 
[]) { - conditions[condition.type] = condition.status - } - if (conditions.Finished !== 'True') { - if (conditions.Admitted === 'True' && conditions.PodsReady === 'False') { - console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has conditions Admitted=True and PodsReady=False`) - } - if (conditions.Evicted === 'True') { - console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has condition Evicted=True`) - } - } - - // report misconfigured resource requests - let replicas = 0 - for (const podSet of workload.spec?.podSets) { - replicas += podSet.count ?? 0 - } - for (const podSet of workload.spec?.podSets) { - for (const ic of podSet.template?.spec?.initContainers ?? []) { - checkContainerResources(namespace, workload, replicas, ic) - } - for (const c of podSet.template?.spec?.containers ?? []) { - checkContainerResources(namespace, workload, replicas, c) - } - } - } - - // report GPU pods using default scheduler - const pods = await client.pods(namespace.metadata.name) - for (const pod of pods) { - if (pod.spec.schedulerName === 'default-scheduler' && reservation(pod) > 0) { - console.log(`WARNING: pod "${namespace.metadata.name}/${pod.metadata.name}" is using default-scheduler`) - } - } -} - -// check system namespace -async function checkSystemNamespace (client, namespace, nodes) { - const pods = await client.pods(namespace.metadata.name) - - for (const pod of pods) { - // report GPU occupancy - const gpus = reservation(pod) - if (gpus) { - const node = nodes.find(node => node.metadata.name === pod.spec.nodeName) - console.log(`WARNING: pod "${namespace.metadata.name}/${pod.metadata.name}" occupies ${gpus} GPU(s)` + - `on node "${pod.spec.nodeName}" with GPU taints noExecute=${node?.noExecute} and noSchedule=${node?.noSchedule}`) - } - } -} - -async function main () { - try { - // initialize kubernetes client - const client = new Client() - - let clusterGPUs = 0 // cluster capacity - let noScheduleGPUs = 0 // no-schedule GPUs - let noExecuteGPUs = 0 // no-execute GPUs - let usedGPUs = 0 // GPU usage by admitted workloads - let borrowedGPUs = 0 // GPU borrowed from the cohort - let quotaGPUs = 0 // nominal GPU quota (excluding slack queue) - let limitGPUs = 0 // lending limit on slack queue - let slackGPUs = 0 // nominal GPU quota on slack queue - - const config = await client.readOperatorConfig() - const taints = config.autopilot?.resourceTaints?.['nvidia.com/gpu'] - const slackQueueName = config.slackQueueName - - let newline = false - - // compute GPU counts - const nodes = await client.nodes() - for (const node of nodes) { - const gpus = parseInt(node.status.capacity['nvidia.com/gpu'] ?? '0') - if (gpus > 0) { - clusterGPUs += gpus - node.noSchedule = false - node.noExecute = false - for (const taint of taints ?? []) { - if (node.metadata.labels?.[taint.key] === taint.value) { - if (taint.effect === 'NoExecute') { - console.log(`WARNING: node "${node.metadata.name}" has label "${taint.key}"="${taint.value}" with effect "${taint.effect}"`) - newline = true - node.noExecute = true - } else if (taint.effect === 'NoSchedule') { - console.log(`WARNING: node "${node.metadata.name}" has label "${taint.key}"="${taint.value}" with effect "${taint.effect}"`) - newline = true - node.noSchedule = true - } - } - } - for (const taint of node.spec.taints ?? 
[]) { - if (taint.effect === 'NoExecute') { - console.log(`WARNING: node "${node.metadata.name}" has taint "${taint.key}" with effect "${taint.effect}"`) - newline = true - node.noExecute = true - } else if (taint.effect === 'NoSchedule') { - console.log(`WARNING: node "${node.metadata.name}" has taint "${taint.key}" with effect "${taint.effect}"`) - newline = true - node.noSchedule = true - } - } - if (node.noExecute) { - noExecuteGPUs += gpus - } else if (node.noSchedule) { // no double counting - noScheduleGPUs += gpus - } - } - } - - if (newline) { - console.log() - } - - // collect cluster queue metrics - const clusterQueues = await client.clusterQueues() - const queues = {} - for (const clusterQueue of clusterQueues) { - const queue = { - quota: 0, - usage: 0, - borrowed: 0, - lendingLimit: 0, - admitted: clusterQueue.status?.admittedWorkloads ?? 0, - pending: clusterQueue.status?.pendingWorkloads ?? 0 - } - for (const resourceGroup of clusterQueue.spec.resourceGroups) { - if (resourceGroup.coveredResources.includes('nvidia.com/gpu')) { - for (const flavor of resourceGroup.flavors) { - for (const resource of flavor.resources) { - if (resource.name === 'nvidia.com/gpu') { - queue.quota += parseInt(resource.nominalQuota ?? '0') - // lending limit is nominal quota if not set - queue.lendingLimit += parseInt(resource.lendingLimit ?? resource.nominalQuota ?? '0') - break // resource may only occur once in flavor - } - } - } - break // resource may only belong to one resource group - } - } - for (const flavor of clusterQueue.status?.flavorsUsage ?? []) { - for (const resource of flavor.resources) { - if (resource.name === 'nvidia.com/gpu') { - queue.usage += parseInt(resource.total ?? '0') - queue.borrowed += parseInt(resource.borrowed ?? '0') - break // resource may only occur once in flavor - } - } - } - usedGPUs += queue.usage - borrowedGPUs += queue.borrowed - if (clusterQueue.metadata.name === slackQueueName) { - slackGPUs = queue.quota - limitGPUs = queue.lendingLimit - // do not include slack queue in table - } else { - quotaGPUs += queue.quota - queues[clusterQueue.metadata.name] = queue - } - } - - // print cluster queue table - printTable(queues, 'cluster queue', ['gpu quota', 'quota'], ['gpu usage', 'usage'], - ['admitted workloads', 'admitted'], ['pending workloads', 'pending']) - console.log() - - // print summary results - const width = Math.max(String(clusterGPUs).length, String(quotaGPUs).length) - console.log(`Total GPU count in cluster: ${pad(clusterGPUs, width)}`) - console.log(`Unschedulable GPU count: - ${pad(noExecuteGPUs + noScheduleGPUs, width)}`) - console.log(`Schedulable GPU count: = ${pad(clusterGPUs - noExecuteGPUs - noScheduleGPUs, width)}`) - console.log() - console.log(`Nominal GPU quota: ${pad(quotaGPUs, width)}`) - console.log(`Maximum slack GPU quota: + ${pad(slackGPUs, width)}`) - console.log(`Slack GPU quota adjustment: - ${pad(slackGPUs - limitGPUs, width)}`) - console.log(`Current GPU quota: = ${pad(quotaGPUs + limitGPUs, width)}`) - console.log() - console.log(`GPU usage by admitted workloads: ${pad(usedGPUs, width)}`) - console.log(`Borrowed GPU count: ${pad(borrowedGPUs, width)}`) - console.log() - - if (quotaGPUs > clusterGPUs - noExecuteGPUs - noScheduleGPUs) { - console.log('WARNING: nominal GPU quota is greater than schedulable GPU count') - } - - if (quotaGPUs + slackGPUs < clusterGPUs) { - console.log('WARNING: maximum GPU quota is lower than total GPU count') - } - - if (quotaGPUs + slackGPUs > clusterGPUs) { - console.log('WARNING: maximum 
GPU quota is greater than total GPU count') - } - - // check all accessible namespaces - const namespaces = await client.namespaces() - for (const namespace of namespaces) { - if (namespace.metadata.name.startsWith('openshift-')) { - continue // skip openshift namespaces - } - - let localQueues - try { - localQueues = await client.localQueues(namespace.metadata.name) - } catch (err) { - continue // skip inaccessible namespaces - } - - if (localQueues.length === 0) { - await checkSystemNamespace(client, namespace, nodes) - } else { - await checkUserNamespace(client, namespace, localQueues) - } - } - } catch (err) { - console.error(err) - } -} - -main() diff --git a/tools/cluster-checker/package-lock.json b/tools/cluster-checker/package-lock.json deleted file mode 100644 index 84aa97a..0000000 --- a/tools/cluster-checker/package-lock.json +++ /dev/null @@ -1,4502 +0,0 @@ -{ - "name": "cluster-checker", - "lockfileVersion": 3, - "requires": true, - "packages": { - "": { - "dependencies": { - "@kubernetes/client-node": "^0.21.0", - "kubernetes-resource-parser": "0.1.0" - }, - "devDependencies": { - "standard": "^17.1.2" - } - }, - "node_modules/@eslint-community/eslint-utils": { - "version": "4.4.0", - "resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.4.0.tgz", - "integrity": "sha512-1/sA4dwrzBAyeUoQ6oxahHKmrZvsnLCg4RfxW3ZFGGmQkSNQPFNLV9CUEFQP1x9EYXHTo5p6xdhZM1Ne9p/AfA==", - "dev": true, - "dependencies": { - "eslint-visitor-keys": "^3.3.0" - }, - "engines": { - "node": "^12.22.0 || ^14.17.0 || >=16.0.0" - }, - "peerDependencies": { - "eslint": "^6.0.0 || ^7.0.0 || >=8.0.0" - } - }, - "node_modules/@eslint-community/regexpp": { - "version": "4.11.1", - "resolved": "https://registry.npmjs.org/@eslint-community/regexpp/-/regexpp-4.11.1.tgz", - "integrity": "sha512-m4DVN9ZqskZoLU5GlWZadwDnYo3vAEydiUayB9widCl9ffWx2IvPnp6n3on5rJmziJSw9Bv+Z3ChDVdMwXCY8Q==", - "dev": true, - "engines": { - "node": "^12.0.0 || ^14.0.0 || >=16.0.0" - } - }, - "node_modules/@eslint/eslintrc": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/@eslint/eslintrc/-/eslintrc-2.1.4.tgz", - "integrity": "sha512-269Z39MS6wVJtsoUl10L60WdkhJVdPG24Q4eZTH3nnF6lpvSShEK3wQjDX9JRWAUPvPh7COouPpU9IrqaZFvtQ==", - "dev": true, - "dependencies": { - "ajv": "^6.12.4", - "debug": "^4.3.2", - "espree": "^9.6.0", - "globals": "^13.19.0", - "ignore": "^5.2.0", - "import-fresh": "^3.2.1", - "js-yaml": "^4.1.0", - "minimatch": "^3.1.2", - "strip-json-comments": "^3.1.1" - }, - "engines": { - "node": "^12.22.0 || ^14.17.0 || >=16.0.0" - }, - "funding": { - "url": "https://opencollective.com/eslint" - } - }, - "node_modules/@eslint/eslintrc/node_modules/brace-expansion": { - "version": "1.1.11", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", - "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", - "dev": true, - "dependencies": { - "balanced-match": "^1.0.0", - "concat-map": "0.0.1" - } - }, - "node_modules/@eslint/eslintrc/node_modules/minimatch": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", - "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", - "dev": true, - "dependencies": { - "brace-expansion": "^1.1.7" - }, - "engines": { - "node": "*" - } - }, - "node_modules/@eslint/js": { - "version": "8.57.1", - "resolved": "https://registry.npmjs.org/@eslint/js/-/js-8.57.1.tgz", - 
"integrity": "sha512-d9zaMRSTIKDLhctzH12MtXvJKSSUhaHcjV+2Z+GK+EEY7XKpP5yR4x+N3TAcHTcu963nIr+TMcCb4DBCYX1z6Q==", - "dev": true, - "engines": { - "node": "^12.22.0 || ^14.17.0 || >=16.0.0" - } - }, - "node_modules/@humanwhocodes/config-array": { - "version": "0.13.0", - "resolved": "https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.13.0.tgz", - "integrity": "sha512-DZLEEqFWQFiyK6h5YIeynKx7JlvCYWL0cImfSRXZ9l4Sg2efkFGTuFf6vzXjK1cq6IYkU+Eg/JizXw+TD2vRNw==", - "deprecated": "Use @eslint/config-array instead", - "dev": true, - "dependencies": { - "@humanwhocodes/object-schema": "^2.0.3", - "debug": "^4.3.1", - "minimatch": "^3.0.5" - }, - "engines": { - "node": ">=10.10.0" - } - }, - "node_modules/@humanwhocodes/config-array/node_modules/brace-expansion": { - "version": "1.1.11", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", - "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", - "dev": true, - "dependencies": { - "balanced-match": "^1.0.0", - "concat-map": "0.0.1" - } - }, - "node_modules/@humanwhocodes/config-array/node_modules/minimatch": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", - "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", - "dev": true, - "dependencies": { - "brace-expansion": "^1.1.7" - }, - "engines": { - "node": "*" - } - }, - "node_modules/@humanwhocodes/module-importer": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/@humanwhocodes/module-importer/-/module-importer-1.0.1.tgz", - "integrity": "sha512-bxveV4V8v5Yb4ncFTT3rPSgZBOpCkjfK0y4oVVVJwIuDVBRMDXrPyXRL988i5ap9m9bnyEEjWfm5WkBmtffLfA==", - "dev": true, - "engines": { - "node": ">=12.22" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/nzakas" - } - }, - "node_modules/@humanwhocodes/object-schema": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/@humanwhocodes/object-schema/-/object-schema-2.0.3.tgz", - "integrity": "sha512-93zYdMES/c1D69yZiKDBj0V24vqNzB/koF26KPaagAfd3P/4gUlh3Dys5ogAK+Exi9QyzlD8x/08Zt7wIKcDcA==", - "deprecated": "Use @eslint/object-schema instead", - "dev": true - }, - "node_modules/@isaacs/cliui": { - "version": "8.0.2", - "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz", - "integrity": "sha512-O8jcjabXaleOG9DQ0+ARXWZBTfnP4WNAqzuiJK7ll44AmxGKv/J2M4TPjxjY3znBCfvBXFzucm1twdyFybFqEA==", - "dependencies": { - "string-width": "^5.1.2", - "string-width-cjs": "npm:string-width@^4.2.0", - "strip-ansi": "^7.0.1", - "strip-ansi-cjs": "npm:strip-ansi@^6.0.1", - "wrap-ansi": "^8.1.0", - "wrap-ansi-cjs": "npm:wrap-ansi@^7.0.0" - }, - "engines": { - "node": ">=12" - } - }, - "node_modules/@isaacs/fs-minipass": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/@isaacs/fs-minipass/-/fs-minipass-4.0.1.tgz", - "integrity": "sha512-wgm9Ehl2jpeqP3zw/7mo3kRHFp5MEDhqAdwy1fTGkHAwnkGOVsgpvQhL8B5n1qlb01jV3n/bI0ZfZp5lWA1k4w==", - "dependencies": { - "minipass": "^7.0.4" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@kubernetes/client-node": { - "version": "0.21.0", - "resolved": "https://registry.npmjs.org/@kubernetes/client-node/-/client-node-0.21.0.tgz", - "integrity": "sha512-yYRbgMeyQbvZDHt/ZqsW3m4lRefzhbbJEuj8sVXM+bufKrgmzriA2oq7lWPH/k/LQIicAME9ixPUadTrxIF6dQ==", - "dependencies": { - "@types/js-yaml": "^4.0.1", - "@types/node": "^20.1.1", - "@types/request": 
"^2.47.1", - "@types/ws": "^8.5.3", - "byline": "^5.0.0", - "isomorphic-ws": "^5.0.0", - "js-yaml": "^4.1.0", - "jsonpath-plus": "^8.0.0", - "request": "^2.88.0", - "rfc4648": "^1.3.0", - "stream-buffers": "^3.0.2", - "tar": "^7.0.0", - "tslib": "^2.4.1", - "ws": "^8.11.0" - }, - "optionalDependencies": { - "openid-client": "^5.3.0" - } - }, - "node_modules/@nodelib/fs.scandir": { - "version": "2.1.5", - "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", - "integrity": "sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==", - "dev": true, - "dependencies": { - "@nodelib/fs.stat": "2.0.5", - "run-parallel": "^1.1.9" - }, - "engines": { - "node": ">= 8" - } - }, - "node_modules/@nodelib/fs.stat": { - "version": "2.0.5", - "resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz", - "integrity": "sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==", - "dev": true, - "engines": { - "node": ">= 8" - } - }, - "node_modules/@nodelib/fs.walk": { - "version": "1.2.8", - "resolved": "https://registry.npmjs.org/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz", - "integrity": "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==", - "dev": true, - "dependencies": { - "@nodelib/fs.scandir": "2.1.5", - "fastq": "^1.6.0" - }, - "engines": { - "node": ">= 8" - } - }, - "node_modules/@pkgjs/parseargs": { - "version": "0.11.0", - "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz", - "integrity": "sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==", - "optional": true, - "engines": { - "node": ">=14" - } - }, - "node_modules/@rtsao/scc": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@rtsao/scc/-/scc-1.1.0.tgz", - "integrity": "sha512-zt6OdqaDoOnJ1ZYsCYGt9YmWzDXl4vQdKTyJev62gFhRGKdx7mcT54V9KIjg+d2wi9EXsPvAPKe7i7WjfVWB8g==", - "dev": true - }, - "node_modules/@types/caseless": { - "version": "0.12.5", - "resolved": "https://registry.npmjs.org/@types/caseless/-/caseless-0.12.5.tgz", - "integrity": "sha512-hWtVTC2q7hc7xZ/RLbxapMvDMgUnDvKvMOpKal4DrMyfGBUfB1oKaZlIRr6mJL+If3bAP6sV/QneGzF6tJjZDg==" - }, - "node_modules/@types/js-yaml": { - "version": "4.0.9", - "resolved": "https://registry.npmjs.org/@types/js-yaml/-/js-yaml-4.0.9.tgz", - "integrity": "sha512-k4MGaQl5TGo/iipqb2UDG2UwjXziSWkh0uysQelTlJpX1qGlpUZYm8PnO4DxG1qBomtJUdYJ6qR6xdIah10JLg==" - }, - "node_modules/@types/json5": { - "version": "0.0.29", - "resolved": "https://registry.npmjs.org/@types/json5/-/json5-0.0.29.tgz", - "integrity": "sha512-dRLjCWHYg4oaA77cxO64oO+7JwCwnIzkZPdrrC71jQmQtlhM556pwKo5bUzqvZndkVbeFLIIi+9TC40JNF5hNQ==", - "dev": true - }, - "node_modules/@types/node": { - "version": "20.16.2", - "resolved": "https://registry.npmjs.org/@types/node/-/node-20.16.2.tgz", - "integrity": "sha512-91s/n4qUPV/wg8eE9KHYW1kouTfDk2FPGjXbBMfRWP/2vg1rCXNQL1OCabwGs0XSdukuK+MwCDXE30QpSeMUhQ==", - "dependencies": { - "undici-types": "~6.19.2" - } - }, - "node_modules/@types/request": { - "version": "2.48.12", - "resolved": "https://registry.npmjs.org/@types/request/-/request-2.48.12.tgz", - "integrity": "sha512-G3sY+NpsA9jnwm0ixhAFQSJ3Q9JkpLZpJbI3GMv0mIAT0y3mRabYeINzal5WOChIiaTEGQYlHOKgkaM9EisWHw==", - "dependencies": { - "@types/caseless": "*", - "@types/node": "*", - "@types/tough-cookie": "*", - "form-data": "^2.5.0" - } - }, - "node_modules/@types/tough-cookie": { - "version": "4.0.5", - 
"resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.5.tgz", - "integrity": "sha512-/Ad8+nIOV7Rl++6f1BdKxFSMgmoqEoYbHRpPcx3JEfv8VRsQe9Z4mCXeJBzxs7mbHY/XOZZuXlRNfhpVPbs6ZA==" - }, - "node_modules/@types/ws": { - "version": "8.5.12", - "resolved": "https://registry.npmjs.org/@types/ws/-/ws-8.5.12.tgz", - "integrity": "sha512-3tPRkv1EtkDpzlgyKyI8pGsGZAGPEaXeu0DOj5DI25Ja91bdAYddYHbADRYVrZMRbfW+1l5YwXVDKohDJNQxkQ==", - "dependencies": { - "@types/node": "*" - } - }, - "node_modules/@ungap/structured-clone": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/@ungap/structured-clone/-/structured-clone-1.2.0.tgz", - "integrity": "sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==", - "dev": true - }, - "node_modules/acorn": { - "version": "8.12.1", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.12.1.tgz", - "integrity": "sha512-tcpGyI9zbizT9JbV6oYE477V6mTlXvvi0T0G3SNIYE2apm/G5huBa1+K89VGeovbg+jycCrfhl3ADxErOuO6Jg==", - "dev": true, - "bin": { - "acorn": "bin/acorn" - }, - "engines": { - "node": ">=0.4.0" - } - }, - "node_modules/acorn-jsx": { - "version": "5.3.2", - "resolved": "https://registry.npmjs.org/acorn-jsx/-/acorn-jsx-5.3.2.tgz", - "integrity": "sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ==", - "dev": true, - "peerDependencies": { - "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0" - } - }, - "node_modules/ajv": { - "version": "6.12.6", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", - "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==", - "dependencies": { - "fast-deep-equal": "^3.1.1", - "fast-json-stable-stringify": "^2.0.0", - "json-schema-traverse": "^0.4.1", - "uri-js": "^4.2.2" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/epoberezkin" - } - }, - "node_modules/ansi-regex": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.0.1.tgz", - "integrity": "sha512-n5M855fKb2SsfMIiFFoVrABHJC8QtHwVx+mHWP3QcEqBHYienj5dHSgjbxtC0WEZXYt4wcD6zrQElDPhFuZgfA==", - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/chalk/ansi-regex?sponsor=1" - } - }, - "node_modules/ansi-styles": { - "version": "6.2.1", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.1.tgz", - "integrity": "sha512-bN798gFfQX+viw3R7yrGWRqnrN2oRkEkUjjl4JNn4E8GxxbjtG3FbrEIIY3l8/hrwUwIeCZvi4QuOTP4MErVug==", - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/argparse": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", - "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==" - }, - "node_modules/array-buffer-byte-length": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/array-buffer-byte-length/-/array-buffer-byte-length-1.0.1.tgz", - "integrity": "sha512-ahC5W1xgou+KTXix4sAO8Ki12Q+jf4i0+tmk3sC+zgcynshkHxzpXdImBehiUYKKKDwvfFiJl1tZt6ewscS1Mg==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.5", - "is-array-buffer": "^3.0.4" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/array-includes": { - "version": "3.1.8", - "resolved": "https://registry.npmjs.org/array-includes/-/array-includes-3.1.8.tgz", - "integrity": 
"sha512-itaWrbYbqpGXkGhZPGUulwnhVf5Hpy1xiCFsGqyIGglbBxmG5vSjxQen3/WGOjPpNEv1RtBLKxbmVXm8HpJStQ==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "define-properties": "^1.2.1", - "es-abstract": "^1.23.2", - "es-object-atoms": "^1.0.0", - "get-intrinsic": "^1.2.4", - "is-string": "^1.0.7" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/array.prototype.findlast": { - "version": "1.2.5", - "resolved": "https://registry.npmjs.org/array.prototype.findlast/-/array.prototype.findlast-1.2.5.tgz", - "integrity": "sha512-CVvd6FHg1Z3POpBLxO6E6zr+rSKEQ9L6rZHAaY7lLfhKsWYUBBOuMs0e9o24oopj6H+geRCX0YJ+TJLBK2eHyQ==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "define-properties": "^1.2.1", - "es-abstract": "^1.23.2", - "es-errors": "^1.3.0", - "es-object-atoms": "^1.0.0", - "es-shim-unscopables": "^1.0.2" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/array.prototype.findlastindex": { - "version": "1.2.5", - "resolved": "https://registry.npmjs.org/array.prototype.findlastindex/-/array.prototype.findlastindex-1.2.5.tgz", - "integrity": "sha512-zfETvRFA8o7EiNn++N5f/kaCw221hrpGsDmcpndVupkPzEc1Wuf3VgC0qby1BbHs7f5DVYjgtEU2LLh5bqeGfQ==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "define-properties": "^1.2.1", - "es-abstract": "^1.23.2", - "es-errors": "^1.3.0", - "es-object-atoms": "^1.0.0", - "es-shim-unscopables": "^1.0.2" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/array.prototype.flat": { - "version": "1.3.2", - "resolved": "https://registry.npmjs.org/array.prototype.flat/-/array.prototype.flat-1.3.2.tgz", - "integrity": "sha512-djYB+Zx2vLewY8RWlNCUdHjDXs2XOgm602S9E7P/UpHgfeHL00cRiIF+IN/G/aUJ7kGPb6yO/ErDI5V2s8iycA==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.2", - "define-properties": "^1.2.0", - "es-abstract": "^1.22.1", - "es-shim-unscopables": "^1.0.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/array.prototype.flatmap": { - "version": "1.3.2", - "resolved": "https://registry.npmjs.org/array.prototype.flatmap/-/array.prototype.flatmap-1.3.2.tgz", - "integrity": "sha512-Ewyx0c9PmpcsByhSW4r+9zDU7sGjFc86qf/kKtuSCRdhfbk0SNLLkaT5qvcHnRGgc5NP/ly/y+qkXkqONX54CQ==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.2", - "define-properties": "^1.2.0", - "es-abstract": "^1.22.1", - "es-shim-unscopables": "^1.0.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/array.prototype.tosorted": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/array.prototype.tosorted/-/array.prototype.tosorted-1.1.4.tgz", - "integrity": "sha512-p6Fx8B7b7ZhL/gmUsAy0D15WhvDccw3mnGNbZpi3pmeJdxtWsj2jEaI4Y6oo3XiHfzuSgPwKc04MYt6KgvC/wA==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "define-properties": "^1.2.1", - "es-abstract": "^1.23.3", - "es-errors": "^1.3.0", - "es-shim-unscopables": "^1.0.2" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/arraybuffer.prototype.slice": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/arraybuffer.prototype.slice/-/arraybuffer.prototype.slice-1.0.3.tgz", - "integrity": 
"sha512-bMxMKAjg13EBSVscxTaYA4mRc5t1UAXa2kXiGTNfZ079HIWXEkKmkgFrh/nJqamaLSrXO5H4WFFkPEaLJWbs3A==", - "dev": true, - "dependencies": { - "array-buffer-byte-length": "^1.0.1", - "call-bind": "^1.0.5", - "define-properties": "^1.2.1", - "es-abstract": "^1.22.3", - "es-errors": "^1.2.1", - "get-intrinsic": "^1.2.3", - "is-array-buffer": "^3.0.4", - "is-shared-array-buffer": "^1.0.2" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/asn1": { - "version": "0.2.6", - "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.2.6.tgz", - "integrity": "sha512-ix/FxPn0MDjeyJ7i/yoHGFt/EX6LyNbxSEhPPXODPL+KB0VPk86UYfL0lMdy+KCnv+fmvIzySwaK5COwqVbWTQ==", - "dependencies": { - "safer-buffer": "~2.1.0" - } - }, - "node_modules/assert-plus": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz", - "integrity": "sha512-NfJ4UzBCcQGLDlQq7nHxH+tv3kyZ0hHQqF5BO6J7tNJeP5do1llPr8dZ8zHonfhAu0PHAdMkSo+8o0wxg9lZWw==", - "engines": { - "node": ">=0.8" - } - }, - "node_modules/asynckit": { - "version": "0.4.0", - "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", - "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==" - }, - "node_modules/available-typed-arrays": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/available-typed-arrays/-/available-typed-arrays-1.0.7.tgz", - "integrity": "sha512-wvUjBtSGN7+7SjNpq/9M2Tg350UZD3q62IFZLbRAR1bSMlCo1ZaeW+BJ+D090e4hIIZLBcTDWe4Mh4jvUDajzQ==", - "dev": true, - "dependencies": { - "possible-typed-array-names": "^1.0.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/aws-sign2": { - "version": "0.7.0", - "resolved": "https://registry.npmjs.org/aws-sign2/-/aws-sign2-0.7.0.tgz", - "integrity": "sha512-08kcGqnYf/YmjoRhfxyu+CLxBjUtHLXLXX/vUfx9l2LYzG3c1m61nrpyFUZI6zeS+Li/wWMMidD9KgrqtGq3mA==", - "engines": { - "node": "*" - } - }, - "node_modules/aws4": { - "version": "1.13.2", - "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.13.2.tgz", - "integrity": "sha512-lHe62zvbTB5eEABUVi/AwVh0ZKY9rMMDhmm+eeyuuUQbQ3+J+fONVQOZyj+DdrvD4BY33uYniyRJ4UJIaSKAfw==" - }, - "node_modules/balanced-match": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", - "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==" - }, - "node_modules/bcrypt-pbkdf": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/bcrypt-pbkdf/-/bcrypt-pbkdf-1.0.2.tgz", - "integrity": "sha512-qeFIXtP4MSoi6NLqO12WfqARWWuCKi2Rn/9hJLEmtB5yTNr9DqFWkJRCf2qShWzPeAMRnOgCrq0sg/KLv5ES9w==", - "dependencies": { - "tweetnacl": "^0.14.3" - } - }, - "node_modules/brace-expansion": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz", - "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==", - "dependencies": { - "balanced-match": "^1.0.0" - } - }, - "node_modules/builtins": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/builtins/-/builtins-5.1.0.tgz", - "integrity": "sha512-SW9lzGTLvWTP1AY8xeAMZimqDrIaSdLQUcVr9DMef51niJ022Ri87SwRRKYm4A6iHfkPaiVUu/Duw2Wc4J7kKg==", - "dev": true, - "dependencies": { - "semver": "^7.0.0" - } - }, - "node_modules/builtins/node_modules/semver": { - "version": "7.6.3", 
- "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz", - "integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==", - "dev": true, - "bin": { - "semver": "bin/semver.js" - }, - "engines": { - "node": ">=10" - } - }, - "node_modules/byline": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/byline/-/byline-5.0.0.tgz", - "integrity": "sha512-s6webAy+R4SR8XVuJWt2V2rGvhnrhxN+9S15GNuTK3wKPOXFF6RNc+8ug2XhH+2s4f+uudG4kUVYmYOQWL2g0Q==", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/call-bind": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.7.tgz", - "integrity": "sha512-GHTSNSYICQ7scH7sZ+M2rFopRoLh8t2bLSW6BbgrtLsahOIB5iyAVJf9GjWK3cYTDaMj4XdBpM1cA6pIS0Kv2w==", - "dev": true, - "dependencies": { - "es-define-property": "^1.0.0", - "es-errors": "^1.3.0", - "function-bind": "^1.1.2", - "get-intrinsic": "^1.2.4", - "set-function-length": "^1.2.1" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/callsites": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", - "integrity": "sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==", - "dev": true, - "engines": { - "node": ">=6" - } - }, - "node_modules/caseless": { - "version": "0.12.0", - "resolved": "https://registry.npmjs.org/caseless/-/caseless-0.12.0.tgz", - "integrity": "sha512-4tYFyifaFfGacoiObjJegolkwSU4xQNGbVgUiNYVUxbQ2x2lUsFvY4hVgVzGiIe6WLOPqycWXA40l+PWsxthUw==" - }, - "node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", - "dev": true, - "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" - } - }, - "node_modules/chalk/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/chownr": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/chownr/-/chownr-3.0.0.tgz", - "integrity": "sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==", - "engines": { - "node": ">=18" - } - }, - "node_modules/color-convert": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", - "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", - "dependencies": { - "color-name": "~1.1.4" - }, - "engines": { - "node": ">=7.0.0" - } - }, - "node_modules/color-name": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", - "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==" - }, - "node_modules/combined-stream": { - "version": "1.0.8", - "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", 
- "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", - "dependencies": { - "delayed-stream": "~1.0.0" - }, - "engines": { - "node": ">= 0.8" - } - }, - "node_modules/concat-map": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", - "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", - "dev": true - }, - "node_modules/core-util-is": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", - "integrity": "sha512-3lqz5YjWTYnW6dlDa5TLaTCcShfar1e40rmcJVwCBJC6mWlFuj0eCHIElmG1g5kyuJ/GD+8Wn4FFCcz4gJPfaQ==" - }, - "node_modules/cross-spawn": { - "version": "7.0.3", - "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz", - "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==", - "dependencies": { - "path-key": "^3.1.0", - "shebang-command": "^2.0.0", - "which": "^2.0.1" - }, - "engines": { - "node": ">= 8" - } - }, - "node_modules/dashdash": { - "version": "1.14.1", - "resolved": "https://registry.npmjs.org/dashdash/-/dashdash-1.14.1.tgz", - "integrity": "sha512-jRFi8UDGo6j+odZiEpjazZaWqEal3w/basFjQHQEwVtZJGDpxbH1MeYluwCS8Xq5wmLJooDlMgvVarmWfGM44g==", - "dependencies": { - "assert-plus": "^1.0.0" - }, - "engines": { - "node": ">=0.10" - } - }, - "node_modules/data-view-buffer": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.1.tgz", - "integrity": "sha512-0lht7OugA5x3iJLOWFhWK/5ehONdprk0ISXqVFn/NFrDu+cuc8iADFrGQz5BnRK7LLU3JmkbXSxaqX+/mXYtUA==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.6", - "es-errors": "^1.3.0", - "is-data-view": "^1.0.1" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/data-view-byte-length": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/data-view-byte-length/-/data-view-byte-length-1.0.1.tgz", - "integrity": "sha512-4J7wRJD3ABAzr8wP+OcIcqq2dlUKp4DVflx++hs5h5ZKydWMI6/D/fAot+yh6g2tHh8fLFTvNOaVN357NvSrOQ==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "es-errors": "^1.3.0", - "is-data-view": "^1.0.1" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/data-view-byte-offset": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/data-view-byte-offset/-/data-view-byte-offset-1.0.0.tgz", - "integrity": "sha512-t/Ygsytq+R995EJ5PZlD4Cu56sWa8InXySaViRzw9apusqsOO2bQP+SbYzAhR0pFKoB+43lYy8rWban9JSuXnA==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.6", - "es-errors": "^1.3.0", - "is-data-view": "^1.0.1" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/debug": { - "version": "4.3.7", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.7.tgz", - "integrity": "sha512-Er2nc/H7RrMXZBFCEim6TCmMk02Z8vLC2Rbi1KEBggpo0fS6l0S1nnapwmIi3yW/+GOJap1Krg4w0Hg80oCqgQ==", - "dev": true, - "dependencies": { - "ms": "^2.1.3" - }, - "engines": { - "node": ">=6.0" - }, - "peerDependenciesMeta": { - "supports-color": { - "optional": true - } - } - }, - "node_modules/deep-is": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/deep-is/-/deep-is-0.1.4.tgz", - "integrity": 
"sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ==", - "dev": true - }, - "node_modules/define-data-property": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz", - "integrity": "sha512-rBMvIzlpA8v6E+SJZoo++HAYqsLrkg7MSfIinMPFhmkorw7X+dOXVJQs+QT69zGkzMyfDnIMN2Wid1+NbL3T+A==", - "dev": true, - "dependencies": { - "es-define-property": "^1.0.0", - "es-errors": "^1.3.0", - "gopd": "^1.0.1" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/define-properties": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/define-properties/-/define-properties-1.2.1.tgz", - "integrity": "sha512-8QmQKqEASLd5nx0U1B1okLElbUuuttJ/AnYmRXbbbGDWh6uS208EjD4Xqq/I9wK7u0v6O08XhTWnt5XtEbR6Dg==", - "dev": true, - "dependencies": { - "define-data-property": "^1.0.1", - "has-property-descriptors": "^1.0.0", - "object-keys": "^1.1.1" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/delayed-stream": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", - "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", - "engines": { - "node": ">=0.4.0" - } - }, - "node_modules/doctrine": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/doctrine/-/doctrine-3.0.0.tgz", - "integrity": "sha512-yS+Q5i3hBf7GBkd4KG8a7eBNNWNGLTaEwwYWUijIYM7zrlYDM0BFXHjjPWlWZ1Rg7UaddZeIDmi9jF3HmqiQ2w==", - "dev": true, - "dependencies": { - "esutils": "^2.0.2" - }, - "engines": { - "node": ">=6.0.0" - } - }, - "node_modules/eastasianwidth": { - "version": "0.2.0", - "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz", - "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA==" - }, - "node_modules/ecc-jsbn": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/ecc-jsbn/-/ecc-jsbn-0.1.2.tgz", - "integrity": "sha512-eh9O+hwRHNbG4BLTjEl3nw044CkGm5X6LoaCf7LPp7UU8Qrt47JYNi6nPX8xjW97TKGKm1ouctg0QSpZe9qrnw==", - "dependencies": { - "jsbn": "~0.1.0", - "safer-buffer": "^2.1.0" - } - }, - "node_modules/emoji-regex": { - "version": "9.2.2", - "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-9.2.2.tgz", - "integrity": "sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==" - }, - "node_modules/error-ex": { - "version": "1.3.2", - "resolved": "https://registry.npmjs.org/error-ex/-/error-ex-1.3.2.tgz", - "integrity": "sha512-7dFHNmqeFSEt2ZBsCriorKnn3Z2pj+fd9kmI6QoWw4//DL+icEBfc0U7qJCisqrTsKTjw4fNFy2pW9OqStD84g==", - "dev": true, - "dependencies": { - "is-arrayish": "^0.2.1" - } - }, - "node_modules/es-abstract": { - "version": "1.23.3", - "resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.23.3.tgz", - "integrity": "sha512-e+HfNH61Bj1X9/jLc5v1owaLYuHdeHHSQlkhCBiTK8rBvKaULl/beGMxwrMXjpYrv4pz22BlY570vVePA2ho4A==", - "dev": true, - "dependencies": { - "array-buffer-byte-length": "^1.0.1", - "arraybuffer.prototype.slice": "^1.0.3", - "available-typed-arrays": "^1.0.7", - "call-bind": "^1.0.7", - "data-view-buffer": "^1.0.1", - "data-view-byte-length": "^1.0.1", - "data-view-byte-offset": "^1.0.0", - "es-define-property": "^1.0.0", - "es-errors": "^1.3.0", - "es-object-atoms": "^1.0.0", - 
"es-set-tostringtag": "^2.0.3", - "es-to-primitive": "^1.2.1", - "function.prototype.name": "^1.1.6", - "get-intrinsic": "^1.2.4", - "get-symbol-description": "^1.0.2", - "globalthis": "^1.0.3", - "gopd": "^1.0.1", - "has-property-descriptors": "^1.0.2", - "has-proto": "^1.0.3", - "has-symbols": "^1.0.3", - "hasown": "^2.0.2", - "internal-slot": "^1.0.7", - "is-array-buffer": "^3.0.4", - "is-callable": "^1.2.7", - "is-data-view": "^1.0.1", - "is-negative-zero": "^2.0.3", - "is-regex": "^1.1.4", - "is-shared-array-buffer": "^1.0.3", - "is-string": "^1.0.7", - "is-typed-array": "^1.1.13", - "is-weakref": "^1.0.2", - "object-inspect": "^1.13.1", - "object-keys": "^1.1.1", - "object.assign": "^4.1.5", - "regexp.prototype.flags": "^1.5.2", - "safe-array-concat": "^1.1.2", - "safe-regex-test": "^1.0.3", - "string.prototype.trim": "^1.2.9", - "string.prototype.trimend": "^1.0.8", - "string.prototype.trimstart": "^1.0.8", - "typed-array-buffer": "^1.0.2", - "typed-array-byte-length": "^1.0.1", - "typed-array-byte-offset": "^1.0.2", - "typed-array-length": "^1.0.6", - "unbox-primitive": "^1.0.2", - "which-typed-array": "^1.1.15" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/es-define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.0.tgz", - "integrity": "sha512-jxayLKShrEqqzJ0eumQbVhTYQM27CfT1T35+gCgDFoL82JLsXqTJ76zv6A0YLOgEnLUMvLzsDsGIrl8NFpT2gQ==", - "dev": true, - "dependencies": { - "get-intrinsic": "^1.2.4" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/es-errors": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", - "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", - "dev": true, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/es-iterator-helpers": { - "version": "1.0.19", - "resolved": "https://registry.npmjs.org/es-iterator-helpers/-/es-iterator-helpers-1.0.19.tgz", - "integrity": "sha512-zoMwbCcH5hwUkKJkT8kDIBZSz9I6mVG//+lDCinLCGov4+r7NIy0ld8o03M0cJxl2spVf6ESYVS6/gpIfq1FFw==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "define-properties": "^1.2.1", - "es-abstract": "^1.23.3", - "es-errors": "^1.3.0", - "es-set-tostringtag": "^2.0.3", - "function-bind": "^1.1.2", - "get-intrinsic": "^1.2.4", - "globalthis": "^1.0.3", - "has-property-descriptors": "^1.0.2", - "has-proto": "^1.0.3", - "has-symbols": "^1.0.3", - "internal-slot": "^1.0.7", - "iterator.prototype": "^1.1.2", - "safe-array-concat": "^1.1.2" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/es-object-atoms": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.0.0.tgz", - "integrity": "sha512-MZ4iQ6JwHOBQjahnjwaC1ZtIBH+2ohjamzAO3oaHcXYup7qxjF2fixyH+Q71voWHeOkI2q/TnJao/KfXYIZWbw==", - "dev": true, - "dependencies": { - "es-errors": "^1.3.0" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/es-set-tostringtag": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.0.3.tgz", - "integrity": "sha512-3T8uNMC3OQTHkFUsFq8r/BwAXLHvU/9O9mE0fBc/MY5iq/8H7ncvO947LmYA6ldWw9Uh8Yhf25zu6n7nML5QWQ==", - "dev": true, - "dependencies": { - "get-intrinsic": "^1.2.4", - "has-tostringtag": "^1.0.2", - "hasown": "^2.0.1" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/es-shim-unscopables": { - 
"version": "1.0.2", - "resolved": "https://registry.npmjs.org/es-shim-unscopables/-/es-shim-unscopables-1.0.2.tgz", - "integrity": "sha512-J3yBRXCzDu4ULnQwxyToo/OjdMx6akgVC7K6few0a7F/0wLtmKKN7I73AH5T2836UuXRqN7Qg+IIUw/+YJksRw==", - "dev": true, - "dependencies": { - "hasown": "^2.0.0" - } - }, - "node_modules/es-to-primitive": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/es-to-primitive/-/es-to-primitive-1.2.1.tgz", - "integrity": "sha512-QCOllgZJtaUo9miYBcLChTUaHNjJF3PYs1VidD7AwiEj1kYxKeQTctLAezAOH5ZKRH0g2IgPn6KwB4IT8iRpvA==", - "dev": true, - "dependencies": { - "is-callable": "^1.1.4", - "is-date-object": "^1.0.1", - "is-symbol": "^1.0.2" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/escape-string-regexp": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", - "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==", - "dev": true, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/eslint": { - "version": "8.57.1", - "resolved": "https://registry.npmjs.org/eslint/-/eslint-8.57.1.tgz", - "integrity": "sha512-ypowyDxpVSYpkXr9WPv2PAZCtNip1Mv5KTW0SCurXv/9iOpcrH9PaqUElksqEB6pChqHGDRCFTyrZlGhnLNGiA==", - "dev": true, - "dependencies": { - "@eslint-community/eslint-utils": "^4.2.0", - "@eslint-community/regexpp": "^4.6.1", - "@eslint/eslintrc": "^2.1.4", - "@eslint/js": "8.57.1", - "@humanwhocodes/config-array": "^0.13.0", - "@humanwhocodes/module-importer": "^1.0.1", - "@nodelib/fs.walk": "^1.2.8", - "@ungap/structured-clone": "^1.2.0", - "ajv": "^6.12.4", - "chalk": "^4.0.0", - "cross-spawn": "^7.0.2", - "debug": "^4.3.2", - "doctrine": "^3.0.0", - "escape-string-regexp": "^4.0.0", - "eslint-scope": "^7.2.2", - "eslint-visitor-keys": "^3.4.3", - "espree": "^9.6.1", - "esquery": "^1.4.2", - "esutils": "^2.0.2", - "fast-deep-equal": "^3.1.3", - "file-entry-cache": "^6.0.1", - "find-up": "^5.0.0", - "glob-parent": "^6.0.2", - "globals": "^13.19.0", - "graphemer": "^1.4.0", - "ignore": "^5.2.0", - "imurmurhash": "^0.1.4", - "is-glob": "^4.0.0", - "is-path-inside": "^3.0.3", - "js-yaml": "^4.1.0", - "json-stable-stringify-without-jsonify": "^1.0.1", - "levn": "^0.4.1", - "lodash.merge": "^4.6.2", - "minimatch": "^3.1.2", - "natural-compare": "^1.4.0", - "optionator": "^0.9.3", - "strip-ansi": "^6.0.1", - "text-table": "^0.2.0" - }, - "bin": { - "eslint": "bin/eslint.js" - }, - "engines": { - "node": "^12.22.0 || ^14.17.0 || >=16.0.0" - }, - "funding": { - "url": "https://opencollective.com/eslint" - } - }, - "node_modules/eslint-config-standard": { - "version": "17.1.0", - "resolved": "https://registry.npmjs.org/eslint-config-standard/-/eslint-config-standard-17.1.0.tgz", - "integrity": "sha512-IwHwmaBNtDK4zDHQukFDW5u/aTb8+meQWZvNFWkiGmbWjD6bqyuSSBxxXKkCftCUzc1zwCH2m/baCNDLGmuO5Q==", - "dev": true, - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "engines": { - "node": ">=12.0.0" - }, - "peerDependencies": { - "eslint": "^8.0.1", - "eslint-plugin-import": "^2.25.2", - "eslint-plugin-n": "^15.0.0 || ^16.0.0 ", - "eslint-plugin-promise": "^6.0.0" - } - }, - "node_modules/eslint-config-standard-jsx": { - 
"version": "11.0.0", - "resolved": "https://registry.npmjs.org/eslint-config-standard-jsx/-/eslint-config-standard-jsx-11.0.0.tgz", - "integrity": "sha512-+1EV/R0JxEK1L0NGolAr8Iktm3Rgotx3BKwgaX+eAuSX8D952LULKtjgZD3F+e6SvibONnhLwoTi9DPxN5LvvQ==", - "dev": true, - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "peerDependencies": { - "eslint": "^8.8.0", - "eslint-plugin-react": "^7.28.0" - } - }, - "node_modules/eslint-import-resolver-node": { - "version": "0.3.9", - "resolved": "https://registry.npmjs.org/eslint-import-resolver-node/-/eslint-import-resolver-node-0.3.9.tgz", - "integrity": "sha512-WFj2isz22JahUv+B788TlO3N6zL3nNJGU8CcZbPZvVEkBPaJdCV4vy5wyghty5ROFbCRnm132v8BScu5/1BQ8g==", - "dev": true, - "dependencies": { - "debug": "^3.2.7", - "is-core-module": "^2.13.0", - "resolve": "^1.22.4" - } - }, - "node_modules/eslint-import-resolver-node/node_modules/debug": { - "version": "3.2.7", - "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz", - "integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==", - "dev": true, - "dependencies": { - "ms": "^2.1.1" - } - }, - "node_modules/eslint-module-utils": { - "version": "2.12.0", - "resolved": "https://registry.npmjs.org/eslint-module-utils/-/eslint-module-utils-2.12.0.tgz", - "integrity": "sha512-wALZ0HFoytlyh/1+4wuZ9FJCD/leWHQzzrxJ8+rebyReSLk7LApMyd3WJaLVoN+D5+WIdJyDK1c6JnE65V4Zyg==", - "dev": true, - "dependencies": { - "debug": "^3.2.7" - }, - "engines": { - "node": ">=4" - }, - "peerDependenciesMeta": { - "eslint": { - "optional": true - } - } - }, - "node_modules/eslint-module-utils/node_modules/debug": { - "version": "3.2.7", - "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz", - "integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==", - "dev": true, - "dependencies": { - "ms": "^2.1.1" - } - }, - "node_modules/eslint-plugin-es": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/eslint-plugin-es/-/eslint-plugin-es-4.1.0.tgz", - "integrity": "sha512-GILhQTnjYE2WorX5Jyi5i4dz5ALWxBIdQECVQavL6s7cI76IZTDWleTHkxz/QT3kvcs2QlGHvKLYsSlPOlPXnQ==", - "dev": true, - "dependencies": { - "eslint-utils": "^2.0.0", - "regexpp": "^3.0.0" - }, - "engines": { - "node": ">=8.10.0" - }, - "funding": { - "url": "https://github.com/sponsors/mysticatea" - }, - "peerDependencies": { - "eslint": ">=4.19.1" - } - }, - "node_modules/eslint-plugin-es/node_modules/eslint-utils": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/eslint-utils/-/eslint-utils-2.1.0.tgz", - "integrity": "sha512-w94dQYoauyvlDc43XnGB8lU3Zt713vNChgt4EWwhXAP2XkBvndfxF0AgIqKOOasjPIPzj9JqgwkwbCYD0/V3Zg==", - "dev": true, - "dependencies": { - "eslint-visitor-keys": "^1.1.0" - }, - "engines": { - "node": ">=6" - }, - "funding": { - "url": "https://github.com/sponsors/mysticatea" - } - }, - "node_modules/eslint-plugin-es/node_modules/eslint-visitor-keys": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-1.3.0.tgz", - "integrity": "sha512-6J72N8UNa462wa/KFODt/PJ3IU60SDpC3QXC1Hjc1BXXpfL2C9R5+AU7jhe0F6GREqVMh4Juu+NY7xn+6dipUQ==", - "dev": true, - "engines": { - "node": ">=4" - } - }, - "node_modules/eslint-plugin-import": { - "version": "2.30.0", - "resolved": 
"https://registry.npmjs.org/eslint-plugin-import/-/eslint-plugin-import-2.30.0.tgz", - "integrity": "sha512-/mHNE9jINJfiD2EKkg1BKyPyUk4zdnT54YgbOgfjSakWT5oyX/qQLVNTkehyfpcMxZXMy1zyonZ2v7hZTX43Yw==", - "dev": true, - "dependencies": { - "@rtsao/scc": "^1.1.0", - "array-includes": "^3.1.8", - "array.prototype.findlastindex": "^1.2.5", - "array.prototype.flat": "^1.3.2", - "array.prototype.flatmap": "^1.3.2", - "debug": "^3.2.7", - "doctrine": "^2.1.0", - "eslint-import-resolver-node": "^0.3.9", - "eslint-module-utils": "^2.9.0", - "hasown": "^2.0.2", - "is-core-module": "^2.15.1", - "is-glob": "^4.0.3", - "minimatch": "^3.1.2", - "object.fromentries": "^2.0.8", - "object.groupby": "^1.0.3", - "object.values": "^1.2.0", - "semver": "^6.3.1", - "tsconfig-paths": "^3.15.0" - }, - "engines": { - "node": ">=4" - }, - "peerDependencies": { - "eslint": "^2 || ^3 || ^4 || ^5 || ^6 || ^7.2.0 || ^8" - } - }, - "node_modules/eslint-plugin-import/node_modules/brace-expansion": { - "version": "1.1.11", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", - "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", - "dev": true, - "dependencies": { - "balanced-match": "^1.0.0", - "concat-map": "0.0.1" - } - }, - "node_modules/eslint-plugin-import/node_modules/debug": { - "version": "3.2.7", - "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz", - "integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==", - "dev": true, - "dependencies": { - "ms": "^2.1.1" - } - }, - "node_modules/eslint-plugin-import/node_modules/doctrine": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/doctrine/-/doctrine-2.1.0.tgz", - "integrity": "sha512-35mSku4ZXK0vfCuHEDAwt55dg2jNajHZ1odvF+8SSr82EsZY4QmXfuWso8oEd8zRhVObSN18aM0CjSdoBX7zIw==", - "dev": true, - "dependencies": { - "esutils": "^2.0.2" - }, - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/eslint-plugin-import/node_modules/minimatch": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", - "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", - "dev": true, - "dependencies": { - "brace-expansion": "^1.1.7" - }, - "engines": { - "node": "*" - } - }, - "node_modules/eslint-plugin-n": { - "version": "15.7.0", - "resolved": "https://registry.npmjs.org/eslint-plugin-n/-/eslint-plugin-n-15.7.0.tgz", - "integrity": "sha512-jDex9s7D/Qial8AGVIHq4W7NswpUD5DPDL2RH8Lzd9EloWUuvUkHfv4FRLMipH5q2UtyurorBkPeNi1wVWNh3Q==", - "dev": true, - "dependencies": { - "builtins": "^5.0.1", - "eslint-plugin-es": "^4.1.0", - "eslint-utils": "^3.0.0", - "ignore": "^5.1.1", - "is-core-module": "^2.11.0", - "minimatch": "^3.1.2", - "resolve": "^1.22.1", - "semver": "^7.3.8" - }, - "engines": { - "node": ">=12.22.0" - }, - "funding": { - "url": "https://github.com/sponsors/mysticatea" - }, - "peerDependencies": { - "eslint": ">=7.0.0" - } - }, - "node_modules/eslint-plugin-n/node_modules/brace-expansion": { - "version": "1.1.11", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", - "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", - "dev": true, - "dependencies": { - "balanced-match": "^1.0.0", - "concat-map": "0.0.1" - } - }, - "node_modules/eslint-plugin-n/node_modules/minimatch": { - "version": "3.1.2", - "resolved": 
"https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", - "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", - "dev": true, - "dependencies": { - "brace-expansion": "^1.1.7" - }, - "engines": { - "node": "*" - } - }, - "node_modules/eslint-plugin-n/node_modules/semver": { - "version": "7.6.3", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz", - "integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==", - "dev": true, - "bin": { - "semver": "bin/semver.js" - }, - "engines": { - "node": ">=10" - } - }, - "node_modules/eslint-plugin-promise": { - "version": "6.6.0", - "resolved": "https://registry.npmjs.org/eslint-plugin-promise/-/eslint-plugin-promise-6.6.0.tgz", - "integrity": "sha512-57Zzfw8G6+Gq7axm2Pdo3gW/Rx3h9Yywgn61uE/3elTCOePEHVrn2i5CdfBwA1BLK0Q0WqctICIUSqXZW/VprQ==", - "dev": true, - "engines": { - "node": "^12.22.0 || ^14.17.0 || >=16.0.0" - }, - "funding": { - "url": "https://opencollective.com/eslint" - }, - "peerDependencies": { - "eslint": "^7.0.0 || ^8.0.0 || ^9.0.0" - } - }, - "node_modules/eslint-plugin-react": { - "version": "7.37.0", - "resolved": "https://registry.npmjs.org/eslint-plugin-react/-/eslint-plugin-react-7.37.0.tgz", - "integrity": "sha512-IHBePmfWH5lKhJnJ7WB1V+v/GolbB0rjS8XYVCSQCZKaQCAUhMoVoOEn1Ef8Z8Wf0a7l8KTJvuZg5/e4qrZ6nA==", - "dev": true, - "dependencies": { - "array-includes": "^3.1.8", - "array.prototype.findlast": "^1.2.5", - "array.prototype.flatmap": "^1.3.2", - "array.prototype.tosorted": "^1.1.4", - "doctrine": "^2.1.0", - "es-iterator-helpers": "^1.0.19", - "estraverse": "^5.3.0", - "hasown": "^2.0.2", - "jsx-ast-utils": "^2.4.1 || ^3.0.0", - "minimatch": "^3.1.2", - "object.entries": "^1.1.8", - "object.fromentries": "^2.0.8", - "object.values": "^1.2.0", - "prop-types": "^15.8.1", - "resolve": "^2.0.0-next.5", - "semver": "^6.3.1", - "string.prototype.matchall": "^4.0.11", - "string.prototype.repeat": "^1.0.0" - }, - "engines": { - "node": ">=4" - }, - "peerDependencies": { - "eslint": "^3 || ^4 || ^5 || ^6 || ^7 || ^8 || ^9.7" - } - }, - "node_modules/eslint-plugin-react/node_modules/brace-expansion": { - "version": "1.1.11", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", - "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", - "dev": true, - "dependencies": { - "balanced-match": "^1.0.0", - "concat-map": "0.0.1" - } - }, - "node_modules/eslint-plugin-react/node_modules/doctrine": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/doctrine/-/doctrine-2.1.0.tgz", - "integrity": "sha512-35mSku4ZXK0vfCuHEDAwt55dg2jNajHZ1odvF+8SSr82EsZY4QmXfuWso8oEd8zRhVObSN18aM0CjSdoBX7zIw==", - "dev": true, - "dependencies": { - "esutils": "^2.0.2" - }, - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/eslint-plugin-react/node_modules/minimatch": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", - "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", - "dev": true, - "dependencies": { - "brace-expansion": "^1.1.7" - }, - "engines": { - "node": "*" - } - }, - "node_modules/eslint-plugin-react/node_modules/resolve": { - "version": "2.0.0-next.5", - "resolved": "https://registry.npmjs.org/resolve/-/resolve-2.0.0-next.5.tgz", - "integrity": 
"sha512-U7WjGVG9sH8tvjW5SmGbQuui75FiyjAX72HX15DwBBwF9dNiQZRQAg9nnPhYy+TUnE0+VcrttuvNI8oSxZcocA==", - "dev": true, - "dependencies": { - "is-core-module": "^2.13.0", - "path-parse": "^1.0.7", - "supports-preserve-symlinks-flag": "^1.0.0" - }, - "bin": { - "resolve": "bin/resolve" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/eslint-scope": { - "version": "7.2.2", - "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-7.2.2.tgz", - "integrity": "sha512-dOt21O7lTMhDM+X9mB4GX+DZrZtCUJPL/wlcTqxyrx5IvO0IYtILdtrQGQp+8n5S0gwSVmOf9NQrjMOgfQZlIg==", - "dev": true, - "dependencies": { - "esrecurse": "^4.3.0", - "estraverse": "^5.2.0" - }, - "engines": { - "node": "^12.22.0 || ^14.17.0 || >=16.0.0" - }, - "funding": { - "url": "https://opencollective.com/eslint" - } - }, - "node_modules/eslint-utils": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/eslint-utils/-/eslint-utils-3.0.0.tgz", - "integrity": "sha512-uuQC43IGctw68pJA1RgbQS8/NP7rch6Cwd4j3ZBtgo4/8Flj4eGE7ZYSZRN3iq5pVUv6GPdW5Z1RFleo84uLDA==", - "dev": true, - "dependencies": { - "eslint-visitor-keys": "^2.0.0" - }, - "engines": { - "node": "^10.0.0 || ^12.0.0 || >= 14.0.0" - }, - "funding": { - "url": "https://github.com/sponsors/mysticatea" - }, - "peerDependencies": { - "eslint": ">=5" - } - }, - "node_modules/eslint-utils/node_modules/eslint-visitor-keys": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-2.1.0.tgz", - "integrity": "sha512-0rSmRBzXgDzIsD6mGdJgevzgezI534Cer5L/vyMX0kHzT/jiB43jRhd9YUlMGYLQy2zprNmoT8qasCGtY+QaKw==", - "dev": true, - "engines": { - "node": ">=10" - } - }, - "node_modules/eslint-visitor-keys": { - "version": "3.4.3", - "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-3.4.3.tgz", - "integrity": "sha512-wpc+LXeiyiisxPlEkUzU6svyS1frIO3Mgxj1fdy7Pm8Ygzguax2N3Fa/D/ag1WqbOprdI+uY6wMUl8/a2G+iag==", - "dev": true, - "engines": { - "node": "^12.22.0 || ^14.17.0 || >=16.0.0" - }, - "funding": { - "url": "https://opencollective.com/eslint" - } - }, - "node_modules/eslint/node_modules/ansi-regex": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", - "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", - "dev": true, - "engines": { - "node": ">=8" - } - }, - "node_modules/eslint/node_modules/brace-expansion": { - "version": "1.1.11", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", - "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", - "dev": true, - "dependencies": { - "balanced-match": "^1.0.0", - "concat-map": "0.0.1" - } - }, - "node_modules/eslint/node_modules/minimatch": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", - "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", - "dev": true, - "dependencies": { - "brace-expansion": "^1.1.7" - }, - "engines": { - "node": "*" - } - }, - "node_modules/eslint/node_modules/strip-ansi": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", - "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", - "dev": true, - "dependencies": { - "ansi-regex": "^5.0.1" - }, - "engines": { - "node": ">=8" - } - }, 
- "node_modules/espree": { - "version": "9.6.1", - "resolved": "https://registry.npmjs.org/espree/-/espree-9.6.1.tgz", - "integrity": "sha512-oruZaFkjorTpF32kDSI5/75ViwGeZginGGy2NoOSg3Q9bnwlnmDm4HLnkl0RE3n+njDXR037aY1+x58Z/zFdwQ==", - "dev": true, - "dependencies": { - "acorn": "^8.9.0", - "acorn-jsx": "^5.3.2", - "eslint-visitor-keys": "^3.4.1" - }, - "engines": { - "node": "^12.22.0 || ^14.17.0 || >=16.0.0" - }, - "funding": { - "url": "https://opencollective.com/eslint" - } - }, - "node_modules/esquery": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/esquery/-/esquery-1.6.0.tgz", - "integrity": "sha512-ca9pw9fomFcKPvFLXhBKUK90ZvGibiGOvRJNbjljY7s7uq/5YO4BOzcYtJqExdx99rF6aAcnRxHmcUHcz6sQsg==", - "dev": true, - "dependencies": { - "estraverse": "^5.1.0" - }, - "engines": { - "node": ">=0.10" - } - }, - "node_modules/esrecurse": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/esrecurse/-/esrecurse-4.3.0.tgz", - "integrity": "sha512-KmfKL3b6G+RXvP8N1vr3Tq1kL/oCFgn2NYXEtqP8/L3pKapUA4G8cFVaoF3SU323CD4XypR/ffioHmkti6/Tag==", - "dev": true, - "dependencies": { - "estraverse": "^5.2.0" - }, - "engines": { - "node": ">=4.0" - } - }, - "node_modules/estraverse": { - "version": "5.3.0", - "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz", - "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==", - "dev": true, - "engines": { - "node": ">=4.0" - } - }, - "node_modules/esutils": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz", - "integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==", - "dev": true, - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/extend": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", - "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==" - }, - "node_modules/extsprintf": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/extsprintf/-/extsprintf-1.3.0.tgz", - "integrity": "sha512-11Ndz7Nv+mvAC1j0ktTa7fAb0vLyGGX+rMHNBYQviQDGU0Hw7lhctJANqbPhu9nV9/izT/IntTgZ7Im/9LJs9g==", - "engines": [ - "node >=0.6.0" - ] - }, - "node_modules/fast-deep-equal": { - "version": "3.1.3", - "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz", - "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==" - }, - "node_modules/fast-json-stable-stringify": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz", - "integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==" - }, - "node_modules/fast-levenshtein": { - "version": "2.0.6", - "resolved": "https://registry.npmjs.org/fast-levenshtein/-/fast-levenshtein-2.0.6.tgz", - "integrity": "sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==", - "dev": true - }, - "node_modules/fastq": { - "version": "1.17.1", - "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.17.1.tgz", - "integrity": "sha512-sRVD3lWVIXWg6By68ZN7vho9a1pQcN/WBFaAAsDDFzlJjvoGx0P8z7V1t72grFJfJhu3YPZBuu25f7Kaw2jN1w==", - "dev": true, - "dependencies": { - "reusify": "^1.0.4" - } - }, - "node_modules/file-entry-cache": { - "version": "6.0.1", - "resolved": 
"https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-6.0.1.tgz", - "integrity": "sha512-7Gps/XWymbLk2QLYK4NzpMOrYjMhdIxXuIvy2QBsLE6ljuodKvdkWs/cpyJJ3CVIVpH0Oi1Hvg1ovbMzLdFBBg==", - "dev": true, - "dependencies": { - "flat-cache": "^3.0.4" - }, - "engines": { - "node": "^10.12.0 || >=12.0.0" - } - }, - "node_modules/find-up": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz", - "integrity": "sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng==", - "dev": true, - "dependencies": { - "locate-path": "^6.0.0", - "path-exists": "^4.0.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/flat-cache": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/flat-cache/-/flat-cache-3.2.0.tgz", - "integrity": "sha512-CYcENa+FtcUKLmhhqyctpclsq7QF38pKjZHsGNiSQF5r4FtoKDWabFDl3hzaEQMvT1LHEysw5twgLvpYYb4vbw==", - "dev": true, - "dependencies": { - "flatted": "^3.2.9", - "keyv": "^4.5.3", - "rimraf": "^3.0.2" - }, - "engines": { - "node": "^10.12.0 || >=12.0.0" - } - }, - "node_modules/flat-cache/node_modules/brace-expansion": { - "version": "1.1.11", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", - "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", - "dev": true, - "dependencies": { - "balanced-match": "^1.0.0", - "concat-map": "0.0.1" - } - }, - "node_modules/flat-cache/node_modules/glob": { - "version": "7.2.3", - "resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz", - "integrity": "sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==", - "deprecated": "Glob versions prior to v9 are no longer supported", - "dev": true, - "dependencies": { - "fs.realpath": "^1.0.0", - "inflight": "^1.0.4", - "inherits": "2", - "minimatch": "^3.1.1", - "once": "^1.3.0", - "path-is-absolute": "^1.0.0" - }, - "engines": { - "node": "*" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, - "node_modules/flat-cache/node_modules/minimatch": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", - "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", - "dev": true, - "dependencies": { - "brace-expansion": "^1.1.7" - }, - "engines": { - "node": "*" - } - }, - "node_modules/flat-cache/node_modules/rimraf": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz", - "integrity": "sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==", - "deprecated": "Rimraf versions prior to v4 are no longer supported", - "dev": true, - "dependencies": { - "glob": "^7.1.3" - }, - "bin": { - "rimraf": "bin.js" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, - "node_modules/flatted": { - "version": "3.3.1", - "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.3.1.tgz", - "integrity": "sha512-X8cqMLLie7KsNUDSdzeN8FYK9rEt4Dt67OsG/DNGnYTSDBG4uFAJFBnUeiV+zCVAvwFy56IjM9sH51jVaEhNxw==", - "dev": true - }, - "node_modules/for-each": { - "version": "0.3.3", - "resolved": "https://registry.npmjs.org/for-each/-/for-each-0.3.3.tgz", - "integrity": "sha512-jqYfLp7mo9vIyQf8ykW2v7A+2N4QjeCeI5+Dz9XraiO1ign81wjiH7Fb9vSOWvQfNtmSa4H2RoQTrrXivdUZmw==", - "dev": true, - "dependencies": { - 
"is-callable": "^1.1.3" - } - }, - "node_modules/foreground-child": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.3.0.tgz", - "integrity": "sha512-Ld2g8rrAyMYFXBhEqMz8ZAHBi4J4uS1i/CxGMDnjyFWddMXLVcDp051DZfu+t7+ab7Wv6SMqpWmyFIj5UbfFvg==", - "dependencies": { - "cross-spawn": "^7.0.0", - "signal-exit": "^4.0.1" - }, - "engines": { - "node": ">=14" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, - "node_modules/forever-agent": { - "version": "0.6.1", - "resolved": "https://registry.npmjs.org/forever-agent/-/forever-agent-0.6.1.tgz", - "integrity": "sha512-j0KLYPhm6zeac4lz3oJ3o65qvgQCcPubiyotZrXqEaG4hNagNYO8qdlUrX5vwqv9ohqeT/Z3j6+yW067yWWdUw==", - "engines": { - "node": "*" - } - }, - "node_modules/form-data": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/form-data/-/form-data-2.5.1.tgz", - "integrity": "sha512-m21N3WOmEEURgk6B9GLOE4RuWOFf28Lhh9qGYeNlGq4VDXUlJy2th2slBNU8Gp8EzloYZOibZJ7t5ecIrFSjVA==", - "dependencies": { - "asynckit": "^0.4.0", - "combined-stream": "^1.0.6", - "mime-types": "^2.1.12" - }, - "engines": { - "node": ">= 0.12" - } - }, - "node_modules/fs.realpath": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", - "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==", - "dev": true - }, - "node_modules/function-bind": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", - "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", - "dev": true, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/function.prototype.name": { - "version": "1.1.6", - "resolved": "https://registry.npmjs.org/function.prototype.name/-/function.prototype.name-1.1.6.tgz", - "integrity": "sha512-Z5kx79swU5P27WEayXM1tBi5Ze/lbIyiNgU3qyXUOf9b2rgXYyF9Dy9Cx+IQv/Lc8WCG6L82zwUPpSS9hGehIg==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.2", - "define-properties": "^1.2.0", - "es-abstract": "^1.22.1", - "functions-have-names": "^1.2.3" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/functions-have-names": { - "version": "1.2.3", - "resolved": "https://registry.npmjs.org/functions-have-names/-/functions-have-names-1.2.3.tgz", - "integrity": "sha512-xckBUXyTIqT97tq2x2AMb+g163b5JFysYk0x4qxNFwbfQkmNZoiRHb6sPzI9/QV33WeuvVYBUIiD4NzNIyqaRQ==", - "dev": true, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/get-intrinsic": { - "version": "1.2.4", - "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.2.4.tgz", - "integrity": "sha512-5uYhsJH8VJBTv7oslg4BznJYhDoRI6waYCxMmCdnTrcCrHA/fCFKoTFz2JKKE0HdDFUF7/oQuhzumXJK7paBRQ==", - "dev": true, - "dependencies": { - "es-errors": "^1.3.0", - "function-bind": "^1.1.2", - "has-proto": "^1.0.1", - "has-symbols": "^1.0.3", - "hasown": "^2.0.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/get-stdin": { - "version": "8.0.0", - "resolved": "https://registry.npmjs.org/get-stdin/-/get-stdin-8.0.0.tgz", - "integrity": "sha512-sY22aA6xchAzprjyqmSEQv4UbAAzRN0L2dQB0NlN5acTTK9Don6nhoc3eAbUnpZiCANAMfd/+40kVdKfFygohg==", - "dev": true, - "engines": { - "node": ">=10" - }, - "funding": { - "url": 
"https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/get-symbol-description": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/get-symbol-description/-/get-symbol-description-1.0.2.tgz", - "integrity": "sha512-g0QYk1dZBxGwk+Ngc+ltRH2IBp2f7zBkBMBJZCDerh6EhlhSR6+9irMCuT/09zD6qkarHUSn529sK/yL4S27mg==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.5", - "es-errors": "^1.3.0", - "get-intrinsic": "^1.2.4" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/getpass": { - "version": "0.1.7", - "resolved": "https://registry.npmjs.org/getpass/-/getpass-0.1.7.tgz", - "integrity": "sha512-0fzj9JxOLfJ+XGLhR8ze3unN0KZCgZwiSSDz168VERjK8Wl8kVSdcu2kspd4s4wtAa1y/qrVRiAA0WclVsu0ng==", - "dependencies": { - "assert-plus": "^1.0.0" - } - }, - "node_modules/glob": { - "version": "10.4.5", - "resolved": "https://registry.npmjs.org/glob/-/glob-10.4.5.tgz", - "integrity": "sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg==", - "dependencies": { - "foreground-child": "^3.1.0", - "jackspeak": "^3.1.2", - "minimatch": "^9.0.4", - "minipass": "^7.1.2", - "package-json-from-dist": "^1.0.0", - "path-scurry": "^1.11.1" - }, - "bin": { - "glob": "dist/esm/bin.mjs" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, - "node_modules/glob-parent": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-6.0.2.tgz", - "integrity": "sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A==", - "dev": true, - "dependencies": { - "is-glob": "^4.0.3" - }, - "engines": { - "node": ">=10.13.0" - } - }, - "node_modules/globals": { - "version": "13.24.0", - "resolved": "https://registry.npmjs.org/globals/-/globals-13.24.0.tgz", - "integrity": "sha512-AhO5QUcj8llrbG09iWhPU2B204J1xnPeL8kQmVorSsy+Sjj1sk8gIyh6cUocGmH4L0UuhAJy+hJMRA4mgA4mFQ==", - "dev": true, - "dependencies": { - "type-fest": "^0.20.2" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/globalthis": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/globalthis/-/globalthis-1.0.4.tgz", - "integrity": "sha512-DpLKbNU4WylpxJykQujfCcwYWiV/Jhm50Goo0wrVILAv5jOr9d+H+UR3PhSCD2rCCEIg0uc+G+muBTwD54JhDQ==", - "dev": true, - "dependencies": { - "define-properties": "^1.2.1", - "gopd": "^1.0.1" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/gopd": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.0.1.tgz", - "integrity": "sha512-d65bNlIadxvpb/A2abVdlqKqV563juRnZ1Wtk6s1sIR8uNsXR70xqIzVqxVf1eTqDunwT2MkczEeaezCKTZhwA==", - "dev": true, - "dependencies": { - "get-intrinsic": "^1.1.3" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/graceful-fs": { - "version": "4.2.11", - "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz", - "integrity": "sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==", - "dev": true - }, - "node_modules/graphemer": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/graphemer/-/graphemer-1.4.0.tgz", - "integrity": "sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag==", - "dev": true - }, - "node_modules/har-schema": { - "version": 
"2.0.0", - "resolved": "https://registry.npmjs.org/har-schema/-/har-schema-2.0.0.tgz", - "integrity": "sha512-Oqluz6zhGX8cyRaTQlFMPw80bSJVG2x/cFb8ZPhUILGgHka9SsokCCOQgpveePerqidZOrT14ipqfJb7ILcW5Q==", - "engines": { - "node": ">=4" - } - }, - "node_modules/har-validator": { - "version": "5.1.5", - "resolved": "https://registry.npmjs.org/har-validator/-/har-validator-5.1.5.tgz", - "integrity": "sha512-nmT2T0lljbxdQZfspsno9hgrG3Uir6Ks5afism62poxqBM6sDnMEuPmzTq8XN0OEwqKLLdh1jQI3qyE66Nzb3w==", - "deprecated": "this library is no longer supported", - "dependencies": { - "ajv": "^6.12.3", - "har-schema": "^2.0.0" - }, - "engines": { - "node": ">=6" - } - }, - "node_modules/has-bigints": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/has-bigints/-/has-bigints-1.0.2.tgz", - "integrity": "sha512-tSvCKtBr9lkF0Ex0aQiP9N+OpV4zi2r/Nee5VkRDbaqv35RLYMzbwQfFSZZH0kR+Rd6302UJZ2p/bJCEoR3VoQ==", - "dev": true, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/has-flag": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", - "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", - "dev": true, - "engines": { - "node": ">=8" - } - }, - "node_modules/has-property-descriptors": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.2.tgz", - "integrity": "sha512-55JNKuIW+vq4Ke1BjOTjM2YctQIvCT7GFzHwmfZPGo5wnrgkid0YQtnAleFSqumZm4az3n2BS+erby5ipJdgrg==", - "dev": true, - "dependencies": { - "es-define-property": "^1.0.0" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/has-proto": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/has-proto/-/has-proto-1.0.3.tgz", - "integrity": "sha512-SJ1amZAJUiZS+PhsVLf5tGydlaVB8EdFpaSO4gmiUKUOxk8qzn5AIy4ZeJUmh22znIdk/uMAUT2pl3FxzVUH+Q==", - "dev": true, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/has-symbols": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.0.3.tgz", - "integrity": "sha512-l3LCuF6MgDNwTDKkdYGEihYjt5pRPbEg46rtlmnSPlUbgmB8LOIrKJbYYFBSbnPaJexMKtiPO8hmeRjRz2Td+A==", - "dev": true, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/has-tostringtag": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", - "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", - "dev": true, - "dependencies": { - "has-symbols": "^1.0.3" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/hasown": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", - "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", - "dev": true, - "dependencies": { - "function-bind": "^1.1.2" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/http-signature": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.2.0.tgz", - "integrity": "sha512-CAbnr6Rz4CYQkLYUtSNXxQPUH2gK8f3iWexVlsnMeD+GjlsQ0Xsy1cOX+mN3dtxYomRy21CiOzU8Uhw6OwncEQ==", - "dependencies": { - "assert-plus": "^1.0.0", - "jsprim": 
"^1.2.2", - "sshpk": "^1.7.0" - }, - "engines": { - "node": ">=0.8", - "npm": ">=1.3.7" - } - }, - "node_modules/ignore": { - "version": "5.3.2", - "resolved": "https://registry.npmjs.org/ignore/-/ignore-5.3.2.tgz", - "integrity": "sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==", - "dev": true, - "engines": { - "node": ">= 4" - } - }, - "node_modules/import-fresh": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.0.tgz", - "integrity": "sha512-veYYhQa+D1QBKznvhUHxb8faxlrwUnxseDAbAp457E0wLNio2bOSKnjYDhMj+YiAq61xrMGhQk9iXVk5FzgQMw==", - "dev": true, - "dependencies": { - "parent-module": "^1.0.0", - "resolve-from": "^4.0.0" - }, - "engines": { - "node": ">=6" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/imurmurhash": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/imurmurhash/-/imurmurhash-0.1.4.tgz", - "integrity": "sha512-JmXMZ6wuvDmLiHEml9ykzqO6lwFbof0GG4IkcGaENdCRDDmMVnny7s5HsIgHCbaq0w2MyPhDqkhTUgS2LU2PHA==", - "dev": true, - "engines": { - "node": ">=0.8.19" - } - }, - "node_modules/inflight": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", - "integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==", - "deprecated": "This module is not supported, and leaks memory. Do not use it. Check out lru-cache if you want a good and tested way to coalesce async requests by a key value, which is much more comprehensive and powerful.", - "dev": true, - "dependencies": { - "once": "^1.3.0", - "wrappy": "1" - } - }, - "node_modules/inherits": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", - "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", - "dev": true - }, - "node_modules/internal-slot": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/internal-slot/-/internal-slot-1.0.7.tgz", - "integrity": "sha512-NGnrKwXzSms2qUUih/ILZ5JBqNTSa1+ZmP6flaIp6KmSElgE9qdndzS3cqjrDovwFdmwsGsLdeFgB6suw+1e9g==", - "dev": true, - "dependencies": { - "es-errors": "^1.3.0", - "hasown": "^2.0.0", - "side-channel": "^1.0.4" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/is-array-buffer": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/is-array-buffer/-/is-array-buffer-3.0.4.tgz", - "integrity": "sha512-wcjaerHw0ydZwfhiKbXJWLDY8A7yV7KhjQOpb83hGgGfId/aQa4TOvwyzn2PuswW2gPCYEL/nEAiSVpdOj1lXw==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.2", - "get-intrinsic": "^1.2.1" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-arrayish": { - "version": "0.2.1", - "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.2.1.tgz", - "integrity": "sha512-zz06S8t0ozoDXMG+ube26zeCTNXcKIPJZJi8hBrF4idCLms4CG9QtK7qBl1boi5ODzFpjswb5JPmHCbMpjaYzg==", - "dev": true - }, - "node_modules/is-async-function": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/is-async-function/-/is-async-function-2.0.0.tgz", - "integrity": "sha512-Y1JXKrfykRJGdlDwdKlLpLyMIiWqWvuSd17TvZk68PLAOGOoF4Xyav1z0Xhoi+gCYjZVeC5SI+hYFOfvXmGRCA==", - "dev": true, - "dependencies": { - "has-tostringtag": "^1.0.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - 
"node_modules/is-bigint": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/is-bigint/-/is-bigint-1.0.4.tgz", - "integrity": "sha512-zB9CruMamjym81i2JZ3UMn54PKGsQzsJeo6xvN3HJJ4CAsQNB6iRutp2To77OfCNuoxspsIhzaPoO1zyCEhFOg==", - "dev": true, - "dependencies": { - "has-bigints": "^1.0.1" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-boolean-object": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/is-boolean-object/-/is-boolean-object-1.1.2.tgz", - "integrity": "sha512-gDYaKHJmnj4aWxyj6YHyXVpdQawtVLHU5cb+eztPGczf6cjuTdwve5ZIEfgXqH4e57An1D1AKf8CZ3kYrQRqYA==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.2", - "has-tostringtag": "^1.0.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-callable": { - "version": "1.2.7", - "resolved": "https://registry.npmjs.org/is-callable/-/is-callable-1.2.7.tgz", - "integrity": "sha512-1BC0BVFhS/p0qtw6enp8e+8OD0UrK0oFLztSjNzhcKA3WDuJxxAPXzPuPtKkjEY9UUoEWlX/8fgKeu2S8i9JTA==", - "dev": true, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-core-module": { - "version": "2.15.1", - "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.15.1.tgz", - "integrity": "sha512-z0vtXSwucUJtANQWldhbtbt7BnL0vxiFjIdDLAatwhDYty2bad6s+rijD6Ri4YuYJubLzIJLUidCh09e1djEVQ==", - "dev": true, - "dependencies": { - "hasown": "^2.0.2" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-data-view": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/is-data-view/-/is-data-view-1.0.1.tgz", - "integrity": "sha512-AHkaJrsUVW6wq6JS8y3JnM/GJF/9cf+k20+iDzlSaJrinEo5+7vRiteOSwBhHRiAyQATN1AmY4hwzxJKPmYf+w==", - "dev": true, - "dependencies": { - "is-typed-array": "^1.1.13" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-date-object": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/is-date-object/-/is-date-object-1.0.5.tgz", - "integrity": "sha512-9YQaSxsAiSwcvS33MBk3wTCVnWK+HhF8VZR2jRxehM16QcVOdHqPn4VPHmRK4lSr38n9JriurInLcP90xsYNfQ==", - "dev": true, - "dependencies": { - "has-tostringtag": "^1.0.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==", - "dev": true, - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/is-finalizationregistry": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-finalizationregistry/-/is-finalizationregistry-1.0.2.tgz", - "integrity": "sha512-0by5vtUJs8iFQb5TYUHHPudOR+qXYIMKtiUzvLIZITZUjknFmziyBJuLhVRc+Ds0dREFlskDNJKYIdIzu/9pfw==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.2" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-fullwidth-code-point": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", - "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", - "engines": { - "node": ">=8" - } 
- }, - "node_modules/is-generator-function": { - "version": "1.0.10", - "resolved": "https://registry.npmjs.org/is-generator-function/-/is-generator-function-1.0.10.tgz", - "integrity": "sha512-jsEjy9l3yiXEQ+PsXdmBwEPcOxaXWLspKdplFUVI9vq1iZgIekeC0L167qeu86czQaxed3q/Uzuw0swL0irL8A==", - "dev": true, - "dependencies": { - "has-tostringtag": "^1.0.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-glob": { - "version": "4.0.3", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", - "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==", - "dev": true, - "dependencies": { - "is-extglob": "^2.1.1" - }, - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/is-map": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/is-map/-/is-map-2.0.3.tgz", - "integrity": "sha512-1Qed0/Hr2m+YqxnM09CjA2d/i6YZNfF6R2oRAOj36eUdS6qIV/huPJNSEpKbupewFs+ZsJlxsjjPbc0/afW6Lw==", - "dev": true, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-negative-zero": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/is-negative-zero/-/is-negative-zero-2.0.3.tgz", - "integrity": "sha512-5KoIu2Ngpyek75jXodFvnafB6DJgr3u8uuK0LEZJjrU19DrMD3EVERaR8sjz8CCGgpZvxPl9SuE1GMVPFHx1mw==", - "dev": true, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-number-object": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/is-number-object/-/is-number-object-1.0.7.tgz", - "integrity": "sha512-k1U0IRzLMo7ZlYIfzRu23Oh6MiIFasgpb9X76eqfFZAqwH44UI4KTBvBYIZ1dSL9ZzChTB9ShHfLkR4pdW5krQ==", - "dev": true, - "dependencies": { - "has-tostringtag": "^1.0.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-path-inside": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/is-path-inside/-/is-path-inside-3.0.3.tgz", - "integrity": "sha512-Fd4gABb+ycGAmKou8eMftCupSir5lRxqf4aD/vd0cD2qc4HL07OjCeuHMr8Ro4CoMaeCKDB0/ECBOVWjTwUvPQ==", - "dev": true, - "engines": { - "node": ">=8" - } - }, - "node_modules/is-regex": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz", - "integrity": "sha512-kvRdxDsxZjhzUX07ZnLydzS1TU/TJlTUHHY4YLL87e37oUA49DfkLqgy+VjFocowy29cKvcSiu+kIv728jTTVg==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.2", - "has-tostringtag": "^1.0.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-set": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/is-set/-/is-set-2.0.3.tgz", - "integrity": "sha512-iPAjerrse27/ygGLxw+EBR9agv9Y6uLeYVJMu+QNCoouJ1/1ri0mGrcWpfCqFZuzzx3WjtwxG098X+n4OuRkPg==", - "dev": true, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-shared-array-buffer": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/is-shared-array-buffer/-/is-shared-array-buffer-1.0.3.tgz", - "integrity": "sha512-nA2hv5XIhLR3uVzDDfCIknerhx8XUKnstuOERPNNIinXG7v9u+ohXF67vxm4TPTEPU6lm61ZkwP3c9PCB97rhg==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - 
} - }, - "node_modules/is-string": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/is-string/-/is-string-1.0.7.tgz", - "integrity": "sha512-tE2UXzivje6ofPW7l23cjDOMa09gb7xlAqG6jG5ej6uPV32TlWP3NKPigtaGeHNu9fohccRYvIiZMfOOnOYUtg==", - "dev": true, - "dependencies": { - "has-tostringtag": "^1.0.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-symbol": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/is-symbol/-/is-symbol-1.0.4.tgz", - "integrity": "sha512-C/CPBqKWnvdcxqIARxyOh4v1UUEOCHpgDa0WYgpKDFMszcrPcffg5uhwSgPCLD2WWxmq6isisz87tzT01tuGhg==", - "dev": true, - "dependencies": { - "has-symbols": "^1.0.2" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-typed-array": { - "version": "1.1.13", - "resolved": "https://registry.npmjs.org/is-typed-array/-/is-typed-array-1.1.13.tgz", - "integrity": "sha512-uZ25/bUAlUY5fR4OKT4rZQEBrzQWYV9ZJYGGsUmEJ6thodVJ1HX64ePQ6Z0qPWP+m+Uq6e9UugrE38jeYsDSMw==", - "dev": true, - "dependencies": { - "which-typed-array": "^1.1.14" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-typedarray": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-typedarray/-/is-typedarray-1.0.0.tgz", - "integrity": "sha512-cyA56iCMHAh5CdzjJIa4aohJyeO1YbwLi3Jc35MmRU6poroFjIGZzUzupGiRPOjgHg9TLu43xbpwXk523fMxKA==" - }, - "node_modules/is-weakmap": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/is-weakmap/-/is-weakmap-2.0.2.tgz", - "integrity": "sha512-K5pXYOm9wqY1RgjpL3YTkF39tni1XajUIkawTLUo9EZEVUFga5gSQJF8nNS7ZwJQ02y+1YCNYcMh+HIf1ZqE+w==", - "dev": true, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-weakref": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-weakref/-/is-weakref-1.0.2.tgz", - "integrity": "sha512-qctsuLZmIQ0+vSSMfoVvyFe2+GSEvnmZ2ezTup1SBse9+twCCeial6EEi3Nc2KFcf6+qz2FBPnjXsk8xhKSaPQ==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.2" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-weakset": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/is-weakset/-/is-weakset-2.0.3.tgz", - "integrity": "sha512-LvIm3/KWzS9oRFHugab7d+M/GcBXuXX5xZkzPmN+NxihdQlZUQ4dWuSV1xR/sq6upL1TJEDrfBgRepHFdBtSNQ==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "get-intrinsic": "^1.2.4" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/isarray": { - "version": "2.0.5", - "resolved": "https://registry.npmjs.org/isarray/-/isarray-2.0.5.tgz", - "integrity": "sha512-xHjhDr3cNBK0BzdUJSPXZntQUx/mwMS5Rw4A7lPJ90XGAO6ISP/ePDNuo0vhqOZU+UD5JoodwCAAoZQd3FeAKw==", - "dev": true - }, - "node_modules/isexe": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", - "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==" - }, - "node_modules/isomorphic-ws": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/isomorphic-ws/-/isomorphic-ws-5.0.0.tgz", - "integrity": "sha512-muId7Zzn9ywDsyXgTIafTry2sV3nySZeUDe6YedVd1Hvuuep5AsIlqK+XefWpYTyJG5e503F2xIuT2lcU6rCSw==", - "peerDependencies": { - "ws": "*" - } - }, - "node_modules/isstream": 
{ - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/isstream/-/isstream-0.1.2.tgz", - "integrity": "sha512-Yljz7ffyPbrLpLngrMtZ7NduUgVvi6wG9RJ9IUcyCd59YQ911PBJphODUcbOVbqYfxe1wuYf/LJ8PauMRwsM/g==" - }, - "node_modules/iterator.prototype": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/iterator.prototype/-/iterator.prototype-1.1.2.tgz", - "integrity": "sha512-DR33HMMr8EzwuRL8Y9D3u2BMj8+RqSE850jfGu59kS7tbmPLzGkZmVSfyCFSDxuZiEY6Rzt3T2NA/qU+NwVj1w==", - "dev": true, - "dependencies": { - "define-properties": "^1.2.1", - "get-intrinsic": "^1.2.1", - "has-symbols": "^1.0.3", - "reflect.getprototypeof": "^1.0.4", - "set-function-name": "^2.0.1" - } - }, - "node_modules/jackspeak": { - "version": "3.4.3", - "resolved": "https://registry.npmjs.org/jackspeak/-/jackspeak-3.4.3.tgz", - "integrity": "sha512-OGlZQpz2yfahA/Rd1Y8Cd9SIEsqvXkLVoSw/cgwhnhFMDbsQFeZYoJJ7bIZBS9BcamUW96asq/npPWugM+RQBw==", - "dependencies": { - "@isaacs/cliui": "^8.0.2" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - }, - "optionalDependencies": { - "@pkgjs/parseargs": "^0.11.0" - } - }, - "node_modules/jose": { - "version": "4.15.9", - "resolved": "https://registry.npmjs.org/jose/-/jose-4.15.9.tgz", - "integrity": "sha512-1vUQX+IdDMVPj4k8kOxgUqlcK518yluMuGZwqlr44FS1ppZB/5GWh4rZG89erpOBOJjU/OBsnCVFfapsRz6nEA==", - "optional": true, - "funding": { - "url": "https://github.com/sponsors/panva" - } - }, - "node_modules/js-tokens": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", - "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==", - "dev": true - }, - "node_modules/js-yaml": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz", - "integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==", - "dependencies": { - "argparse": "^2.0.1" - }, - "bin": { - "js-yaml": "bin/js-yaml.js" - } - }, - "node_modules/jsbn": { - "version": "0.1.1", - "resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz", - "integrity": "sha512-UVU9dibq2JcFWxQPA6KCqj5O42VOmAY3zQUfEKxU0KpTGXwNoCjkX1e13eHNvw/xPynt6pU0rZ1htjWTNTSXsg==" - }, - "node_modules/json-buffer": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.1.tgz", - "integrity": "sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ==", - "dev": true - }, - "node_modules/json-parse-better-errors": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/json-parse-better-errors/-/json-parse-better-errors-1.0.2.tgz", - "integrity": "sha512-mrqyZKfX5EhL7hvqcV6WG1yYjnjeuYDzDhhcAAUrq8Po85NBQBJP+ZDUT75qZQ98IkUoBqdkExkukOU7Ts2wrw==", - "dev": true - }, - "node_modules/json-schema": { - "version": "0.4.0", - "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.4.0.tgz", - "integrity": "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==" - }, - "node_modules/json-schema-traverse": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", - "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==" - }, - "node_modules/json-stable-stringify-without-jsonify": { - "version": "1.0.1", - "resolved": 
"https://registry.npmjs.org/json-stable-stringify-without-jsonify/-/json-stable-stringify-without-jsonify-1.0.1.tgz", - "integrity": "sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==", - "dev": true - }, - "node_modules/json-stringify-safe": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz", - "integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==" - }, - "node_modules/json5": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/json5/-/json5-1.0.2.tgz", - "integrity": "sha512-g1MWMLBiz8FKi1e4w0UyVL3w+iJceWAFBAaBnnGKOpNa5f8TLktkbre1+s6oICydWAm+HRUGTmI+//xv2hvXYA==", - "dev": true, - "dependencies": { - "minimist": "^1.2.0" - }, - "bin": { - "json5": "lib/cli.js" - } - }, - "node_modules/jsonpath-plus": { - "version": "8.1.0", - "resolved": "https://registry.npmjs.org/jsonpath-plus/-/jsonpath-plus-8.1.0.tgz", - "integrity": "sha512-qVTiuKztFGw0dGhYi3WNqvddx3/SHtyDT0xJaeyz4uP0d1tkpG+0y5uYQ4OcIo1TLAz3PE/qDOW9F0uDt3+CTw==", - "bin": { - "jsonpath": "bin/jsonpath-cli.js", - "jsonpath-plus": "bin/jsonpath-cli.js" - }, - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/jsprim": { - "version": "1.4.2", - "resolved": "https://registry.npmjs.org/jsprim/-/jsprim-1.4.2.tgz", - "integrity": "sha512-P2bSOMAc/ciLz6DzgjVlGJP9+BrJWu5UDGK70C2iweC5QBIeFf0ZXRvGjEj2uYgrY2MkAAhsSWHDWlFtEroZWw==", - "dependencies": { - "assert-plus": "1.0.0", - "extsprintf": "1.3.0", - "json-schema": "0.4.0", - "verror": "1.10.0" - }, - "engines": { - "node": ">=0.6.0" - } - }, - "node_modules/jsx-ast-utils": { - "version": "3.3.5", - "resolved": "https://registry.npmjs.org/jsx-ast-utils/-/jsx-ast-utils-3.3.5.tgz", - "integrity": "sha512-ZZow9HBI5O6EPgSJLUb8n2NKgmVWTwCvHGwFuJlMjvLFqlGG6pjirPhtdsseaLZjSibD8eegzmYpUZwoIlj2cQ==", - "dev": true, - "dependencies": { - "array-includes": "^3.1.6", - "array.prototype.flat": "^1.3.1", - "object.assign": "^4.1.4", - "object.values": "^1.1.6" - }, - "engines": { - "node": ">=4.0" - } - }, - "node_modules/keyv": { - "version": "4.5.4", - "resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz", - "integrity": "sha512-oxVHkHR/EJf2CNXnWxRLW6mg7JyCCUcG0DtEGmL2ctUo1PNTin1PUil+r/+4r5MpVgC/fn1kjsx7mjSujKqIpw==", - "dev": true, - "dependencies": { - "json-buffer": "3.0.1" - } - }, - "node_modules/kubernetes-resource-parser": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/kubernetes-resource-parser/-/kubernetes-resource-parser-0.1.0.tgz", - "integrity": "sha512-rr2K/4akDkY3oKgJ/KL3KAKw8Fb0VwBucGgKhvgqXluVhfn2BgEuJUXIDU+zt4eWaqOOjAC6ApUgnHF/SJ/iNw==", - "license": "MIT" - }, - "node_modules/levn": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/levn/-/levn-0.4.1.tgz", - "integrity": "sha512-+bT2uH4E5LGE7h/n3evcS/sQlJXCpIp6ym8OWJ5eV6+67Dsql/LaaT7qJBAt2rzfoa/5QBGBhxDix1dMt2kQKQ==", - "dev": true, - "dependencies": { - "prelude-ls": "^1.2.1", - "type-check": "~0.4.0" - }, - "engines": { - "node": ">= 0.8.0" - } - }, - "node_modules/load-json-file": { - "version": "5.3.0", - "resolved": "https://registry.npmjs.org/load-json-file/-/load-json-file-5.3.0.tgz", - "integrity": "sha512-cJGP40Jc/VXUsp8/OrnyKyTZ1y6v/dphm3bioS+RrKXjK2BB6wHUd6JptZEFDGgGahMT+InnZO5i1Ei9mpC8Bw==", - "dev": true, - "dependencies": { - "graceful-fs": "^4.1.15", - "parse-json": "^4.0.0", - "pify": "^4.0.1", - "strip-bom": "^3.0.0", - "type-fest": "^0.3.0" - }, - "engines": { - "node": ">=6" - } - }, - 
"node_modules/load-json-file/node_modules/type-fest": { - "version": "0.3.1", - "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.3.1.tgz", - "integrity": "sha512-cUGJnCdr4STbePCgqNFbpVNCepa+kAVohJs1sLhxzdH+gnEoOd8VhbYa7pD3zZYGiURWM2xzEII3fQcRizDkYQ==", - "dev": true, - "engines": { - "node": ">=6" - } - }, - "node_modules/locate-path": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz", - "integrity": "sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw==", - "dev": true, - "dependencies": { - "p-locate": "^5.0.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/lodash.merge": { - "version": "4.6.2", - "resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz", - "integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==", - "dev": true - }, - "node_modules/loose-envify": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz", - "integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==", - "dev": true, - "dependencies": { - "js-tokens": "^3.0.0 || ^4.0.0" - }, - "bin": { - "loose-envify": "cli.js" - } - }, - "node_modules/lru-cache": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz", - "integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==", - "optional": true, - "dependencies": { - "yallist": "^4.0.0" - }, - "engines": { - "node": ">=10" - } - }, - "node_modules/mime-db": { - "version": "1.52.0", - "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", - "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/mime-types": { - "version": "2.1.35", - "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", - "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", - "dependencies": { - "mime-db": "1.52.0" - }, - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/minimatch": { - "version": "9.0.5", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz", - "integrity": "sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==", - "dependencies": { - "brace-expansion": "^2.0.1" - }, - "engines": { - "node": ">=16 || 14 >=14.17" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, - "node_modules/minimist": { - "version": "1.2.8", - "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", - "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==", - "dev": true, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/minipass": { - "version": "7.1.2", - "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.2.tgz", - "integrity": "sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==", - "engines": { - "node": ">=16 || 14 >=14.17" - } - }, - "node_modules/minizlib": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/minizlib/-/minizlib-3.0.1.tgz", - "integrity": 
"sha512-umcy022ILvb5/3Djuu8LWeqUa8D68JaBzlttKeMWen48SjabqS3iY5w/vzeMzMUNhLDifyhbOwKDSznB1vvrwg==", - "dependencies": { - "minipass": "^7.0.4", - "rimraf": "^5.0.5" - }, - "engines": { - "node": ">= 18" - } - }, - "node_modules/mkdirp": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-3.0.1.tgz", - "integrity": "sha512-+NsyUUAZDmo6YVHzL/stxSu3t9YS1iljliy3BSDrXJ/dkn1KYdmtZODGGjLcc9XLgVVpH4KshHB8XmZgMhaBXg==", - "bin": { - "mkdirp": "dist/cjs/src/bin.js" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, - "node_modules/ms": { - "version": "2.1.3", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", - "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", - "dev": true - }, - "node_modules/natural-compare": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz", - "integrity": "sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==", - "dev": true - }, - "node_modules/oauth-sign": { - "version": "0.9.0", - "resolved": "https://registry.npmjs.org/oauth-sign/-/oauth-sign-0.9.0.tgz", - "integrity": "sha512-fexhUFFPTGV8ybAtSIGbV6gOkSv8UtRbDBnAyLQw4QPKkgNlsH2ByPGtMUqdWkos6YCRmAqViwgZrJc/mRDzZQ==", - "engines": { - "node": "*" - } - }, - "node_modules/object-assign": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", - "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==", - "dev": true, - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/object-hash": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/object-hash/-/object-hash-2.2.0.tgz", - "integrity": "sha512-gScRMn0bS5fH+IuwyIFgnh9zBdo4DV+6GhygmWM9HyNJSgS0hScp1f5vjtm7oIIOiT9trXrShAkLFSc2IqKNgw==", - "optional": true, - "engines": { - "node": ">= 6" - } - }, - "node_modules/object-inspect": { - "version": "1.13.2", - "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.2.tgz", - "integrity": "sha512-IRZSRuzJiynemAXPYtPe5BoI/RESNYR7TYm50MC5Mqbd3Jmw5y790sErYw3V6SryFJD64b74qQQs9wn5Bg/k3g==", - "dev": true, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/object-keys": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/object-keys/-/object-keys-1.1.1.tgz", - "integrity": "sha512-NuAESUOUMrlIXOfHKzD6bpPu3tYt3xvjNdRIQ+FeT0lNb4K8WR70CaDxhuNguS2XG+GjkyMwOzsN5ZktImfhLA==", - "dev": true, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/object.assign": { - "version": "4.1.5", - "resolved": "https://registry.npmjs.org/object.assign/-/object.assign-4.1.5.tgz", - "integrity": "sha512-byy+U7gp+FVwmyzKPYhW2h5l3crpmGsxl7X2s8y43IgxvG4g3QZ6CffDtsNQy1WsmZpQbO+ybo0AlW7TY6DcBQ==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.5", - "define-properties": "^1.2.1", - "has-symbols": "^1.0.3", - "object-keys": "^1.1.1" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/object.entries": { - "version": "1.1.8", - "resolved": "https://registry.npmjs.org/object.entries/-/object.entries-1.1.8.tgz", - "integrity": "sha512-cmopxi8VwRIAw/fkijJohSfpef5PdN0pMQJN6VC/ZKvn0LIknWD8KtgY6KlQdEc4tIjcQ3HxSMmnvtzIscdaYQ==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - 
"define-properties": "^1.2.1", - "es-object-atoms": "^1.0.0" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/object.fromentries": { - "version": "2.0.8", - "resolved": "https://registry.npmjs.org/object.fromentries/-/object.fromentries-2.0.8.tgz", - "integrity": "sha512-k6E21FzySsSK5a21KRADBd/NGneRegFO5pLHfdQLpRDETUNJueLXs3WCzyQ3tFRDYgbq3KHGXfTbi2bs8WQ6rQ==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "define-properties": "^1.2.1", - "es-abstract": "^1.23.2", - "es-object-atoms": "^1.0.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/object.groupby": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/object.groupby/-/object.groupby-1.0.3.tgz", - "integrity": "sha512-+Lhy3TQTuzXI5hevh8sBGqbmurHbbIjAi0Z4S63nthVLmLxfbj4T54a4CfZrXIrt9iP4mVAPYMo/v99taj3wjQ==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "define-properties": "^1.2.1", - "es-abstract": "^1.23.2" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/object.values": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/object.values/-/object.values-1.2.0.tgz", - "integrity": "sha512-yBYjY9QX2hnRmZHAjG/f13MzmBzxzYgQhFrke06TTyKY5zSTEqkOeukBzIdVA3j3ulu8Qa3MbVFShV7T2RmGtQ==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "define-properties": "^1.2.1", - "es-object-atoms": "^1.0.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/oidc-token-hash": { - "version": "5.0.3", - "resolved": "https://registry.npmjs.org/oidc-token-hash/-/oidc-token-hash-5.0.3.tgz", - "integrity": "sha512-IF4PcGgzAr6XXSff26Sk/+P4KZFJVuHAJZj3wgO3vX2bMdNVp/QXTP3P7CEm9V1IdG8lDLY3HhiqpsE/nOwpPw==", - "optional": true, - "engines": { - "node": "^10.13.0 || >=12.0.0" - } - }, - "node_modules/once": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", - "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", - "dev": true, - "dependencies": { - "wrappy": "1" - } - }, - "node_modules/openid-client": { - "version": "5.6.5", - "resolved": "https://registry.npmjs.org/openid-client/-/openid-client-5.6.5.tgz", - "integrity": "sha512-5P4qO9nGJzB5PI0LFlhj4Dzg3m4odt0qsJTfyEtZyOlkgpILwEioOhVVJOrS1iVH494S4Ee5OCjjg6Bf5WOj3w==", - "optional": true, - "dependencies": { - "jose": "^4.15.5", - "lru-cache": "^6.0.0", - "object-hash": "^2.2.0", - "oidc-token-hash": "^5.0.3" - }, - "funding": { - "url": "https://github.com/sponsors/panva" - } - }, - "node_modules/optionator": { - "version": "0.9.4", - "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz", - "integrity": "sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g==", - "dev": true, - "dependencies": { - "deep-is": "^0.1.3", - "fast-levenshtein": "^2.0.6", - "levn": "^0.4.1", - "prelude-ls": "^1.2.1", - "type-check": "^0.4.0", - "word-wrap": "^1.2.5" - }, - "engines": { - "node": ">= 0.8.0" - } - }, - "node_modules/p-limit": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", - "integrity": "sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==", - "dev": true, - "dependencies": { - "yocto-queue": "^0.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - 
"node_modules/p-locate": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-5.0.0.tgz", - "integrity": "sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw==", - "dev": true, - "dependencies": { - "p-limit": "^3.0.2" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/p-try": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz", - "integrity": "sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ==", - "dev": true, - "engines": { - "node": ">=6" - } - }, - "node_modules/package-json-from-dist": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/package-json-from-dist/-/package-json-from-dist-1.0.0.tgz", - "integrity": "sha512-dATvCeZN/8wQsGywez1mzHtTlP22H8OEfPrVMLNr4/eGa+ijtLn/6M5f0dY8UKNrC2O9UCU6SSoG3qRKnt7STw==" - }, - "node_modules/parent-module": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", - "integrity": "sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==", - "dev": true, - "dependencies": { - "callsites": "^3.0.0" - }, - "engines": { - "node": ">=6" - } - }, - "node_modules/parse-json": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-4.0.0.tgz", - "integrity": "sha512-aOIos8bujGN93/8Ox/jPLh7RwVnPEysynVFE+fQZyg6jKELEHwzgKdLRFHUgXJL6kylijVSBC4BvN9OmsB48Rw==", - "dev": true, - "dependencies": { - "error-ex": "^1.3.1", - "json-parse-better-errors": "^1.0.1" - }, - "engines": { - "node": ">=4" - } - }, - "node_modules/path-exists": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz", - "integrity": "sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==", - "dev": true, - "engines": { - "node": ">=8" - } - }, - "node_modules/path-is-absolute": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", - "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==", - "dev": true, - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/path-key": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz", - "integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==", - "engines": { - "node": ">=8" - } - }, - "node_modules/path-parse": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.7.tgz", - "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==", - "dev": true - }, - "node_modules/path-scurry": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-1.11.1.tgz", - "integrity": "sha512-Xa4Nw17FS9ApQFJ9umLiJS4orGjm7ZzwUrwamcGQuHSzDyth9boKDaycYdDcZDuqYATXw4HFXgaqWTctW/v1HA==", - "dependencies": { - "lru-cache": "^10.2.0", - "minipass": "^5.0.0 || ^6.0.2 || ^7.0.0" - }, - "engines": { - "node": ">=16 || 14 >=14.18" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, - "node_modules/path-scurry/node_modules/lru-cache": { - "version": "10.4.3", - "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.4.3.tgz", - "integrity": 
"sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==" - }, - "node_modules/performance-now": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/performance-now/-/performance-now-2.1.0.tgz", - "integrity": "sha512-7EAHlyLHI56VEIdK57uwHdHKIaAGbnXPiw0yWbarQZOKaKpvUIgW0jWRVLiatnM+XXlSwsanIBH/hzGMJulMow==" - }, - "node_modules/pify": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/pify/-/pify-4.0.1.tgz", - "integrity": "sha512-uB80kBFb/tfd68bVleG9T5GGsGPjJrLAUpR5PZIrhBnIaRTQRjqdJSsIKkOP6OAIFbj7GOrcudc5pNjZ+geV2g==", - "dev": true, - "engines": { - "node": ">=6" - } - }, - "node_modules/pkg-conf": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/pkg-conf/-/pkg-conf-3.1.0.tgz", - "integrity": "sha512-m0OTbR/5VPNPqO1ph6Fqbj7Hv6QU7gR/tQW40ZqrL1rjgCU85W6C1bJn0BItuJqnR98PWzw7Z8hHeChD1WrgdQ==", - "dev": true, - "dependencies": { - "find-up": "^3.0.0", - "load-json-file": "^5.2.0" - }, - "engines": { - "node": ">=6" - } - }, - "node_modules/pkg-conf/node_modules/find-up": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz", - "integrity": "sha512-1yD6RmLI1XBfxugvORwlck6f75tYL+iR0jqwsOrOxMZyGYqUuDhJ0l4AXdO1iX/FTs9cBAMEk1gWSEx1kSbylg==", - "dev": true, - "dependencies": { - "locate-path": "^3.0.0" - }, - "engines": { - "node": ">=6" - } - }, - "node_modules/pkg-conf/node_modules/locate-path": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-3.0.0.tgz", - "integrity": "sha512-7AO748wWnIhNqAuaty2ZWHkQHRSNfPVIsPIfwEOWO22AmaoVrWavlOcMR5nzTLNYvp36X220/maaRsrec1G65A==", - "dev": true, - "dependencies": { - "p-locate": "^3.0.0", - "path-exists": "^3.0.0" - }, - "engines": { - "node": ">=6" - } - }, - "node_modules/pkg-conf/node_modules/p-limit": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", - "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", - "dev": true, - "dependencies": { - "p-try": "^2.0.0" - }, - "engines": { - "node": ">=6" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/pkg-conf/node_modules/p-locate": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-3.0.0.tgz", - "integrity": "sha512-x+12w/To+4GFfgJhBEpiDcLozRJGegY+Ei7/z0tSLkMmxGZNybVMSfWj9aJn8Z5Fc7dBUNJOOVgPv2H7IwulSQ==", - "dev": true, - "dependencies": { - "p-limit": "^2.0.0" - }, - "engines": { - "node": ">=6" - } - }, - "node_modules/pkg-conf/node_modules/path-exists": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", - "integrity": "sha512-bpC7GYwiDYQ4wYLe+FA8lhRjhQCMcQGuSgGGqDkg/QerRWw9CmGRT0iSOVRSZJ29NMLZgIzqaljJ63oaL4NIJQ==", - "dev": true, - "engines": { - "node": ">=4" - } - }, - "node_modules/possible-typed-array-names": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/possible-typed-array-names/-/possible-typed-array-names-1.0.0.tgz", - "integrity": "sha512-d7Uw+eZoloe0EHDIYoe+bQ5WXnGMOpmiZFTuMWCwpjzzkL2nTjcKiAk4hh8TjnGye2TwWOk3UXucZ+3rbmBa8Q==", - "dev": true, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/prelude-ls": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.2.1.tgz", - "integrity": "sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==", - "dev": true, - "engines": { - "node": ">= 0.8.0" - } - 
}, - "node_modules/prop-types": { - "version": "15.8.1", - "resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.8.1.tgz", - "integrity": "sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==", - "dev": true, - "dependencies": { - "loose-envify": "^1.4.0", - "object-assign": "^4.1.1", - "react-is": "^16.13.1" - } - }, - "node_modules/psl": { - "version": "1.9.0", - "resolved": "https://registry.npmjs.org/psl/-/psl-1.9.0.tgz", - "integrity": "sha512-E/ZsdU4HLs/68gYzgGTkMicWTLPdAftJLfJFlLUAAKZGkStNU72sZjT66SnMDVOfOWY/YAoiD7Jxa9iHvngcag==" - }, - "node_modules/punycode": { - "version": "2.3.1", - "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", - "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", - "engines": { - "node": ">=6" - } - }, - "node_modules/qs": { - "version": "6.5.3", - "resolved": "https://registry.npmjs.org/qs/-/qs-6.5.3.tgz", - "integrity": "sha512-qxXIEh4pCGfHICj1mAJQ2/2XVZkjCDTcEgfoSQxc/fYivUZxTkk7L3bDBJSoNrEzXI17oUO5Dp07ktqE5KzczA==", - "engines": { - "node": ">=0.6" - } - }, - "node_modules/queue-microtask": { - "version": "1.2.3", - "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", - "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==", - "dev": true, - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ] - }, - "node_modules/react-is": { - "version": "16.13.1", - "resolved": "https://registry.npmjs.org/react-is/-/react-is-16.13.1.tgz", - "integrity": "sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==", - "dev": true - }, - "node_modules/reflect.getprototypeof": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/reflect.getprototypeof/-/reflect.getprototypeof-1.0.6.tgz", - "integrity": "sha512-fmfw4XgoDke3kdI6h4xcUz1dG8uaiv5q9gcEwLS4Pnth2kxT+GZ7YehS1JTMGBQmtV7Y4GFGbs2re2NqhdozUg==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "define-properties": "^1.2.1", - "es-abstract": "^1.23.1", - "es-errors": "^1.3.0", - "get-intrinsic": "^1.2.4", - "globalthis": "^1.0.3", - "which-builtin-type": "^1.1.3" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/regexp.prototype.flags": { - "version": "1.5.2", - "resolved": "https://registry.npmjs.org/regexp.prototype.flags/-/regexp.prototype.flags-1.5.2.tgz", - "integrity": "sha512-NcDiDkTLuPR+++OCKB0nWafEmhg/Da8aUPLPMQbK+bxKKCm1/S5he+AqYa4PlMCVBalb4/yxIRub6qkEx5yJbw==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.6", - "define-properties": "^1.2.1", - "es-errors": "^1.3.0", - "set-function-name": "^2.0.1" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/regexpp": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/regexpp/-/regexpp-3.2.0.tgz", - "integrity": "sha512-pq2bWo9mVD43nbts2wGv17XLiNLya+GklZ8kaDLV2Z08gDCsGpnKn9BFMepvWuHCbyVvY7J5o5+BVvoQbmlJLg==", - "dev": true, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/sponsors/mysticatea" - } - }, - "node_modules/request": { - "version": "2.88.2", - "resolved": 
"https://registry.npmjs.org/request/-/request-2.88.2.tgz", - "integrity": "sha512-MsvtOrfG9ZcrOwAW+Qi+F6HbD0CWXEh9ou77uOb7FM2WPhwT7smM833PzanhJLsgXjN89Ir6V2PczXNnMpwKhw==", - "deprecated": "request has been deprecated, see https://github.com/request/request/issues/3142", - "dependencies": { - "aws-sign2": "~0.7.0", - "aws4": "^1.8.0", - "caseless": "~0.12.0", - "combined-stream": "~1.0.6", - "extend": "~3.0.2", - "forever-agent": "~0.6.1", - "form-data": "~2.3.2", - "har-validator": "~5.1.3", - "http-signature": "~1.2.0", - "is-typedarray": "~1.0.0", - "isstream": "~0.1.2", - "json-stringify-safe": "~5.0.1", - "mime-types": "~2.1.19", - "oauth-sign": "~0.9.0", - "performance-now": "^2.1.0", - "qs": "~6.5.2", - "safe-buffer": "^5.1.2", - "tough-cookie": "~2.5.0", - "tunnel-agent": "^0.6.0", - "uuid": "^3.3.2" - }, - "engines": { - "node": ">= 6" - } - }, - "node_modules/request/node_modules/form-data": { - "version": "2.3.3", - "resolved": "https://registry.npmjs.org/form-data/-/form-data-2.3.3.tgz", - "integrity": "sha512-1lLKB2Mu3aGP1Q/2eCOx0fNbRMe7XdwktwOruhfqqd0rIJWwN4Dh+E3hrPSlDCXnSR7UtZ1N38rVXm+6+MEhJQ==", - "dependencies": { - "asynckit": "^0.4.0", - "combined-stream": "^1.0.6", - "mime-types": "^2.1.12" - }, - "engines": { - "node": ">= 0.12" - } - }, - "node_modules/resolve": { - "version": "1.22.8", - "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.8.tgz", - "integrity": "sha512-oKWePCxqpd6FlLvGV1VU0x7bkPmmCNolxzjMf4NczoDnQcIWrAF+cPtZn5i6n+RfD2d9i0tzpKnG6Yk168yIyw==", - "dev": true, - "dependencies": { - "is-core-module": "^2.13.0", - "path-parse": "^1.0.7", - "supports-preserve-symlinks-flag": "^1.0.0" - }, - "bin": { - "resolve": "bin/resolve" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/resolve-from": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz", - "integrity": "sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==", - "dev": true, - "engines": { - "node": ">=4" - } - }, - "node_modules/reusify": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/reusify/-/reusify-1.0.4.tgz", - "integrity": "sha512-U9nH88a3fc/ekCF1l0/UP1IosiuIjyTh7hBvXVMHYgVcfGvt897Xguj2UOLDeI5BG2m7/uwyaLVT6fbtCwTyzw==", - "dev": true, - "engines": { - "iojs": ">=1.0.0", - "node": ">=0.10.0" - } - }, - "node_modules/rfc4648": { - "version": "1.5.3", - "resolved": "https://registry.npmjs.org/rfc4648/-/rfc4648-1.5.3.tgz", - "integrity": "sha512-MjOWxM065+WswwnmNONOT+bD1nXzY9Km6u3kzvnx8F8/HXGZdz3T6e6vZJ8Q/RIMUSp/nxqjH3GwvJDy8ijeQQ==" - }, - "node_modules/rimraf": { - "version": "5.0.10", - "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-5.0.10.tgz", - "integrity": "sha512-l0OE8wL34P4nJH/H2ffoaniAokM2qSmrtXHmlpvYr5AVVX8msAyW0l8NVJFDxlSK4u3Uh/f41cQheDVdnYijwQ==", - "dependencies": { - "glob": "^10.3.7" - }, - "bin": { - "rimraf": "dist/esm/bin.mjs" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, - "node_modules/run-parallel": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", - "integrity": "sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==", - "dev": true, - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "dependencies": 
{ - "queue-microtask": "^1.2.2" - } - }, - "node_modules/safe-array-concat": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/safe-array-concat/-/safe-array-concat-1.1.2.tgz", - "integrity": "sha512-vj6RsCsWBCf19jIeHEfkRMw8DPiBb+DMXklQ/1SGDHOMlHdPUkZXFQ2YdplS23zESTijAcurb1aSgJA3AgMu1Q==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "get-intrinsic": "^1.2.4", - "has-symbols": "^1.0.3", - "isarray": "^2.0.5" - }, - "engines": { - "node": ">=0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/safe-buffer": { - "version": "5.2.1", - "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", - "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ] - }, - "node_modules/safe-regex-test": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/safe-regex-test/-/safe-regex-test-1.0.3.tgz", - "integrity": "sha512-CdASjNJPvRa7roO6Ra/gLYBTzYzzPyyBXxIMdGW3USQLyjWEls2RgW5UBTXaQVp+OrpeCK3bLem8smtmheoRuw==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.6", - "es-errors": "^1.3.0", - "is-regex": "^1.1.4" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/safer-buffer": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", - "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" - }, - "node_modules/semver": { - "version": "6.3.1", - "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", - "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", - "dev": true, - "bin": { - "semver": "bin/semver.js" - } - }, - "node_modules/set-function-length": { - "version": "1.2.2", - "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz", - "integrity": "sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==", - "dev": true, - "dependencies": { - "define-data-property": "^1.1.4", - "es-errors": "^1.3.0", - "function-bind": "^1.1.2", - "get-intrinsic": "^1.2.4", - "gopd": "^1.0.1", - "has-property-descriptors": "^1.0.2" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/set-function-name": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/set-function-name/-/set-function-name-2.0.2.tgz", - "integrity": "sha512-7PGFlmtwsEADb0WYyvCMa1t+yke6daIG4Wirafur5kcf+MhUnPms1UeR0CKQdTZD81yESwMHbtn+TR+dMviakQ==", - "dev": true, - "dependencies": { - "define-data-property": "^1.1.4", - "es-errors": "^1.3.0", - "functions-have-names": "^1.2.3", - "has-property-descriptors": "^1.0.2" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/shebang-command": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", - "integrity": "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==", - "dependencies": { - "shebang-regex": "^3.0.0" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/shebang-regex": { - "version": "3.0.0", - "resolved": 
"https://registry.npmjs.org/shebang-regex/-/shebang-regex-3.0.0.tgz", - "integrity": "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==", - "engines": { - "node": ">=8" - } - }, - "node_modules/side-channel": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.6.tgz", - "integrity": "sha512-fDW/EZ6Q9RiO8eFG8Hj+7u/oW+XrPTIChwCOM2+th2A6OblDtYYIpve9m+KvI9Z4C9qSEXlaGR6bTEYHReuglA==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "es-errors": "^1.3.0", - "get-intrinsic": "^1.2.4", - "object-inspect": "^1.13.1" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/signal-exit": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz", - "integrity": "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==", - "engines": { - "node": ">=14" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, - "node_modules/sshpk": { - "version": "1.18.0", - "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.18.0.tgz", - "integrity": "sha512-2p2KJZTSqQ/I3+HX42EpYOa2l3f8Erv8MWKsy2I9uf4wA7yFIkXRffYdsx86y6z4vHtV8u7g+pPlr8/4ouAxsQ==", - "dependencies": { - "asn1": "~0.2.3", - "assert-plus": "^1.0.0", - "bcrypt-pbkdf": "^1.0.0", - "dashdash": "^1.12.0", - "ecc-jsbn": "~0.1.1", - "getpass": "^0.1.1", - "jsbn": "~0.1.0", - "safer-buffer": "^2.0.2", - "tweetnacl": "~0.14.0" - }, - "bin": { - "sshpk-conv": "bin/sshpk-conv", - "sshpk-sign": "bin/sshpk-sign", - "sshpk-verify": "bin/sshpk-verify" - }, - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/standard": { - "version": "17.1.2", - "resolved": "https://registry.npmjs.org/standard/-/standard-17.1.2.tgz", - "integrity": "sha512-WLm12WoXveKkvnPnPnaFUUHuOB2cUdAsJ4AiGHL2G0UNMrcRAWY2WriQaV8IQ3oRmYr0AWUbLNr94ekYFAHOrA==", - "dev": true, - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "dependencies": { - "eslint": "^8.41.0", - "eslint-config-standard": "17.1.0", - "eslint-config-standard-jsx": "^11.0.0", - "eslint-plugin-import": "^2.27.5", - "eslint-plugin-n": "^15.7.0", - "eslint-plugin-promise": "^6.1.1", - "eslint-plugin-react": "^7.36.1", - "standard-engine": "^15.1.0", - "version-guard": "^1.1.1" - }, - "bin": { - "standard": "bin/cmd.cjs" - }, - "engines": { - "node": "^12.22.0 || ^14.17.0 || >=16.0.0" - } - }, - "node_modules/standard-engine": { - "version": "15.1.0", - "resolved": "https://registry.npmjs.org/standard-engine/-/standard-engine-15.1.0.tgz", - "integrity": "sha512-VHysfoyxFu/ukT+9v49d4BRXIokFRZuH3z1VRxzFArZdjSCFpro6rEIU3ji7e4AoAtuSfKBkiOmsrDqKW5ZSRw==", - "dev": true, - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "dependencies": { - "get-stdin": "^8.0.0", - "minimist": "^1.2.6", - "pkg-conf": "^3.1.0", - "xdg-basedir": "^4.0.0" - }, - "engines": { - "node": "^12.22.0 || ^14.17.0 || >=16.0.0" - } - }, - "node_modules/stream-buffers": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/stream-buffers/-/stream-buffers-3.0.3.tgz", - "integrity": 
"sha512-pqMqwQCso0PBJt2PQmDO0cFj0lyqmiwOMiMSkVtRokl7e+ZTRYgDHKnuZNbqjiJXgsg4nuqtD/zxuo9KqTp0Yw==", - "engines": { - "node": ">= 0.10.0" - } - }, - "node_modules/string-width": { - "version": "5.1.2", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-5.1.2.tgz", - "integrity": "sha512-HnLOCR3vjcY8beoNLtcjZ5/nxn2afmME6lhrDrebokqMap+XbeW8n9TXpPDOqdGK5qcI3oT0GKTW6wC7EMiVqA==", - "dependencies": { - "eastasianwidth": "^0.2.0", - "emoji-regex": "^9.2.2", - "strip-ansi": "^7.0.1" - }, - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/string-width-cjs": { - "name": "string-width", - "version": "4.2.3", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", - "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", - "dependencies": { - "emoji-regex": "^8.0.0", - "is-fullwidth-code-point": "^3.0.0", - "strip-ansi": "^6.0.1" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/string-width-cjs/node_modules/ansi-regex": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", - "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", - "engines": { - "node": ">=8" - } - }, - "node_modules/string-width-cjs/node_modules/emoji-regex": { - "version": "8.0.0", - "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", - "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==" - }, - "node_modules/string-width-cjs/node_modules/strip-ansi": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", - "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", - "dependencies": { - "ansi-regex": "^5.0.1" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/string.prototype.matchall": { - "version": "4.0.11", - "resolved": "https://registry.npmjs.org/string.prototype.matchall/-/string.prototype.matchall-4.0.11.tgz", - "integrity": "sha512-NUdh0aDavY2og7IbBPenWqR9exH+E26Sv8e0/eTe1tltDGZL+GtBkDAnnyBtmekfK6/Dq3MkcGtzXFEd1LQrtg==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "define-properties": "^1.2.1", - "es-abstract": "^1.23.2", - "es-errors": "^1.3.0", - "es-object-atoms": "^1.0.0", - "get-intrinsic": "^1.2.4", - "gopd": "^1.0.1", - "has-symbols": "^1.0.3", - "internal-slot": "^1.0.7", - "regexp.prototype.flags": "^1.5.2", - "set-function-name": "^2.0.2", - "side-channel": "^1.0.6" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/string.prototype.repeat": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/string.prototype.repeat/-/string.prototype.repeat-1.0.0.tgz", - "integrity": "sha512-0u/TldDbKD8bFCQ/4f5+mNRrXwZ8hg2w7ZR8wa16e8z9XpePWl3eGEcUD0OXpEH/VJH/2G3gjUtR3ZOiBe2S/w==", - "dev": true, - "dependencies": { - "define-properties": "^1.1.3", - "es-abstract": "^1.17.5" - } - }, - "node_modules/string.prototype.trim": { - "version": "1.2.9", - "resolved": "https://registry.npmjs.org/string.prototype.trim/-/string.prototype.trim-1.2.9.tgz", - "integrity": "sha512-klHuCNxiMZ8MlsOihJhJEBJAiMVqU3Z2nEXWfWnIqjN0gEFS9J9+IxKozWWtQGcgoa1WUZzLjKPTr4ZHNFTFxw==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "define-properties": 
"^1.2.1", - "es-abstract": "^1.23.0", - "es-object-atoms": "^1.0.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/string.prototype.trimend": { - "version": "1.0.8", - "resolved": "https://registry.npmjs.org/string.prototype.trimend/-/string.prototype.trimend-1.0.8.tgz", - "integrity": "sha512-p73uL5VCHCO2BZZ6krwwQE3kCzM7NKmis8S//xEC6fQonchbum4eP6kR4DLEjQFO3Wnj3Fuo8NM0kOSjVdHjZQ==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "define-properties": "^1.2.1", - "es-object-atoms": "^1.0.0" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/string.prototype.trimstart": { - "version": "1.0.8", - "resolved": "https://registry.npmjs.org/string.prototype.trimstart/-/string.prototype.trimstart-1.0.8.tgz", - "integrity": "sha512-UXSH262CSZY1tfu3G3Secr6uGLCFVPMhIqHjlgCUtCCcgihYc/xKs9djMTMUOb2j1mVSeU8EU6NWc/iQKU6Gfg==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "define-properties": "^1.2.1", - "es-object-atoms": "^1.0.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/strip-ansi": { - "version": "7.1.0", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.0.tgz", - "integrity": "sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==", - "dependencies": { - "ansi-regex": "^6.0.1" - }, - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/chalk/strip-ansi?sponsor=1" - } - }, - "node_modules/strip-ansi-cjs": { - "name": "strip-ansi", - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", - "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", - "dependencies": { - "ansi-regex": "^5.0.1" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/strip-ansi-cjs/node_modules/ansi-regex": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", - "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", - "engines": { - "node": ">=8" - } - }, - "node_modules/strip-bom": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-3.0.0.tgz", - "integrity": "sha512-vavAMRXOgBVNF6nyEEmL3DBK19iRpDcoIwW+swQ+CbGiu7lju6t+JklA1MHweoWtadgt4ISVUsXLyDq34ddcwA==", - "dev": true, - "engines": { - "node": ">=4" - } - }, - "node_modules/strip-json-comments": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz", - "integrity": "sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==", - "dev": true, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/supports-color": { - "version": "7.2.0", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", - "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", - "dev": true, - "dependencies": { - "has-flag": "^4.0.0" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/supports-preserve-symlinks-flag": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/supports-preserve-symlinks-flag/-/supports-preserve-symlinks-flag-1.0.0.tgz", - "integrity": 
"sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==", - "dev": true, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/tar": { - "version": "7.4.3", - "resolved": "https://registry.npmjs.org/tar/-/tar-7.4.3.tgz", - "integrity": "sha512-5S7Va8hKfV7W5U6g3aYxXmlPoZVAwUMy9AOKyF2fVuZa2UD3qZjg578OrLRt8PcNN1PleVaL/5/yYATNL0ICUw==", - "dependencies": { - "@isaacs/fs-minipass": "^4.0.0", - "chownr": "^3.0.0", - "minipass": "^7.1.2", - "minizlib": "^3.0.1", - "mkdirp": "^3.0.1", - "yallist": "^5.0.0" - }, - "engines": { - "node": ">=18" - } - }, - "node_modules/tar/node_modules/yallist": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/yallist/-/yallist-5.0.0.tgz", - "integrity": "sha512-YgvUTfwqyc7UXVMrB+SImsVYSmTS8X/tSrtdNZMImM+n7+QTriRXyXim0mBrTXNeqzVF0KWGgHPeiyViFFrNDw==", - "engines": { - "node": ">=18" - } - }, - "node_modules/text-table": { - "version": "0.2.0", - "resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz", - "integrity": "sha512-N+8UisAXDGk8PFXP4HAzVR9nbfmVJ3zYLAWiTIoqC5v5isinhr+r5uaO8+7r3BMfuNIufIsA7RdpVgacC2cSpw==", - "dev": true - }, - "node_modules/tough-cookie": { - "version": "2.5.0", - "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-2.5.0.tgz", - "integrity": "sha512-nlLsUzgm1kfLXSXfRZMc1KLAugd4hqJHDTvc2hDIwS3mZAfMEuMbc03SujMF+GEcpaX/qboeycw6iO8JwVv2+g==", - "dependencies": { - "psl": "^1.1.28", - "punycode": "^2.1.1" - }, - "engines": { - "node": ">=0.8" - } - }, - "node_modules/tsconfig-paths": { - "version": "3.15.0", - "resolved": "https://registry.npmjs.org/tsconfig-paths/-/tsconfig-paths-3.15.0.tgz", - "integrity": "sha512-2Ac2RgzDe/cn48GvOe3M+o82pEFewD3UPbyoUHHdKasHwJKjds4fLXWf/Ux5kATBKN20oaFGu+jbElp1pos0mg==", - "dev": true, - "dependencies": { - "@types/json5": "^0.0.29", - "json5": "^1.0.2", - "minimist": "^1.2.6", - "strip-bom": "^3.0.0" - } - }, - "node_modules/tslib": { - "version": "2.7.0", - "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.7.0.tgz", - "integrity": "sha512-gLXCKdN1/j47AiHiOkJN69hJmcbGTHI0ImLmbYLHykhgeN0jVGola9yVjFgzCUklsZQMW55o+dW7IXv3RCXDzA==" - }, - "node_modules/tunnel-agent": { - "version": "0.6.0", - "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz", - "integrity": "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==", - "dependencies": { - "safe-buffer": "^5.0.1" - }, - "engines": { - "node": "*" - } - }, - "node_modules/tweetnacl": { - "version": "0.14.5", - "resolved": "https://registry.npmjs.org/tweetnacl/-/tweetnacl-0.14.5.tgz", - "integrity": "sha512-KXXFFdAbFXY4geFIwoyNK+f5Z1b7swfXABfL7HXCmoIWMKU3dmS26672A4EeQtDzLKy7SXmfBu51JolvEKwtGA==" - }, - "node_modules/type-check": { - "version": "0.4.0", - "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.4.0.tgz", - "integrity": "sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==", - "dev": true, - "dependencies": { - "prelude-ls": "^1.2.1" - }, - "engines": { - "node": ">= 0.8.0" - } - }, - "node_modules/type-fest": { - "version": "0.20.2", - "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.20.2.tgz", - "integrity": "sha512-Ne+eE4r0/iWnpAxD852z3A+N0Bt5RN//NjJwRd2VFHEmrywxf5vsZlh4R6lixl6B+wz/8d+maTSAkN1FIkI3LQ==", - "dev": true, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - 
"node_modules/typed-array-buffer": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/typed-array-buffer/-/typed-array-buffer-1.0.2.tgz", - "integrity": "sha512-gEymJYKZtKXzzBzM4jqa9w6Q1Jjm7x2d+sh19AdsD4wqnMPDYyvwpsIc2Q/835kHuo3BEQ7CjelGhfTsoBb2MQ==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "es-errors": "^1.3.0", - "is-typed-array": "^1.1.13" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/typed-array-byte-length": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/typed-array-byte-length/-/typed-array-byte-length-1.0.1.tgz", - "integrity": "sha512-3iMJ9q0ao7WE9tWcaYKIptkNBuOIcZCCT0d4MRvuuH88fEoEH62IuQe0OtraD3ebQEoTRk8XCBoknUNc1Y67pw==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "for-each": "^0.3.3", - "gopd": "^1.0.1", - "has-proto": "^1.0.3", - "is-typed-array": "^1.1.13" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/typed-array-byte-offset": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/typed-array-byte-offset/-/typed-array-byte-offset-1.0.2.tgz", - "integrity": "sha512-Ous0vodHa56FviZucS2E63zkgtgrACj7omjwd/8lTEMEPFFyjfixMZ1ZXenpgCFBBt4EC1J2XsyVS2gkG0eTFA==", - "dev": true, - "dependencies": { - "available-typed-arrays": "^1.0.7", - "call-bind": "^1.0.7", - "for-each": "^0.3.3", - "gopd": "^1.0.1", - "has-proto": "^1.0.3", - "is-typed-array": "^1.1.13" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/typed-array-length": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/typed-array-length/-/typed-array-length-1.0.6.tgz", - "integrity": "sha512-/OxDN6OtAk5KBpGb28T+HZc2M+ADtvRxXrKKbUwtsLgdoxgX13hyy7ek6bFRl5+aBs2yZzB0c4CnQfAtVypW/g==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.7", - "for-each": "^0.3.3", - "gopd": "^1.0.1", - "has-proto": "^1.0.3", - "is-typed-array": "^1.1.13", - "possible-typed-array-names": "^1.0.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/unbox-primitive": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.0.2.tgz", - "integrity": "sha512-61pPlCD9h51VoreyJ0BReideM3MDKMKnh6+V9L08331ipq6Q8OFXZYiqP6n/tbHx4s5I9uRhcye6BrbkizkBDw==", - "dev": true, - "dependencies": { - "call-bind": "^1.0.2", - "has-bigints": "^1.0.2", - "has-symbols": "^1.0.3", - "which-boxed-primitive": "^1.0.2" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/undici-types": { - "version": "6.19.8", - "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.19.8.tgz", - "integrity": "sha512-ve2KP6f/JnbPBFyobGHuerC9g1FYGn/F8n1LWTwNxCEzd6IfqTwUQcNXgEtmmQ6DlRrC1hrSrBnCZPokRrDHjw==" - }, - "node_modules/uri-js": { - "version": "4.4.1", - "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz", - "integrity": "sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==", - "dependencies": { - "punycode": "^2.1.0" - } - }, - "node_modules/uuid": { - "version": "3.4.0", - "resolved": "https://registry.npmjs.org/uuid/-/uuid-3.4.0.tgz", - "integrity": "sha512-HjSDRw6gZE5JMggctHBcjVak08+KEVhSIiDzFnT9S9aegmp85S/bReBVTb4QTFaRNptJ9kuYaNhnbNEOkbKb/A==", - "deprecated": "Please upgrade to version 7 or higher. 
Older versions may use Math.random() in certain circumstances, which is known to be problematic. See https://v8.dev/blog/math-random for details.", - "bin": { - "uuid": "bin/uuid" - } - }, - "node_modules/verror": { - "version": "1.10.0", - "resolved": "https://registry.npmjs.org/verror/-/verror-1.10.0.tgz", - "integrity": "sha512-ZZKSmDAEFOijERBLkmYfJ+vmk3w+7hOLYDNkRCuRuMJGEmqYNCNLyBBFwWKVMhfwaEF3WOd0Zlw86U/WC/+nYw==", - "engines": [ - "node >=0.6.0" - ], - "dependencies": { - "assert-plus": "^1.0.0", - "core-util-is": "1.0.2", - "extsprintf": "^1.2.0" - } - }, - "node_modules/version-guard": { - "version": "1.1.3", - "resolved": "https://registry.npmjs.org/version-guard/-/version-guard-1.1.3.tgz", - "integrity": "sha512-JwPr6erhX53EWH/HCSzfy1tTFrtPXUe927wdM1jqBBeYp1OM+qPHjWbsvv6pIBduqdgxxS+ScfG7S28pzyr2DQ==", - "dev": true, - "engines": { - "node": ">=0.10.48" - } - }, - "node_modules/which": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", - "integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==", - "dependencies": { - "isexe": "^2.0.0" - }, - "bin": { - "node-which": "bin/node-which" - }, - "engines": { - "node": ">= 8" - } - }, - "node_modules/which-boxed-primitive": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/which-boxed-primitive/-/which-boxed-primitive-1.0.2.tgz", - "integrity": "sha512-bwZdv0AKLpplFY2KZRX6TvyuN7ojjr7lwkg6ml0roIy9YeuSr7JS372qlNW18UQYzgYK9ziGcerWqZOmEn9VNg==", - "dev": true, - "dependencies": { - "is-bigint": "^1.0.1", - "is-boolean-object": "^1.1.0", - "is-number-object": "^1.0.4", - "is-string": "^1.0.5", - "is-symbol": "^1.0.3" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/which-builtin-type": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/which-builtin-type/-/which-builtin-type-1.1.4.tgz", - "integrity": "sha512-bppkmBSsHFmIMSl8BO9TbsyzsvGjVoppt8xUiGzwiu/bhDCGxnpOKCxgqj6GuyHE0mINMDecBFPlOm2hzY084w==", - "dev": true, - "dependencies": { - "function.prototype.name": "^1.1.6", - "has-tostringtag": "^1.0.2", - "is-async-function": "^2.0.0", - "is-date-object": "^1.0.5", - "is-finalizationregistry": "^1.0.2", - "is-generator-function": "^1.0.10", - "is-regex": "^1.1.4", - "is-weakref": "^1.0.2", - "isarray": "^2.0.5", - "which-boxed-primitive": "^1.0.2", - "which-collection": "^1.0.2", - "which-typed-array": "^1.1.15" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/which-collection": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/which-collection/-/which-collection-1.0.2.tgz", - "integrity": "sha512-K4jVyjnBdgvc86Y6BkaLZEN933SwYOuBFkdmBu9ZfkcAbdVbpITnDmjvZ/aQjRXQrv5EPkTnD1s39GiiqbngCw==", - "dev": true, - "dependencies": { - "is-map": "^2.0.3", - "is-set": "^2.0.3", - "is-weakmap": "^2.0.2", - "is-weakset": "^2.0.3" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/which-typed-array": { - "version": "1.1.15", - "resolved": "https://registry.npmjs.org/which-typed-array/-/which-typed-array-1.1.15.tgz", - "integrity": "sha512-oV0jmFtUky6CXfkqehVvBP/LSWJ2sy4vWMioiENyJLePrBO/yKyV9OyJySfAKosh+RYkIl5zJCNZ8/4JncrpdA==", - "dev": true, - "dependencies": { - "available-typed-arrays": "^1.0.7", - "call-bind": "^1.0.7", - "for-each": "^0.3.3", - "gopd": "^1.0.1", - "has-tostringtag": "^1.0.2" - }, - "engines": { - 
"node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/word-wrap": { - "version": "1.2.5", - "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.5.tgz", - "integrity": "sha512-BN22B5eaMMI9UMtjrGd5g5eCYPpCPDUy0FJXbYsaT5zYxjFOckS53SQDE3pWkVoWpHXVb3BrYcEN4Twa55B5cA==", - "dev": true, - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/wrap-ansi": { - "version": "8.1.0", - "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-8.1.0.tgz", - "integrity": "sha512-si7QWI6zUMq56bESFvagtmzMdGOtoxfR+Sez11Mobfc7tm+VkUckk9bW2UeffTGVUbOksxmSw0AA2gs8g71NCQ==", - "dependencies": { - "ansi-styles": "^6.1.0", - "string-width": "^5.0.1", - "strip-ansi": "^7.0.1" - }, - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/chalk/wrap-ansi?sponsor=1" - } - }, - "node_modules/wrap-ansi-cjs": { - "name": "wrap-ansi", - "version": "7.0.0", - "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", - "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==", - "dependencies": { - "ansi-styles": "^4.0.0", - "string-width": "^4.1.0", - "strip-ansi": "^6.0.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/wrap-ansi?sponsor=1" - } - }, - "node_modules/wrap-ansi-cjs/node_modules/ansi-regex": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", - "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", - "engines": { - "node": ">=8" - } - }, - "node_modules/wrap-ansi-cjs/node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/wrap-ansi-cjs/node_modules/emoji-regex": { - "version": "8.0.0", - "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", - "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==" - }, - "node_modules/wrap-ansi-cjs/node_modules/string-width": { - "version": "4.2.3", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", - "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", - "dependencies": { - "emoji-regex": "^8.0.0", - "is-fullwidth-code-point": "^3.0.0", - "strip-ansi": "^6.0.1" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/wrap-ansi-cjs/node_modules/strip-ansi": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", - "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", - "dependencies": { - "ansi-regex": "^5.0.1" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/wrappy": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", - "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==", - "dev": true - }, - "node_modules/ws": { - "version": "8.18.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.0.tgz", - 
"integrity": "sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==", - "engines": { - "node": ">=10.0.0" - }, - "peerDependencies": { - "bufferutil": "^4.0.1", - "utf-8-validate": ">=5.0.2" - }, - "peerDependenciesMeta": { - "bufferutil": { - "optional": true - }, - "utf-8-validate": { - "optional": true - } - } - }, - "node_modules/xdg-basedir": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/xdg-basedir/-/xdg-basedir-4.0.0.tgz", - "integrity": "sha512-PSNhEJDejZYV7h50BohL09Er9VaIefr2LMAf3OEmpCkjOi34eYyQYAXUTjEQtZJTKcF0E2UKTh+osDLsgNim9Q==", - "dev": true, - "engines": { - "node": ">=8" - } - }, - "node_modules/yallist": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz", - "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==", - "optional": true - }, - "node_modules/yocto-queue": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz", - "integrity": "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==", - "dev": true, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - } - } -} diff --git a/tools/cluster-checker/package.json b/tools/cluster-checker/package.json deleted file mode 100644 index 231ef88..0000000 --- a/tools/cluster-checker/package.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "dependencies": { - "@kubernetes/client-node": "^0.21.0", - "kubernetes-resource-parser": "0.1.0" - }, - "devDependencies": { - "standard": "^17.1.2" - } -} diff --git a/tools/gotmpl/.gitignore b/tools/gotmpl/.gitignore deleted file mode 100644 index b4c6eb4..0000000 --- a/tools/gotmpl/.gitignore +++ /dev/null @@ -1 +0,0 @@ -gotmpl \ No newline at end of file diff --git a/tools/gotmpl/README.md b/tools/gotmpl/README.md deleted file mode 100644 index 6ac6911..0000000 --- a/tools/gotmpl/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Go Template Tool - -A simple CLI wrapping Go templates that is used to generate the SETUP -files for the MLBatch project. 
- diff --git a/tools/gotmpl/go.mod b/tools/gotmpl/go.mod deleted file mode 100644 index 6d690b9..0000000 --- a/tools/gotmpl/go.mod +++ /dev/null @@ -1,5 +0,0 @@ -module github.com/project-codeflare/mlbatch/tools/gotmpl - -go 1.22.4 - -require sigs.k8s.io/yaml v1.4.0 // indirect diff --git a/tools/gotmpl/go.sum b/tools/gotmpl/go.sum deleted file mode 100644 index 8c72424..0000000 --- a/tools/gotmpl/go.sum +++ /dev/null @@ -1,4 +0,0 @@ -github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= -sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= diff --git a/tools/gotmpl/gotmpl.go b/tools/gotmpl/gotmpl.go deleted file mode 100644 index eb2dfd8..0000000 --- a/tools/gotmpl/gotmpl.go +++ /dev/null @@ -1,56 +0,0 @@ -package main - -import ( - "flag" - "log" - "os" - "text/template" - - "sigs.k8s.io/yaml" -) - -func main() { - var input string - var output string - var values string - flag.StringVar(&input, "input", "", "The input template file") - flag.StringVar(&output, "output", "", "The output file") - flag.StringVar(&values, "values", "", "The values.yaml file") - - flag.CommandLine.SetOutput(os.Stderr) - flag.Parse() - - if input == "" { - log.Fatal("Must provide input template filename") - } - - if output == "" { - log.Fatal("Must provide output filename") - } - - if values == "" { - log.Fatal("Must provide input values filename") - } - - tmpl, err := template.ParseFiles(input) - if err != nil { - log.Fatalf("Parsing input template: %v", err) - } - - valueBytes, err := os.ReadFile(values) - if err != nil { - log.Fatalf("Reading values: %v", err) - } - var vals map[string]interface{} - err = yaml.Unmarshal(valueBytes, &vals) - if err != nil { - log.Fatalf("Processing values: %v", err) - } - - outfile, err := os.Create(output) - if err != nil { - log.Fatalf("Creating output file: %v", err) - } - - tmpl.Execute(outfile, vals) -} diff --git a/tools/pytorchjob-generator/README.md b/tools/pytorchjob-generator/README.md deleted file mode 100644 index 84ac186..0000000 --- a/tools/pytorchjob-generator/README.md +++ /dev/null @@ -1,72 +0,0 @@ -# PyTorchJob Generator - -The Helm chart defined in this folder facilitates the configuration of PyTorch -jobs for submission to an OpenShift cluster implementing MLBatch. - -Invocations of this chart generate a `PyTorchJob` wrapped into an `AppWrapper` -for better traceability and fault-tolerance. - -## Obtaining the Chart - -To start with, add the `mlbatch` Helm chart repository. -```sh -helm repo add mlbatch https://project-codeflare.github.io/mlbatch -helm repo update -``` -To verify the chart was installed correctly, search for `AppWrapper`. 
-```sh -helm search repo AppWrapper -``` -You should see output similar to the following: -```sh -NAME CHART VERSION APP VERSION DESCRIPTION -mlbatch/pytorchjob-generator 1.1.9 v1beta2 An AppWrapper generator for PyTorchJobs -``` - -## Configuring the Job - -Create a `settings.yaml` file with the settings for the PyTorch job, for -example: -```yaml -jobName: my-job # name of the generated AppWrapper and PyTorchJob objects (required) -queueName: default-queue # local queue to submit to (default: default-queue) - -numPods: 4 # total pod count including master and worker pods (default: 1) -numCpusPerPod: 500m # requested number of cpus per pod (default: 1) -numGpusPerPod: 8 # requested number of gpus per pod (default: 0) -totalMemoryPerPod: 1Gi # requested amount of memory per pod (default: 1Gi) - -priority: default-priority # default-priority (default), low-priority, or high-priority - -# container image for the pods (required) -containerImage: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - -# setup commands to run in each pod (optional) -setupCommands: -- git clone https://github.com/dbarnett/python-helloworld -- cd python-helloworld - -# main program to invoke via torchrun (optional) -mainProgram: helloworld.py -``` - -To learn more about the available settings see [chart/README.md](chart/README.md). - -## Submitting the Job - -To submit the Pytorch job to the cluster using the `settings.yaml` file, run: -```sh -helm template -f settings.yaml mlbatch/pytorchjob-generator | oc create -f- -``` -+ -To optionally capture the generated `AppWrapper` specification as a -`generated.yaml` file, run instead: -```sh -helm template -f settings.yaml mlbatch/pytorchjob-generator | tee generated.yaml | oc create -f- -``` - -To remove the PyTorch job from the cluster, delete the generated `AppWrapper` -object: -```sh -oc delete appwrapper my-job -``` diff --git a/tools/pytorchjob-generator/chart/.helmignore b/tools/pytorchjob-generator/chart/.helmignore deleted file mode 100644 index 2b29f27..0000000 --- a/tools/pytorchjob-generator/chart/.helmignore +++ /dev/null @@ -1 +0,0 @@ -tests diff --git a/tools/pytorchjob-generator/chart/Chart.yaml b/tools/pytorchjob-generator/chart/Chart.yaml deleted file mode 100644 index 6d45f81..0000000 --- a/tools/pytorchjob-generator/chart/Chart.yaml +++ /dev/null @@ -1,6 +0,0 @@ -apiVersion: v2 -name: pytorchjob-generator -description: An AppWrapper generator for PyTorchJobs -type: application -version: 1.1.9 -appVersion: "v1beta2" diff --git a/tools/pytorchjob-generator/chart/README.md b/tools/pytorchjob-generator/chart/README.md deleted file mode 100644 index f710179..0000000 --- a/tools/pytorchjob-generator/chart/README.md +++ /dev/null @@ -1,79 +0,0 @@ -# pytorchjob-generator - -An AppWrapper generator for PyTorchJobs - -![Version: 1.1.9](https://img.shields.io/badge/Version-1.1.9-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v1beta2](https://img.shields.io/badge/AppVersion-v1beta2-informational?style=flat-square) - -## Overview - -This file documents the variables that may be set in a user's `settings.yaml` to -customize the Jobs generated by the tool. - -## Values - -### Job Metadata - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| jobName | string | must be provided by user | Name of the Job. Will be the name of the AppWrapper and the PyTorchJob. 
| -| namespace | string | `nil` | Namespace in which to run the Job. If unspecified, the namespace will be inferred using normal Helm/Kubernetes mechanisms when the Job is submitted. | -| queueName | string | `"default-queue"` | Name of the local queue to which the Job will be submitted. | -| priority | string | `"default-priority"` | Type of priority for the job (choose from: "default-priority", "low-priority" or "high-priority"). | -| customLabels | array | `nil` | Optional array of custom labels to add to all the resources created by the Job (the PyTorchJob, the PodGroup, and the AppWrapper). | -| containerImage | string | must be provided by the user | Image used for creating the Job's containers (needs to have all the applications your job may need) | -| imagePullSecrets | array | `nil` | List of image-pull-secrets to be used for pulling containerImages | -| imagePullPolicy | string | `"IfNotPresent"` | Policy for pulling containerImages (choose from: "IfNotPresent", "Always", or "Never") | - -### Resource Requirements - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| numPods | integer | `1` | Total number of pods (i.e. master + worker pods) to be created | -| numCpusPerPod | integer or string | `1` | Number of CPUs for each pod. May be a positive integer or a ResourceQuantity (eg 500m) | -| numGpusPerPod | integer | `0` | Number of GPUs for each pod (all GPUs per node is currently recommended for distributed training). | -| totalMemoryPerPod | string | `"1Gi"` | Total memory for each pod expressed as a ResourceQuantity (eg 1Gi, 200M, etc.). | -| limitCpusPerPod | integer or string | numCpusPerPod | Limit on the number of CPUs per pod for elastic jobs. May be a positive integer or a ResourceQuantity (eg 500m). | -| limitGpusPerPod | integer | numGpusPerPod | Limit of number of GPUs per pod for elastic jobs. | -| limitMemoryPerPod | string | totalMemoryPerPod | Limit of total memory per pod for elastic jobs (eg 1Gi, 200M, etc.). | - -### Workload Specification - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| environmentVariables | array | `nil` | List of variables/values to be defined for all the ranks. Values can be literals or references to Kuberetes secrets or configmaps. See [values.yaml](values.yaml) for examples of supported syntaxes. NOTE: The following standard [PyTorch Distributed environment variables](https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization) are set automatically and can be referenced in the commands without being set manually: WORLD_SIZE, RANK, MASTER_ADDR, MASTER_PORT. | -| sshGitCloneConfig | object | `nil` | Private GitHub clone support. See [values.yaml](values.yaml) for additional instructions. | -| setupCommands | array | no custom commands are executed | List of custom commands to be ran at the beginning of the execution. Use `setupCommand` to clone code, download data, and change directories. | -| mainProgram | string | `nil` | Name of the PyTorch program to be executed by `torchrun`. Please provide your program name here and NOT in "setupCommands" as this helm template provides the necessary "torchrun" arguments for the parallel execution. WARNING: this program is relative to the current path set by change-of-directory commands in "setupCommands". If no value is provided; then only `setupCommands` are executed and torchrun is elided. 
| -| volumes | array | No volumes are mounted | List of "(name, claimName, mountPath)" of volumes, with persistentVolumeClaim, to be mounted to the infrastructure | - -### Advanced Options - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| roceGdrResName | string | nvidia.com/roce_gdr | RoCE GDR resource name (can vary by cluster configuration) | -| numRoceGdr | integer | `0` | number of nvidia.com/roce_grd resources (0 means disabled; >0 means enable GDR over RoCE). Must be 0 unless numPods > 1. | -| topologyFileConfigMap | string | `nil` | Name of configmap containining /var/run/nvidia-topologyd/virtualTopology.xml for the system e.g. nvidia-topo-gdr | -| ncclGdrEnvConfigMap | string | `nil` | Name of configmap containing NCCL networking environment variables for the system e.g. nccl-netwk-env-vars | -| multiNicNetworkName | string | `nil` | Name of multi-NIC network, if one is available. Note: when GDR over RoCE is used/available, the RoCE multi-nic network instance should be specified here instead of the TCP multi-nic network instance. Existing instance names can be listed with `oc get multinicnetwork`. | -| disableSharedMemory | boolean | `false` | Control whether or not a shared memory volume is added to the PyTorchJob. | -| mountNVMe | object | `nil` | Mount NVMe as a volume. The environment variable MOUNT_PATH_NVME provides the runtime mount path | -| initContainers | array | `nil` | List of "(name, image, command[])" specifying an init containers to be run before the main job. The 'command' field is a list of commands to run in the container, see the Kubernetes entry on initContainers for reference. | -| autopilotHealthChecks | array | No pre-flight checks are enabled. | Autopilot health checks. List of labels enabling one or more system health pre-flight checks. | -| hostIgnoreList | array | `nil` | List of host names on which the Job must not be scheduled (to avoid faulty nodes). | -| schedulerName | string | `nil` | If non-nil, use the specified Kubernetes scheduler. ***Setting this to the default-scheduler may result in GPU fragmentation on the cluster. Setting this to any non-nil value should only be done when explicitly directed to do so by a cluster admin!*** | -| serviceAccountName | string | the default service account for the namespace will be used. 
| Service account to be used for running the Job | - -### Fault Tolerance - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| admissionGracePeriodDuration | string | The AppWrapper defaults will be used | Customize the admissionGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| warmupGracePeriodDuration | string | The AppWrapper defaults will be used | Customize the warmupGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| failureGracePeriodDuration | string | The AppWrapper defaults will be used | Customize the failureGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| retryPausePeriodDuration | string | The AppWrapper defaults will be used | Customize the retryPausePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| retryLimit | integer | The AppWrapper defaults will be used | Customize the retryLimit; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| forcefulDeletionGracePeriodDuration | string | The AppWrapper defaults will be used | Customize the forcefulDeletionGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| deletionOnFailureGracePeriodDuration | string | The AppWrapper defaults will be used | Customize the deletionOnFailureGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| successTTLDuration | string | The AppWrapper defaults will be used | Customize the successTTL; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| restartPolicy | string | `"Never"` | Set Kubernetes policy for restarting failed containers "in place" (without restarting the Pod). | -| terminationGracePeriodSeconds | integer | Kubernetes's default value is used | Set a non-default pod termination grace period (in seconds). | diff --git a/tools/pytorchjob-generator/chart/README.md.gotmpl b/tools/pytorchjob-generator/chart/README.md.gotmpl deleted file mode 100644 index dbf620e..0000000 --- a/tools/pytorchjob-generator/chart/README.md.gotmpl +++ /dev/null @@ -1,11 +0,0 @@ -{{ template "chart.header" . }} -{{ template "chart.description" . }} - -{{ template "chart.versionBadge" . }}{{ template "chart.typeBadge" . }}{{ template "chart.appVersionBadge" . }} - -## Overview - -This file documents the variables that may be set in a user's `settings.yaml` to -customize the Jobs generated by the tool. - -{{ template "chart.valuesSection" . }} diff --git a/tools/pytorchjob-generator/chart/templates/_helpers.tpl b/tools/pytorchjob-generator/chart/templates/_helpers.tpl deleted file mode 100644 index 68b09ab..0000000 --- a/tools/pytorchjob-generator/chart/templates/_helpers.tpl +++ /dev/null @@ -1,309 +0,0 @@ -# This file factors out code snippets that are duplicated in both the Master and Worker templates. - -{{- define "mlbatch.customLabels" }} -{{- if .Values.customLabels }} -{{- range $customLabel := .Values.customLabels }} -{{ $customLabel.key }}: {{ $customLabel.value }} -{{- end }} -{{- end }} -{{- end -}} - - -{{- define "mlbatch.container.metadata" }} -{{- if or .Values.customLabels .Values.autopilotHealthChecks .Values.multiNicNetworkName }} -metadata: - {{- if or .Values.customLabels .Values.autopilotHealthChecks }} - labels: - {{- include "mlbatch.customLabels" . 
| indent 8 }} - {{- if .Values.autopilotHealthChecks }} - autopilot: "" - {{- range $healthcheck := .Values.autopilotHealthChecks }} - {{ $healthcheck }}: "" - {{- end }} - {{- end }} - {{- end }} - {{- if .Values.multiNicNetworkName }} - annotations: - k8s.v1.cni.cncf.io/networks: {{ .Values.multiNicNetworkName }} - {{- end }} -{{- end }} -{{- end -}} - - -{{- define "mlbatch.schedulingSpec" }} -{{- if ne .Values.terminationGracePeriodSeconds nil }} -terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }} -{{- end }} -{{- if .Values.schedulerName }} -schedulerName: {{ .Values.schedulerName }} -{{- end }} -priorityClassName: {{ .Values.priority }} -affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT -{{- if .Values.hostIgnoreList }} - - key: kubernetes.io/hostname - operator: NotIn - values: - {{- range $host := .Values.hostIgnoreList }} - - {{ $host }} - {{- end }} -{{- end }} -{{- end -}} - - -{{- define "mlbatch.resources" }} -resources: - requests: - cpu: {{ .Values.numCpusPerPod }} - nvidia.com/gpu: {{ .Values.numGpusPerPod }} - memory: {{ .Values.totalMemoryPerPod }} - {{ .Values.roceGdrResName | default "nvidia.com/roce_gdr" }}: {{ .Values.numRoceGdr | default 0 }} - limits: - cpu: {{ .Values.limitCpusPerPod | default .Values.numCpusPerPod }} - nvidia.com/gpu: {{ .Values.limitGpusPerPod | default .Values.numGpusPerPod }} - memory: {{ .Values.limitMemoryPerPod | default .Values.totalMemoryPerPod }} - {{ .Values.roceGdrResName | default "nvidia.com/roce_gdr" }}: {{ .Values.numRoceGdr | default 0 }} -{{- end -}} - - -{{- define "mlbatch.env" }} -{{- if .Values.ncclGdrEnvConfigMap }} -envFrom: - - configMapRef: - name: {{ .Values.ncclGdrEnvConfigMap }} -{{- end }} -{{- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap ( eq .Values.schedulerName "sakkara" ) }} -env: - {{- if eq .Values.schedulerName "sakkara" }} - - name: SAKKARA_RANK - valueFrom: - fieldRef: - fieldPath: metadata.labels['sakkara.member.rank'] - {{- end }} - {{- if .Values.topologyFileConfigMap }} - - name: NCCL_TOPO_FILE - value: /var/run/nvidia-topologyd/virtualTopology.xml - {{- end }} - {{- if .Values.mountNVMe }} - - name: NVME_MOUNT_PATH - {{- if .Values.mountNVMe.mountPath }} - value: {{ .Values.mountNVMe.mountPath | quote }} - {{- else }} - value: "/workspace/scratch-nvme" - {{- end }} - {{- end }} - {{- range $variable := .Values.environmentVariables }} - - name: {{ required "Missing 'name' in 'environmentVariables' list element" $variable.name }} - {{- if $variable.value }} - value: {{ $variable.value | quote }} - {{- else if $variable.secret }} - valueFrom: - secretKeyRef: - name: {{ required "Missing 'name' in 'environmentVariables.secret' list element" $variable.secret.name }} - key: {{ required "Missing 'key' in 'environmentVariables.secret' list element" $variable.secret.key | quote }} - {{- else if $variable.configmap }} - valueFrom: - configMapKeyRef: - name: {{ required "Missing 'name' in 'environmentVariables.configmap' list element" $variable.configmap.name }} - key: {{ required "Missing 'key' in 'environmentVariables.configmap' list element" $variable.configmap.key | quote }} - {{- else if ( kindIs "float64" $variable.value ) }} - value: "0" - {{- else }} - value: {{ required "Missing 'value' in 'environmentVariables' list element" "" }} - {{- end }} - {{- end }} - 
{{- if .Values.sshGitCloneConfig }} - - name: GIT_SSH_COMMAND - {{- if .Values.sshGitCloneConfig.sshCmd }} - value: {{ .Values.sshGitCloneConfig.sshCmd | quote }} - {{- else if .Values.sshGitCloneConfig.secretMountPath }} - {{- if .Values.sshGitCloneConfig.configMapMountPath }} - value: "ssh -i {{ .Values.sshGitCloneConfig.secretMountPath }}/id_rsa -o UserKnownHostsFile={{ .Values.sshGitCloneConfig.configMapMountPath }}/known_hosts -vv" - {{- else }} - value: "ssh -i {{ .Values.sshGitCloneConfig.secretMountPath }}/id_rsa -o UserKnownHostsFile=/tmp/.ssh/hosts/known_hosts -vv" - {{- end }} - {{- else if .Values.sshGitCloneConfig.configMapMountPath }} - value: "ssh -i /tmp/.ssh/keys/id_rsa -o UserKnownHostsFile={{ .Values.sshGitCloneConfig.configMapMountPath }}/known_hosts -vv" - {{- else }} - value: "ssh -i /tmp/.ssh/keys/id_rsa -o UserKnownHostsFile=/tmp/.ssh/hosts/known_hosts -vv" - {{- end }} - {{- end }} -{{- else }} -env: [] -{{- end }} -{{- end -}} - - -{{- define "mlbatch.command" }} -command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - {{- if eq .Values.schedulerName "sakkara" }} - echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank" - export RANK=$SAKKARA_RANK - {{- end }} - {{- range $command := .Values.setupCommands }} - {{ $command }} - {{- end }} - {{- if .Values.mainProgram }} - {{- if gt ( int .Values.numGpusPerPod ) 0 }} - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node={{ .Values.numGpusPerPod }} --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" {{ .Values.mainProgram }} - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node={{ .Values.numGpusPerPod }} --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" {{ .Values.mainProgram }} - {{- else }} - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" {{ .Values.mainProgram }} - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" {{ .Values.mainProgram }} - {{- end }} - {{- end }} -{{- end -}} - - -{{- define "mlbatch.volumeMounts" }} -{{- if or .Values.volumes .Values.sshGitCloneConfig ( not .Values.disableSharedMemory ) .Values.mountNVMe }} -volumeMounts: - {{- if .Values.topologyFileConfigMap }} - - name: topology-volume - mountPath: /var/run/nvidia-topologyd - {{- end }} - {{- if .Values.mountNVMe }} - - name: ephemeral-odf-lvm-vg1 - {{- if .Values.mountNVMe.mountPath }} - mountPath: {{ .Values.mountNVMe.mountPath | quote }} - {{- else }} - mountPath: "/workspace/scratch-nvme" - {{- end }} - {{- end }} - {{- range $volume := .Values.volumes }} - - name: {{ required "Missing 'name' in 'volumes' list element" $volume.name }} - mountPath: {{ required "Missing 'mountPath' in 'volumes' list element" $volume.mountPath }} - {{- end }} - {{- if .Values.sshGitCloneConfig }} - - name: private-ssh-git-deploy-key - readOnly: true - {{- if .Values.sshGitCloneConfig.secretMountPath }} - mountPath: {{ .Values.sshGitCloneConfig.secretMountPath }} - {{- else }} - mountPath: "/tmp/.ssh/keys" - {{- end }} - - name: github-known-hosts - {{- if .Values.sshGitCloneConfig.configMapMountPath }} - mountPath: 
{{ .Values.sshGitCloneConfig.configMapMountPath }} - {{- else }} - mountPath: "/tmp/.ssh/hosts" - {{- end }} - {{- end }} - {{- if eq .Values.disableSharedMemory false }} - - name: dshm - mountPath: "/dev/shm" - {{- end }} -{{- else }} -volumeMounts: [] -{{- end }} -{{- end -}} - - -{{- define "mlbatch.volumes" }} -{{- if or .Values.volumes .Values.sshGitCloneConfig ( not .Values.disableSharedMemory ) .Values.mountNVMe }} -volumes: - {{- if .Values.topologyFileConfigMap }} - - name: topology-volume - configMap: - name: {{ .Values.topologyFileConfigMap }} - {{- end }} - {{- if .Values.mountNVMe }} - - name: ephemeral-odf-lvm-vg1 - ephemeral: - volumeClaimTemplate: - spec: - storageClassName: odf-lvm-vg1 - volumeMode: Filesystem - accessModes: [ "ReadWriteOnce" ] - resources: - requests: - storage: {{ .Values.mountNVMe.storage }} - {{- end }} - {{- range $volume := .Values.volumes }} - - name: {{ required "Missing 'name' in 'volumes' list element" $volume.name }} - persistentVolumeClaim: - claimName: {{ required "Missing 'claimName' in 'volumes' list element" $volume.claimName }} - {{- end }} - {{- if .Values.sshGitCloneConfig }} - - name: private-ssh-git-deploy-key - secret: - secretName: {{ required "Missing 'secretName' in 'sshGitCloneConfig' " .Values.sshGitCloneConfig.secretName }} - optional: false - - name: github-known-hosts - configMap: - name: {{ required "Missing 'configMapName' in 'sshGitCloneConfig' " .Values.sshGitCloneConfig.configMapName }} - {{- end }} - -{{- if eq .Values.disableSharedMemory false }} - - name: dshm - emptyDir: - medium: Memory - {{- end }} -{{- else }} -volumes: [] -{{- end }} -{{- end -}} - - -{{- define "mlbatch.initContainers" }} -{{- if .Values.initContainers }} -initContainers: - {{- range $container := .Values.initContainers }} - - name: {{ required "Missing 'name' of initContainer" $container.name }} - image: {{ required "Missing 'image' of initContainer" $container.image }} - {{- if ( required "Missing 'command' array of initContainer" $container.command ) }} - {{- if kindIs "string" $container.command }} - command: {{ $container.command }} - {{- else }} - command: - {{- range $command := $container.command }} - - {{ $command }} - {{- end }} - {{- end }} - {{- end }} - {{- end }} -{{- end }} -{{- end -}} - - -{{- define "mlbatch.imagePullSecrets" }} -{{- if .Values.imagePullSecrets }} -imagePullSecrets: - {{- range $secret := .Values.imagePullSecrets }} - - name: {{ $secret.name }} - {{- end }} -{{- else }} -imagePullSecrets: [] -{{- end }} -{{- end -}} - - -{{- define "mlbatch.securityContext" }} -{{- if or (gt ( int .Values.numRoceGdr ) 0) (eq .Values.serviceAccountName "gdr") }} -securityContext: - capabilities: - add: - - IPC_LOCK -{{- end }} -{{- end -}} diff --git a/tools/pytorchjob-generator/chart/templates/appwrapper.yaml b/tools/pytorchjob-generator/chart/templates/appwrapper.yaml deleted file mode 100644 index 7702e3e..0000000 --- a/tools/pytorchjob-generator/chart/templates/appwrapper.yaml +++ /dev/null @@ -1,149 +0,0 @@ -{{- if .Values.jobName -}} -{{- if eq ( regexMatch "^[a-z]([-a-z0-9]*[a-z0-9])?$" .Values.jobName ) false -}} -{{ required "The 'jobName' provided is NOT correct. 
Some possible causes are: it begins with a number or a special character (including '-'), has one or more capital letters somewhere in the name, has one or more special characters other than '-', it ends with a special character (including '-')" "" }} -{{- else -}} -{{- if gt ( len .Values.jobName ) 50 -}} -{{ required "Your 'jobName' cannot be longer than 50 characters" "" -}} -{{- end -}} -{{- end -}} -{{- else -}} -{{ required "Please specify an 'jobName' in the user file" "" -}} -{{- end -}} - -{{- if .Values.mountNVMe -}} -{{- if (not .Values.mountNVMe.storage) -}} -{{ required "A 'storage' value is required for mountNVMe" "" }} -{{- end -}} -{{- end -}} - -{{- if .Values.customLabels -}} -{{- range $customLabel := .Values.customLabels -}} -{{- if not $customLabel.key -}} -{{ required "Missing 'key' in 'customLabels' list element" $customLabel.key }} -{{- end -}} -{{- $customLabelKey := split "/" $customLabel.key -}} -{{- if gt ( len $customLabelKey._0 ) 63 -}} -{{ required "The name of the 'customLabels.key' must be less than 64 characters" "" }} -{{- end -}} -{{- if eq ( regexMatch "^[a-z]([-a-z0-9._]*[a-z0-9])?$" $customLabelKey._0 ) false -}} -{{ required "The name of the 'customLabels.key' provided is NOT correct. Some possible causes are: it begins with a number or a special character (including '-._'), has one or more capital letters somewhere in the key, has one or more special characters other than '-._', it ends with a special character (including '-._')" "" }} -{{- end -}} -{{- if $customLabelKey._1 }} -{{- if gt ( len $customLabelKey._1 ) 254 -}} -{{ required "The prefix of the 'customLabels.key' must be less than 254 characters" "" }} -{{- end -}} -{{- if eq ( regexMatch "^[a-z]([-a-z0-9.]*[a-z0-9])?$" $customLabelKey._1 ) false -}} -{{ required "The prefix of the 'customLabels.key' provided is NOT correct. Some possible causes are: it begins with a number or a special character (including '-.'), has one or more capital letters somewhere in the key, has one or more special characters other than '-.', it ends with a special character (including '-.')" "" }} -{{- end -}} -{{- end -}} -{{- if not $customLabel.value -}} -{{ required "Missing 'value' in 'customLabels' list element" $customLabel.value }} -{{- end -}} -{{- if gt ( len $customLabel.value ) 63 -}} -{{ required "The length of the 'customLabels.value' must be less than 64 characters" "" }} -{{- end -}} -{{- if eq ( regexMatch "^[a-z]([-a-z0-9._]*[a-z0-9])?$" $customLabel.value ) false -}} -{{ required "The 'customLabels.value' provided is NOT correct. 
Some possible causes are: it begins with a number or a special character (including '-._'), has one or more capital letters somewhere in the name, has one or more special characters other than '-._', it ends with a special character (including '-._')" "" }} -{{- end -}} -{{- end -}} -{{- end -}} - -apiVersion: workload.codeflare.dev/v1beta2 -kind: AppWrapper -metadata: - name: {{ .Values.jobName }} - {{- if .Values.namespace }} - namespace: {{ .Values.namespace }} - {{- end }} - annotations: - workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: "{{ .Chart.Version }}" - {{- if .Values.admissionGracePeriodDuration }} - workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: "{{ .Values.admissionGracePeriodDuration }}" - {{- end }} - {{- if .Values.warmupGracePeriodDuration }} - workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: "{{ .Values.warmupGracePeriodDuration }}" - {{- end }} - {{- if .Values.failureGracePeriodDuration }} - workload.codeflare.dev.appwrapper/failureGracePeriodDuration: "{{ .Values.failureGracePeriodDuration }}" - {{- end }} - {{- if .Values.retryPausePeriodDuration }} - workload.codeflare.dev.appwrapper/retryPausePeriodDuration: "{{ .Values.retryPausePeriodDuration }}" - {{- end }} - {{- if ne .Values.retryLimit nil }} - workload.codeflare.dev.appwrapper/retryLimit: "{{ .Values.retryLimit }}" - {{- end }} - {{- if .Values.forcefulDeletionGracePeriodDuration }} - workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: "{{ .Values.forcefulDeletionGracePeriodDuration }}" - {{- end }} - {{- if .Values.deletionOnFailureGracePeriodDuration }} - workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: "{{ .Values.deletionOnFailureGracePeriodDuration }}" - {{- end }} - {{- if .Values.successTTLDuration }} - workload.codeflare.dev.appwrapper/successTTLDuration: "{{ .Values.successTTLDuration }}" - {{- end }} - {{- if or .Values.queueName .Values.customLabels }} - labels: - {{- if .Values.queueName }} - kueue.x-k8s.io/queue-name: {{ .Values.queueName }} - {{- end }} - {{- include "mlbatch.customLabels" . | indent 8 }} - {{- end }} -spec: - components: - - template: - apiVersion: "kubeflow.org/v1" - kind: "PyTorchJob" - metadata: - name: {{ .Values.jobName }} - {{- if .Values.customLabels }} - labels: - {{- include "mlbatch.customLabels" . | indent 26 }} - {{- end }} - spec: - pytorchReplicaSpecs: - Master: - replicas: 1 - restartPolicy: {{ .Values.restartPolicy | default "Never" }} - template: - {{- include "mlbatch.container.metadata" . | indent 34 }} - spec: - {{- if .Values.serviceAccountName }} - serviceAccountName: {{ .Values.serviceAccountName }} - {{- end }} - {{- include "mlbatch.imagePullSecrets" . | indent 38 }} - {{- include "mlbatch.initContainers" . | indent 38 }} - {{- include "mlbatch.schedulingSpec" . | indent 38 }} - {{- include "mlbatch.volumes" . | indent 38 }} - containers: - - name: pytorch - image: {{ required "Please specify a 'containerImage' in the user file" .Values.containerImage }} - imagePullPolicy: {{ .Values.imagePullPolicy | default "IfNotPresent" }} - {{- include "mlbatch.securityContext" . | indent 44 }} - {{- include "mlbatch.env" . | indent 44 }} - {{- include "mlbatch.volumeMounts" . | indent 44 }} - {{- include "mlbatch.resources" . | indent 44 }} - {{- include "mlbatch.command" . 
| indent 44 }} - {{- if gt ( .Values.numPods | int ) 1 }} {{- /*Including a worker spec when only 1 pod (Master) is specified leads to strange behavior */}} - Worker: - replicas: {{ sub .Values.numPods 1 }} - restartPolicy: {{ .Values.restartPolicy | default "Never" }} - template: - {{- include "mlbatch.container.metadata" . | indent 34 }} - spec: - {{- if .Values.serviceAccountName }} - serviceAccountName: {{ .Values.serviceAccountName }} - {{- end }} - {{- include "mlbatch.imagePullSecrets" . | indent 38 }} - {{- include "mlbatch.initContainers" . | indent 38 }} - {{- include "mlbatch.schedulingSpec" . | indent 38 }} - {{- include "mlbatch.volumes" . | indent 38 }} - containers: - - name: pytorch - image: {{ required "Please specify a 'containerImage' in the user file" .Values.containerImage }} - imagePullPolicy: {{ .Values.imagePullPolicy | default "IfNotPresent" }} - {{- include "mlbatch.securityContext" . | indent 44 }} - {{- include "mlbatch.env" . | indent 44 }} - {{- include "mlbatch.volumeMounts" . | indent 44 }} - {{- include "mlbatch.resources" . | indent 44 }} - {{- include "mlbatch.command" . | indent 44 }} - {{- end }} diff --git a/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap b/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap deleted file mode 100644 index 16870fc..0000000 --- a/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap +++ /dev/null @@ -1,1678 +0,0 @@ -Adding Volume Mounts: - 1: | - apiVersion: workload.codeflare.dev/v1beta2 - kind: AppWrapper - metadata: - annotations: - workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.9 - labels: - kueue.x-k8s.io/queue-name: default-queue - name: my-job - namespace: my-namespace - spec: - components: - - template: - apiVersion: kubeflow.org/v1 - kind: PyTorchJob - metadata: - name: my-job - spec: - pytorchReplicaSpecs: - Master: - replicas: 1 - restartPolicy: Never - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - git clone https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: [] - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - volumeMounts: - - mountPath: /path/to/where/you/want/to/find/your/data - name: arbitrary-name-0 - - mountPath: /path/to/where/you/want/to/find/your/data-redux - name: arbitrary-name-1 - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - priorityClassName: default-priority - 
volumes: - - name: arbitrary-name-0 - persistentVolumeClaim: - claimName: name-matching-the-actual-PersistentVolumeClaim - - name: arbitrary-name-1 - persistentVolumeClaim: - claimName: name-matching-another-actual-PersistentVolumeClaim - - emptyDir: - medium: Memory - name: dshm - Worker: - replicas: 3 - restartPolicy: Never - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - git clone https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: [] - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - volumeMounts: - - mountPath: /path/to/where/you/want/to/find/your/data - name: arbitrary-name-0 - - mountPath: /path/to/where/you/want/to/find/your/data-redux - name: arbitrary-name-1 - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - priorityClassName: default-priority - volumes: - - name: arbitrary-name-0 - persistentVolumeClaim: - claimName: name-matching-the-actual-PersistentVolumeClaim - - name: arbitrary-name-1 - persistentVolumeClaim: - claimName: name-matching-another-actual-PersistentVolumeClaim - - emptyDir: - medium: Memory - name: dshm -Adding initContainers: - 1: | - apiVersion: workload.codeflare.dev/v1beta2 - kind: AppWrapper - metadata: - annotations: - workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.9 - labels: - kueue.x-k8s.io/queue-name: default-queue - name: my-job - namespace: my-namespace - spec: - components: - - template: - apiVersion: kubeflow.org/v1 - kind: PyTorchJob - metadata: - name: my-job - spec: - pytorchReplicaSpecs: - Master: - replicas: 1 - restartPolicy: Never - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - git clone https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun 
--nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: [] - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - initContainers: - - command: - - sh - - -c - - whoami && ls -l - image: busybox - name: init-container-1 - - command: - - sh - - -c - - echo hello world! - image: ubuntu - name: init-container-2 - priorityClassName: default-priority - volumes: - - emptyDir: - medium: Memory - name: dshm - Worker: - replicas: 3 - restartPolicy: Never - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - git clone https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: [] - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - initContainers: - - command: - - sh - - -c - - whoami && ls -l - image: busybox - name: init-container-1 - - command: - - sh - - -c - - echo hello world! 
- image: ubuntu - name: init-container-2 - priorityClassName: default-priority - volumes: - - emptyDir: - medium: Memory - name: dshm -AppWrapper metadata should match snapshot: - 1: | - apiVersion: workload.codeflare.dev/v1beta2 - kind: AppWrapper - metadata: - annotations: - workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.9 - labels: - kueue.x-k8s.io/queue-name: default-queue - name: my-job - namespace: my-namespace - spec: - components: - - template: - apiVersion: kubeflow.org/v1 - kind: PyTorchJob - metadata: - name: my-job - spec: - pytorchReplicaSpecs: - Master: - replicas: 1 - restartPolicy: Never - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - git clone https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: [] - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - priorityClassName: default-priority - volumes: - - emptyDir: - medium: Memory - name: dshm - Worker: - replicas: 3 - restartPolicy: Never - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - git clone https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: [] - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - 
priorityClassName: default-priority - volumes: - - emptyDir: - medium: Memory - name: dshm -AppWrapper spec should match snapshot: - 1: | - apiVersion: workload.codeflare.dev/v1beta2 - kind: AppWrapper - metadata: - annotations: - workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.9 - labels: - kueue.x-k8s.io/queue-name: default-queue - name: my-job - namespace: my-namespace - spec: - components: - - template: - apiVersion: kubeflow.org/v1 - kind: PyTorchJob - metadata: - name: my-job - spec: - pytorchReplicaSpecs: - Master: - replicas: 1 - restartPolicy: Never - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - git clone https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: [] - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - priorityClassName: default-priority - volumes: - - emptyDir: - medium: Memory - name: dshm - Worker: - replicas: 3 - restartPolicy: Never - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - git clone https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: [] - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - priorityClassName: default-priority - volumes: - - emptyDir: - 
medium: Memory - name: dshm -Enabling NVMe: - 1: | - apiVersion: workload.codeflare.dev/v1beta2 - kind: AppWrapper - metadata: - annotations: - workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.9 - labels: - kueue.x-k8s.io/queue-name: default-queue - name: my-job - namespace: my-namespace - spec: - components: - - template: - apiVersion: kubeflow.org/v1 - kind: PyTorchJob - metadata: - name: my-job - spec: - pytorchReplicaSpecs: - Master: - replicas: 1 - restartPolicy: Never - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - git clone https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: - - name: NVME_MOUNT_PATH - value: /workspace/scratch-nvme - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - volumeMounts: - - mountPath: /workspace/scratch-nvme - name: ephemeral-odf-lvm-vg1 - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - priorityClassName: default-priority - volumes: - - ephemeral: - volumeClaimTemplate: - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 800Gi - storageClassName: odf-lvm-vg1 - volumeMode: Filesystem - name: ephemeral-odf-lvm-vg1 - - emptyDir: - medium: Memory - name: dshm - Worker: - replicas: 3 - restartPolicy: Never - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - git clone https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: - - name: NVME_MOUNT_PATH - value: /workspace/scratch-nvme - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - 
limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - volumeMounts: - - mountPath: /workspace/scratch-nvme - name: ephemeral-odf-lvm-vg1 - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - priorityClassName: default-priority - volumes: - - ephemeral: - volumeClaimTemplate: - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 800Gi - storageClassName: odf-lvm-vg1 - volumeMode: Filesystem - name: ephemeral-odf-lvm-vg1 - - emptyDir: - medium: Memory - name: dshm -Enabling RoCE GDR: - 1: | - apiVersion: workload.codeflare.dev/v1beta2 - kind: AppWrapper - metadata: - annotations: - workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.9 - labels: - kueue.x-k8s.io/queue-name: default-queue - name: my-job - namespace: my-namespace - spec: - components: - - template: - apiVersion: kubeflow.org/v1 - kind: PyTorchJob - metadata: - name: my-job - spec: - pytorchReplicaSpecs: - Master: - replicas: 1 - restartPolicy: Never - template: - metadata: - annotations: - k8s.v1.cni.cncf.io/networks: multi-nic-cni-operator-ipvlanl3 - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - git clone https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: - - name: NCCL_TOPO_FILE - value: /var/run/nvidia-topologyd/virtualTopology.xml - envFrom: - - configMapRef: - name: nccl-netwk-env-vars - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 2 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 2 - securityContext: - capabilities: - add: - - IPC_LOCK - volumeMounts: - - mountPath: /var/run/nvidia-topologyd - name: topology-volume - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - priorityClassName: default-priority - volumes: - - configMap: - name: nvidia-topo-gdr - name: topology-volume - - emptyDir: - medium: Memory - name: dshm - Worker: - replicas: 3 - restartPolicy: Never - template: - metadata: - annotations: - k8s.v1.cni.cncf.io/networks: multi-nic-cni-operator-ipvlanl3 - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank 
is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - git clone https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: - - name: NCCL_TOPO_FILE - value: /var/run/nvidia-topologyd/virtualTopology.xml - envFrom: - - configMapRef: - name: nccl-netwk-env-vars - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 2 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 2 - securityContext: - capabilities: - add: - - IPC_LOCK - volumeMounts: - - mountPath: /var/run/nvidia-topologyd - name: topology-volume - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - priorityClassName: default-priority - volumes: - - configMap: - name: nvidia-topo-gdr - name: topology-volume - - emptyDir: - medium: Memory - name: dshm -Enabling all advanced features at once: - 1: | - apiVersion: workload.codeflare.dev/v1beta2 - kind: AppWrapper - metadata: - annotations: - workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.9 - labels: - kueue.x-k8s.io/queue-name: default-queue - name: my-job - namespace: my-namespace - spec: - components: - - template: - apiVersion: kubeflow.org/v1 - kind: PyTorchJob - metadata: - name: my-job - spec: - pytorchReplicaSpecs: - Master: - replicas: 1 - restartPolicy: Never - template: - metadata: - annotations: - k8s.v1.cni.cncf.io/networks: multi-nic-cni-operator-ipvlanl3 - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - git clone https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: - - name: NCCL_TOPO_FILE - value: /var/run/nvidia-topologyd/virtualTopology.xml - - name: NVME_MOUNT_PATH - value: /workspace/scratch-nvme - - name: GIT_SSH_COMMAND - value: ssh -i /tmp/.ssh/keys/id_rsa -o UserKnownHostsFile=/tmp/.ssh/hosts/known_hosts -vv - envFrom: - - configMapRef: - name: nccl-netwk-env-vars - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 2 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 2 - securityContext: 
- capabilities: - add: - - IPC_LOCK - volumeMounts: - - mountPath: /var/run/nvidia-topologyd - name: topology-volume - - mountPath: /workspace/scratch-nvme - name: ephemeral-odf-lvm-vg1 - - mountPath: /path/to/where/you/want/to/find/your/data - name: arbitrary-name-0 - - mountPath: /path/to/where/you/want/to/find/your/data-redux - name: arbitrary-name-1 - - mountPath: /tmp/.ssh/keys - name: private-ssh-git-deploy-key - readOnly: true - - mountPath: /tmp/.ssh/hosts - name: github-known-hosts - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - initContainers: - - command: - - sh - - -c - - whoami && ls -l - image: busybox - name: init-container-1 - - command: - - sh - - -c - - echo hello world! - image: ubuntu - name: init-container-2 - priorityClassName: default-priority - volumes: - - configMap: - name: nvidia-topo-gdr - name: topology-volume - - ephemeral: - volumeClaimTemplate: - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 800Gi - storageClassName: odf-lvm-vg1 - volumeMode: Filesystem - name: ephemeral-odf-lvm-vg1 - - name: arbitrary-name-0 - persistentVolumeClaim: - claimName: name-matching-the-actual-PersistentVolumeClaim - - name: arbitrary-name-1 - persistentVolumeClaim: - claimName: name-matching-another-actual-PersistentVolumeClaim - - name: private-ssh-git-deploy-key - secret: - optional: false - secretName: my-git-secret - - configMap: - name: my-git-config-map - name: github-known-hosts - - emptyDir: - medium: Memory - name: dshm - Worker: - replicas: 3 - restartPolicy: Never - template: - metadata: - annotations: - k8s.v1.cni.cncf.io/networks: multi-nic-cni-operator-ipvlanl3 - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - git clone https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: - - name: NCCL_TOPO_FILE - value: /var/run/nvidia-topologyd/virtualTopology.xml - - name: NVME_MOUNT_PATH - value: /workspace/scratch-nvme - - name: GIT_SSH_COMMAND - value: ssh -i /tmp/.ssh/keys/id_rsa -o UserKnownHostsFile=/tmp/.ssh/hosts/known_hosts -vv - envFrom: - - configMapRef: - name: nccl-netwk-env-vars - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 2 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 2 - securityContext: - capabilities: - add: - - IPC_LOCK - volumeMounts: - - mountPath: /var/run/nvidia-topologyd - name: topology-volume - - mountPath: /workspace/scratch-nvme - name: ephemeral-odf-lvm-vg1 - - mountPath: /path/to/where/you/want/to/find/your/data - name: arbitrary-name-0 - - 
mountPath: /path/to/where/you/want/to/find/your/data-redux - name: arbitrary-name-1 - - mountPath: /tmp/.ssh/keys - name: private-ssh-git-deploy-key - readOnly: true - - mountPath: /tmp/.ssh/hosts - name: github-known-hosts - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - initContainers: - - command: - - sh - - -c - - whoami && ls -l - image: busybox - name: init-container-1 - - command: - - sh - - -c - - echo hello world! - image: ubuntu - name: init-container-2 - priorityClassName: default-priority - volumes: - - configMap: - name: nvidia-topo-gdr - name: topology-volume - - ephemeral: - volumeClaimTemplate: - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 800Gi - storageClassName: odf-lvm-vg1 - volumeMode: Filesystem - name: ephemeral-odf-lvm-vg1 - - name: arbitrary-name-0 - persistentVolumeClaim: - claimName: name-matching-the-actual-PersistentVolumeClaim - - name: arbitrary-name-1 - persistentVolumeClaim: - claimName: name-matching-another-actual-PersistentVolumeClaim - - name: private-ssh-git-deploy-key - secret: - optional: false - secretName: my-git-secret - - configMap: - name: my-git-config-map - name: github-known-hosts - - emptyDir: - medium: Memory - name: dshm -Enabling sshGitConfig injects the envvars, volumes, and volumeMounts: - 1: | - apiVersion: workload.codeflare.dev/v1beta2 - kind: AppWrapper - metadata: - annotations: - workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.9 - labels: - kueue.x-k8s.io/queue-name: default-queue - name: my-job - namespace: my-namespace - spec: - components: - - template: - apiVersion: kubeflow.org/v1 - kind: PyTorchJob - metadata: - name: my-job - spec: - pytorchReplicaSpecs: - Master: - replicas: 1 - restartPolicy: Never - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - git clone https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: - - name: GIT_SSH_COMMAND - value: ssh -i /tmp/.ssh/keys/id_rsa -o UserKnownHostsFile=/tmp/.ssh/hosts/known_hosts -vv - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - volumeMounts: - - mountPath: /tmp/.ssh/keys - name: private-ssh-git-deploy-key - readOnly: true - - mountPath: /tmp/.ssh/hosts - name: github-known-hosts - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - priorityClassName: default-priority - volumes: - - name: private-ssh-git-deploy-key - secret: - optional: false - secretName: my-git-secret - - configMap: - 
name: my-git-config-map - name: github-known-hosts - - emptyDir: - medium: Memory - name: dshm - Worker: - replicas: 3 - restartPolicy: Never - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - git clone https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: - - name: GIT_SSH_COMMAND - value: ssh -i /tmp/.ssh/keys/id_rsa -o UserKnownHostsFile=/tmp/.ssh/hosts/known_hosts -vv - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - volumeMounts: - - mountPath: /tmp/.ssh/keys - name: private-ssh-git-deploy-key - readOnly: true - - mountPath: /tmp/.ssh/hosts - name: github-known-hosts - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - priorityClassName: default-priority - volumes: - - name: private-ssh-git-deploy-key - secret: - optional: false - secretName: my-git-secret - - configMap: - name: my-git-config-map - name: github-known-hosts - - emptyDir: - medium: Memory - name: dshm -scheduler can be set: - 1: | - apiVersion: workload.codeflare.dev/v1beta2 - kind: AppWrapper - metadata: - annotations: - workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.9 - labels: - kueue.x-k8s.io/queue-name: default-queue - name: my-job - namespace: my-namespace - spec: - components: - - template: - apiVersion: kubeflow.org/v1 - kind: PyTorchJob - metadata: - name: my-job - spec: - pytorchReplicaSpecs: - Master: - replicas: 1 - restartPolicy: Never - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank" - export RANK=$SAKKARA_RANK - git clone https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 
--rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: - - name: SAKKARA_RANK - valueFrom: - fieldRef: - fieldPath: metadata.labels['sakkara.member.rank'] - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - priorityClassName: default-priority - schedulerName: sakkara - volumes: - - emptyDir: - medium: Memory - name: dshm - Worker: - replicas: 3 - restartPolicy: Never - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank" - export RANK=$SAKKARA_RANK - git clone https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: - - name: SAKKARA_RANK - valueFrom: - fieldRef: - fieldPath: metadata.labels['sakkara.member.rank'] - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - priorityClassName: default-priority - schedulerName: sakkara - volumes: - - emptyDir: - medium: Memory - name: dshm -user-defined environment variables: - 1: | - apiVersion: workload.codeflare.dev/v1beta2 - kind: AppWrapper - metadata: - annotations: - workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.9 - labels: - kueue.x-k8s.io/queue-name: default-queue - name: my-job - namespace: my-namespace - spec: - components: - - template: - apiVersion: kubeflow.org/v1 - kind: PyTorchJob - metadata: - name: my-job - spec: - pytorchReplicaSpecs: - Master: - replicas: 1 - restartPolicy: Never - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - git clone 
https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: - - name: EXAMPLE_VAR1 - value: "6" - - name: EXAMPLE_VAR2 - value: example2string - - name: EXAMPLE_VAR3 - valueFrom: - secretKeyRef: - key: my-secret-key - name: my-secret-name - - name: EXAMPLE_VAR4 - valueFrom: - configMapKeyRef: - key: my-configmap-key - name: my-configmap-name - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - priorityClassName: default-priority - volumes: - - emptyDir: - medium: Memory - name: dshm - Worker: - replicas: 3 - restartPolicy: Never - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: autopilot.ibm.com/gpuhealth - operator: NotIn - values: - - ERR - - TESTING - - EVICT - containers: - - command: - - sh - - -c - - | - echo "Environment variables set by the kubeflow training operator:" - echo ${MASTER_ADDR}:${MASTER_PORT} - echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED} - echo My global rank is ${RANK} / ${WORLD_SIZE} - echo "Other injected environment variables:" - echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH} - # - # User commands - # - git clone https://github.com/dbarnett/python-helloworld - cd python-helloworld - echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py - env: - - name: EXAMPLE_VAR1 - value: "6" - - name: EXAMPLE_VAR2 - value: example2string - - name: EXAMPLE_VAR3 - valueFrom: - secretKeyRef: - key: my-secret-key - name: my-secret-name - - name: EXAMPLE_VAR4 - valueFrom: - configMapKeyRef: - key: my-configmap-key - name: my-configmap-name - image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - imagePullPolicy: IfNotPresent - name: pytorch - resources: - limits: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - requests: - cpu: 500m - memory: 1Gi - nvidia.com/gpu: 8 - nvidia.com/roce_gdr: 0 - volumeMounts: - - mountPath: /dev/shm - name: dshm - imagePullSecrets: [] - priorityClassName: default-priority - volumes: - - emptyDir: - medium: Memory - name: dshm diff --git a/tools/pytorchjob-generator/chart/tests/helloworld.settings.yaml b/tools/pytorchjob-generator/chart/tests/helloworld.settings.yaml deleted file mode 100644 index 7fafe14..0000000 --- a/tools/pytorchjob-generator/chart/tests/helloworld.settings.yaml +++ /dev/null @@ -1,21 +0,0 @@ -namespace: my-namespace # namespace to deploy to (required) -jobName: my-job # name of the generated AppWrapper and PyTorchJob objects (required) -queueName: default-queue # local queue to submit to (default: default-queue) - -numPods: 4 # total pod count including master and worker pods (default: 1) 
-numCpusPerPod: 500m # requested number of cpus per pod (default: 1) -numGpusPerPod: 8 # requested number of gpus per pod (default: 0) -totalMemoryPerPod: 1Gi # requested amount of memory per pod (default: 1Gi) - -priority: default-priority # default-priority (default), low-priority, or high-priority - -# container image for the pods (required) -containerImage: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - -# setup commands to run in each pod (optional) -setupCommands: -- git clone https://github.com/dbarnett/python-helloworld -- cd python-helloworld - -# main program to invoke via torchrun (optional) -mainProgram: helloworld.py diff --git a/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml b/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml deleted file mode 100644 index 83aa908..0000000 --- a/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml +++ /dev/null @@ -1,272 +0,0 @@ -suite: AppWrapper Unit Tests -templates: -- templates/appwrapper.yaml -values: -- helloworld.settings.yaml -tests: -- it: AppWrapper metadata should match snapshot - asserts: - - matchSnapshot: - path: spec - -- it: AppWrapper spec should match snapshot - asserts: - - matchSnapshot: - path: spec - -- it: PyTorch worker is elided for single pod Jobs - set: - numPods: 1 - asserts: - - exists: - path: spec.components[0].template.spec.pytorchReplicaSpecs.Master - - notExists: - path: spec.components[0].template.spec.pytorchReplicaSpecs.Worker - -- it: Custom labels are injected at all levels - set: - customLabels: - - key: project-name - value: my-project - asserts: - - isSubset: - path: metadata.labels - content: - project-name: my-project - - isSubset: - path: spec.components[0].template.metadata.labels - content: - project-name: my-project - - isSubset: - path: spec.components[0].template.spec.pytorchReplicaSpecs.Master.template.metadata.labels - content: - project-name: my-project - - isSubset: - path: spec.components[0].template.spec.pytorchReplicaSpecs.Worker.template.metadata.labels - content: - project-name: my-project - -- it: Invalid job names are rejected - set: - jobName: 123Job - asserts: - - failedTemplate: {} - -- it: Long job names are rejected - set: - jobName: this-job-name-is-just-way-too-long-to-be-acceptable-for-our-chart - asserts: - - failedTemplate: {} - -- it: Disabling shared memory removes volumes and volume mounts - set: - disableSharedMemory: true - asserts: - - isEmpty: - path: spec.components[0].template.spec.pytorchReplicaSpecs.Master.template.spec.volumes - - isEmpty: - path: spec.components[0].template.spec.pytorchReplicaSpecs.Master.template.spec.containers[0].volumeMounts - - isEmpty: - path: spec.components[0].template.spec.pytorchReplicaSpecs.Worker.template.spec.volumes - - isEmpty: - path: spec.components[0].template.spec.pytorchReplicaSpecs.Worker.template.spec.containers[0].volumeMounts - -- it: queueName can be disabled - set: - queueName: - asserts: - - notExists: - path: metadata.labels - -- it: namespace can be set - set: - namespace: testing-ns - asserts: - - equal: - path: metadata.namespace - value: testing-ns - -- it: scheduler can be set - set: - schedulerName: sakkara - asserts: - - matchSnapshot: - path: spec.components[0].template - -- it: imagePullPolicy can be set - set: - imagePullPolicy: Always - asserts: - - equal: - path: spec.components[0].template.spec.pytorchReplicaSpecs.Worker.template.spec.containers[0].imagePullPolicy - value: Always - - equal: - path: 
spec.components[0].template.spec.pytorchReplicaSpecs.Worker.template.spec.containers[0].imagePullPolicy - value: Always - -- it: Invalid imagePullPolicies are rejected - set: - imagePullPolicy: Sometimes - asserts: - - failedTemplate: {} - -- it: Enabling sshGitConfig injects the envvars, volumes, and volumeMounts - set: - sshGitCloneConfig.secretName: my-git-secret - sshGitCloneConfig.configMapName: my-git-config-map - asserts: - - matchSnapshot: - path: spec.components[0].template - -- it: user-defined environment variables - set: - environmentVariables: - - name: EXAMPLE_VAR1 - value: 6 - - name: EXAMPLE_VAR2 - value: "example2string" - - name: EXAMPLE_VAR3 - secret: - name: my-secret-name - key: my-secret-key - - name: EXAMPLE_VAR4 - configmap: - name: my-configmap-name - key: my-configmap-key - asserts: - - matchSnapshot: - path: spec.components[0].template - -- it: Enabling RoCE GDR - set: - roceGdrResName: nvidia.com/roce_gdr - numRoceGdr: 2 - topologyFileConfigMap: nvidia-topo-gdr - ncclGdrEnvConfigMap: nccl-netwk-env-vars - multiNicNetworkName: multi-nic-cni-operator-ipvlanl3 - asserts: - - matchSnapshot: - path: spec.components[0].template - -- it: Enabling NVMe - set: - mountNVMe.storage: 800Gi - mountNVMe.mountPath: "/workspace/scratch-nvme" - asserts: - - matchSnapshot: - path: spec.components[0].template - -- it: imagePullSecrets - set: - imagePullSecrets: - - name: secret-one - asserts: - - equal: - path: spec.components[0].template.spec.pytorchReplicaSpecs.Master.template.spec.imagePullSecrets[0].name - value: secret-one - - equal: - path: spec.components[0].template.spec.pytorchReplicaSpecs.Worker.template.spec.imagePullSecrets[0].name - value: secret-one - -- it: Adding Volume Mounts - set: - volumes: - - name: arbitrary-name-0 - claimName: name-matching-the-actual-PersistentVolumeClaim - mountPath: /path/to/where/you/want/to/find/your/data - - name: arbitrary-name-1 - claimName: name-matching-another-actual-PersistentVolumeClaim - mountPath: /path/to/where/you/want/to/find/your/data-redux - asserts: - - matchSnapshot: - path: spec.components[0].template - -- it: Adding initContainers - set: - initContainers: - - name: init-container-1 - image: busybox - command: ["sh", "-c", "whoami && ls -l"] - - name: init-container-2 - image: ubuntu - command: ["sh", "-c", "echo hello world!"] - asserts: - - matchSnapshot: - patch: spec.components[0].template - -- it: Setting fault tolerance annotations - set: - admissionGracePeriodDuration: "10s" - warmupGracePeriodDuration: "11s" - failureGracePeriodDuration: "22s" - retryPausePeriodDuration: "17s" - retryLimit: 42 - forcefulDeletionGracePeriodDuration: "19s" - deletionOnFailureGracePeriodDuration: "2s" - successTTLDuration: "600s" - asserts: - - isSubset: - path: metadata.annotations - content: - workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: "10s" - workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: "11s" - workload.codeflare.dev.appwrapper/failureGracePeriodDuration: "22s" - workload.codeflare.dev.appwrapper/retryPausePeriodDuration: "17s" - workload.codeflare.dev.appwrapper/retryLimit: "42" - workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: "19s" - workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: "2s" - workload.codeflare.dev.appwrapper/successTTLDuration: "600s" - -- it: Setting integer fault tolerance annotation to 0 - set: - retryLimit: 0 - terminationGracePeriodSeconds: 0 - asserts: - - isSubset: - path: metadata.annotations - content: - 
workload.codeflare.dev.appwrapper/retryLimit: "0" - - equal: - path: spec.components[0].template.spec.pytorchReplicaSpecs.Master.template.spec.terminationGracePeriodSeconds - value: 0 - - equal: - path: spec.components[0].template.spec.pytorchReplicaSpecs.Worker.template.spec.terminationGracePeriodSeconds - value: 0 - -- it: Setting just one tolerance annotation - set: - deletionOnFailureGracePeriodDuration: "6h" - asserts: - - isSubset: - path: metadata.annotations - content: - workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: "6h" - -- it: Enabling all advanced features at once - set: - sshGitCloneConfig.secretName: my-git-secret - sshGitCloneConfig.configMapName: my-git-config-map - roceGdrResName: nvidia.com/roce_gdr - numRoceGdr: 2 - topologyFileConfigMap: nvidia-topo-gdr - ncclGdrEnvConfigMap: nccl-netwk-env-vars - multiNicNetworkName: multi-nic-cni-operator-ipvlanl3 - mountNVMe.storage: 800Gi - mountNVMe.mountPath: "/workspace/scratch-nvme" - volumes: - - name: arbitrary-name-0 - claimName: name-matching-the-actual-PersistentVolumeClaim - mountPath: /path/to/where/you/want/to/find/your/data - - name: arbitrary-name-1 - claimName: name-matching-another-actual-PersistentVolumeClaim - mountPath: /path/to/where/you/want/to/find/your/data-redux - initContainers: - - name: init-container-1 - image: busybox - command: ["sh", "-c", "whoami && ls -l"] - - name: init-container-2 - image: ubuntu - command: ["sh", "-c", "echo hello world!"] - asserts: - - matchSnapshot: - path: spec.components[0].template diff --git a/tools/pytorchjob-generator/chart/values.schema.json b/tools/pytorchjob-generator/chart/values.schema.json deleted file mode 100644 index 9bebe8f..0000000 --- a/tools/pytorchjob-generator/chart/values.schema.json +++ /dev/null @@ -1,200 +0,0 @@ -{ - "$schema": "https://json-schema.org/draft/2020-12/schema#", - "type": "object", - "required": [ - "jobName", - "containerImage" - ], - "additionalProperties": false, - "properties": { - "jobName": { "type": "string" }, - "namespace": { "oneOf": [ - { "type": "null" }, - { "$ref": "#/$defs/rfc1123Label" } - ]}, - "queueName": { "oneOf": [ - { "type": "null" }, - { "$ref": "#/$defs/rfc1123Label" } - ]}, - "priority": { "type": "string", "enum": [ "default-priority", "low-priority", "high-priority" ] }, - "customLabels": { "oneOf": [ - { "type": "null" }, - { "type": "array" } - ]}, - "containerImage": { "type": "string" }, - "numPods" : { "type": "integer", "minimum": 1 }, - "numCpusPerPod": { "$ref": "#/$defs/resourceCPU" }, - "numGpusPerPod": { "type": "integer", "minimum": 0 }, - "totalMemoryPerPod": { "$ref": "#/$defs/resourceMemory" }, - "limitCpusPerPod": { "oneOf": [ - { "type": "null" }, - { "$ref": "#/$defs/resourceCPU" } - ]}, - "limitGpusPerPod":{ "oneOf": [ - { "type": "null" }, - { "type": "integer", "minimum": 0 } - ]}, - "limitMemoryPerPod": { "oneOf": [ - { "type": "null" }, - { "$ref": "#/$defs/resourceMemory" } - ]}, - "environmentVariables": { "oneOf": [ - { "type": "null" }, - { "type": "array" } - ]}, - "sshGitCloneConfig": { "oneOf": [ - { "type": "null" }, - { - "type": "object", - "properties": { - "secretName": { "$ref": "#/$defs/rfc1123Label" }, - "configMapName": { "$ref": "#/$defs/rfc1123Label" }, - "secretMountPath": { "type": "string" }, - "configMapMountPath": { "type": "string" }, - "sshCmd": { "type": "string" } - }, - "required": [ "secretName", "configMapName" ], - "additionalProperties": false - } - ]}, - "setupCommands": { "oneOf": [ - { "type": "null" }, - { "type": "array" } - 
]}, - "mainProgram": { "oneOf": [ - { "type": "null" }, - { "type": "string" } - ]}, - "imagePullSecrets": { "oneOf": [ - { "type": "null" }, - { "type": "array" } - ]}, - "imagePullPolicy": { "oneOf": [ - { "type": "null" }, - { "type": "string", "enum": [ "IfNotPresent", "Always", "Never" ] } - ]}, - "volumes": { "oneOf": [ - { "type": "null" }, - { "type": "array" } - ]}, - "roceGdrResName": { "oneOf": [ - { "type": "null" }, - { "type": "string" } - ]}, - "numRoceGdr": { "type": "integer", "minimum": 0 }, - "topologyFileConfigMap": { "oneOf": [ - { "type": "null" }, - { "$ref": "#/$defs/rfc1123Label" } - ]}, - "ncclGdrEnvConfigMap": { "oneOf": [ - { "type": "null" }, - { "$ref": "#/$defs/rfc1123Label" } - ]}, - "multiNicNetworkName": { "oneOf": [ - { "type": "null" }, - { "type": "string" } - ]}, - "disableSharedMemory": { "type": "boolean" }, - "mountNVMe": { "oneOf" : [ - { "type": "null" }, - { "type": "object", - "properties": { - "mountPath": { "type": "string" }, - "storage": { "type": "string" } - }, - "additionalProperties": false - } - ]}, - "initContainers": { "oneOf": [ - { "type": "null" }, - { "type": "array" } - ]}, - "autopilotHealthChecks": { "oneOf": [ - { "type": "null" }, - { "type": "array" } - ]}, - "restartPolicy" : { "type": "string", "enum": ["Never", "Always", "OnFailure" ] }, - "hostIgnoreList": { "oneOf" : [ - { "type": "null" }, - { "type": "array" } - ]}, - "schedulerName": { "oneOf": [ - { "type": "null" }, - { "type": "string", "enum": ["sakkara", "scheduler-plugins-scheduler", "default-scheduler" ] } - ]}, - "serviceAccountName": { "oneOf" : [ - { "type": "null" }, - { "$ref": "#/$defs/rfc1123Label" } - ]}, - "terminationGracePeriodSeconds": { "oneOf" : [ - { "type": "null" }, - { "type": "integer", "minimum": 0 } - ]}, - "admissionGracePeriodDuration": { "oneOf" : [ - { "type": "null" }, - { "$ref": "#/$defs/duration" } - ]}, - "warmupGracePeriodDuration": { "oneOf" : [ - { "type": "null" }, - { "$ref": "#/$defs/duration" } - ]}, - "failureGracePeriodDuration": { "oneOf" : [ - { "type": "null" }, - { "$ref": "#/$defs/duration" } - ]}, - "retryPausePeriodDuration": { "oneOf" : [ - { "type": "null" }, - { "$ref": "#/$defs/duration" } - ]}, - "retryLimit": { "oneOf" : [ - { "type": "null" }, - { "type": "integer", "minimum": 0, "maximum": 100 } - ]}, - "forcefulDeletionGracePeriodDuration": { "oneOf" : [ - { "type": "null" }, - { "$ref": "#/$defs/duration" } - ]}, - "deletionOnFailureGracePeriodDuration" : { "oneOf" : [ - { "type": "null" }, - { "$ref": "#/$defs/duration" } - ]}, - "successTTLDuration" : { "oneOf" : [ - { "type": "null" }, - { "$ref": "#/$defs/duration" } - ]} - }, - - "if": { - "properties": { - "numPods": { "const": 1 } - } - }, - "then": { - "properties": { - "numRoceGdr": { "const": 0 } - } - }, - - "$defs": { - "rfc1123Label": { - "type": "string", - "pattern": "^[a-z]([-a-z0-9]*[a-z0-9])?$", - "minLength": 1, - "maxLength": 63 - }, - "resourceCPU": { - "oneOf": [ - { "type": "integer", "mimimum": 1 }, - { "type": "string", "pattern": "^[0-9]+?(Ki|Mi|Gi|Ti|Pi|Ei|m|k|M|G|T|P|E)$" } - ] - }, - "resourceMemory": { - "type": "string", - "pattern": "^[0-9]+?(Ki|Mi|Gi|Ti|Pi|Ei|m|k|M|G|T|P|E)$" - }, - "duration": { - "type": "string", - "pattern": "^[0-9]+?(s|m|h|d)$" - } - } -} diff --git a/tools/pytorchjob-generator/chart/values.yaml b/tools/pytorchjob-generator/chart/values.yaml deleted file mode 100644 index 0b60656..0000000 --- a/tools/pytorchjob-generator/chart/values.yaml +++ /dev/null @@ -1,281 +0,0 @@ -#################### -# Job 
Metadata -#################### - -# -- (string) Name of the Job. Will be the name of the AppWrapper and the PyTorchJob. -# @default -- must be provided by user -# @section -- Job Metadata -jobName: - -# -- (string) Namespace in which to run the Job. If unspecified, the namespace will be inferred using normal Helm/Kubernetes mechanisms when the Job is submitted. -# @section -- Job Metadata -namespace: - -# -- (string) Name of the local queue to which the Job will be submitted. -# @section -- Job Metadata -queueName: "default-queue" - -# -- (string) Type of priority for the job (choose from: "default-priority", "low-priority" or "high-priority"). -# @section -- Job Metadata -priority: "default-priority" - -# -- (array) Optional array of custom labels to add to all the resources created by the Job (the PyTorchJob, the PodGroup, and the AppWrapper). -# @section -- Job Metadata -customLabels: -# - key: project-name -# value: my-project -# - key: oranization-name -# value: my-organization - -# -- (string) Image used for creating the Job's containers (needs to have all the applications your job may need) -# @default -- must be provided by the user -# @section -- Job Metadata -containerImage: - -# -- (array) List of image-pull-secrets to be used for pulling containerImages -# @section -- Job Metadata -imagePullSecrets: # -# - name: secret-one -# - name: secret-two - -# -- (string) Policy for pulling containerImages (choose from: "IfNotPresent", "Always", or "Never") -# @section -- Job Metadata -imagePullPolicy: IfNotPresent - -################################## -# Resource Requirements -################################## - -# -- (integer) Total number of pods (i.e. master + worker pods) to be created -# @section -- Resource Requirements -numPods: 1 - -# -- (integer or string) Number of CPUs for each pod. May be a positive integer or a ResourceQuantity (eg 500m) -# @section -- Resource Requirements -numCpusPerPod: 1 - -# -- (integer) Number of GPUs for each pod (all GPUs per node is currently recommended for distributed training). -# @section -- Resource Requirements -numGpusPerPod: 0 - -# -- (string) Total memory for each pod expressed as a ResourceQuantity (eg 1Gi, 200M, etc.). -# @section -- Resource Requirements -totalMemoryPerPod: 1Gi - -# -- (integer or string) Limit on the number of CPUs per pod for elastic jobs. May be a positive integer or a ResourceQuantity (eg 500m). -# @default -- numCpusPerPod -# @section -- Resource Requirements -limitCpusPerPod: - -# -- (integer) Limit of number of GPUs per pod for elastic jobs. -# @default -- numGpusPerPod -# @section -- Resource Requirements -limitGpusPerPod: # Limit of number of GPUs per pod for elastic jobs. - -# -- (string) Limit of total memory per pod for elastic jobs (eg 1Gi, 200M, etc.). -# @default -- totalMemoryPerPod -# @section -- Resource Requirements -limitMemoryPerPod: # Limit of total memory per pod for elastic jobs - -######################## -# Workload Specification -######################## - -# -- (array) List of variables/values to be defined for all the ranks. Values can be literals or -# references to Kuberetes secrets or configmaps. See [values.yaml](values.yaml) for examples of supported syntaxes. -# -# NOTE: The following standard [PyTorch Distributed environment variables](https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization) -# are set automatically and can be referenced in the commands without being set manually: WORLD_SIZE, RANK, MASTER_ADDR, MASTER_PORT. 
-# @section -- Workload Specification -environmentVariables: -# - name: EXAMPLE_VAR1 -# value: 6 -# - name: EXAMPLE_VAR2 -# value: "example2string" -# - name: EXAMPLE_VAR3 -# secret: -# name: secret-name -# key: secret-key -# - name: EXAMPLE_VAR4 -# configmap: -# name: configmap-name -# key: configmap-key - -# Private GitHub clone support. -# -# 0) Create a secret and configMap to enable Private GitHub cloning as documented for your organization. -# 1) Then fill the name of the secret and configMap below in sshGitCloneConfig -# 2) Finally, add your (ssh) git clone command to setupCommands in the next section -# - -# -- (object) Private GitHub clone support. See [values.yaml](values.yaml) for additional instructions. -# @section -- Workload Specification -sshGitCloneConfig: # Field with "(secretName, configMapName)", optionally "(secretName, configMapName, secretMountPath, configMapMountPath, sshCmd)" -# secretName: # see steps 1-3 of detailed instructions -# configMapName: # see step 4 of detailed instructions. -# secretMountPath: # -# configMapMountPath: # -# sshCmd: # - -# Commands -# -# Any command can be listed here -# -# -- (array) List of custom commands to be ran at the beginning of the execution. Use `setupCommand` to clone code, download data, and change directories. -# @default -- no custom commands are executed -# @section -- Workload Specification -setupCommands: # -# - git clone https://github.com/dbarnett/python-helloworld -# - cd python-helloworld - -# Main PyTorch Program -# -# Single command to be fed to `torchrun`. Use setupCommands instead -# if main program should be executed with any entry-point other than `torchrun` -# e.g. `fairseq`, `colossialai`, `torch.distributed.launch` ... -# -# -- (string) Name of the PyTorch program to be executed by `torchrun`. Please provide your program name here and NOT in "setupCommands" as this helm template provides the necessary "torchrun" arguments for the parallel execution. WARNING: this program is relative to the current path set by change-of-directory commands in "setupCommands". -# If no value is provided; then only `setupCommands` are executed and torchrun is elided. -# @section -- Workload Specification -mainProgram: # - -# -- (array) List of "(name, claimName, mountPath)" of volumes, with persistentVolumeClaim, to be mounted to the infrastructure -# @default -- No volumes are mounted -# @section -- Workload Specification -volumes: -# - name: arbitrary-name-0 -# claimName: name-matching-the-actual-PersistentVolumeClaim -# mountPath: /path/to/where/you/want/to/find/your/data -# - name: arbitrary-name-1 -# claimName: name-matching-another-actual-PersistentVolumeClaim -# mountPath: /path/to/where/you/want/to/find/your/data - -# ------------------------------------------------------------------------------------------------ -# Advanced options begin here -# - -# GDR support -# -# -- (string) RoCE GDR resource name (can vary by cluster configuration) -# @default -- nvidia.com/roce_gdr -# @section -- Advanced Options -roceGdrResName: # - -# -- (integer) number of nvidia.com/roce_grd resources (0 means disabled; >0 means enable GDR over RoCE). Must be 0 unless numPods > 1. -# @section -- Advanced Options -numRoceGdr: 0 - -# -- (string) Name of configmap containining /var/run/nvidia-topologyd/virtualTopology.xml for the system e.g. nvidia-topo-gdr -# @section -- Advanced Options -topologyFileConfigMap: # TODO make this required if numRoceGdr > 0 ? 
- -# -- (string) Name of configmap containing NCCL networking environment variables for the system e.g. nccl-netwk-env-vars -# @section -- Advanced Options -ncclGdrEnvConfigMap: # TODO make this required if numRoceGdr > 0 ? - -# -- (string) Name of multi-NIC network, if one is available. -# Note: when GDR over RoCE is used/available, the RoCE multi-nic network instance -# should be specified here instead of the TCP multi-nic network instance. -# Existing instance names can be listed with `oc get multinicnetwork`. -# -# @section -- Advanced Options -multiNicNetworkName: - -# -- (boolean) Control whether or not a shared memory volume is added to the PyTorchJob. -# @section -- Advanced Options -disableSharedMemory: false - -# -- (object) Mount NVMe as a volume. -# The environment variable MOUNT_PATH_NVME provides the runtime mount path -# @section -- Advanced Options -mountNVMe: - # storage: 800Gi - # mountPath: "/workspace/scratch-nvme" - -# -- (array) List of "(name, image, command[])" specifying an init containers to be run before the main job. The 'command' field is a list of commands to run in the container, see the Kubernetes entry on initContainers for reference. -# -# @section -- Advanced Options -initContainers: -# - name: init-container-1 -# image: busybox -# command: ["sh", "-c", "whoami && ls -l"] -# - name: init-container-2 -# image: ubuntu -# command: ["sh", "-c", "echo hello world!"] - -# -- (array) Autopilot health checks. -# List of labels enabling one or more system health pre-flight checks. -# @default -- No pre-flight checks are enabled. -# @section -- Advanced Options -autopilotHealthChecks: -# - gpu-pcie-bw - -# -- (array) List of host names on which the Job must not be scheduled (to avoid faulty nodes). -# @section -- Advanced Options -hostIgnoreList: -# - a100-large-drlfv-worker-3-with-secondary-nw5qh -# - a100-large-drlfv-worker-3-with-secondary-lb7ch - -# -- (string) If non-nil, use the specified Kubernetes scheduler. -# ***Setting this to the default-scheduler may result in GPU fragmentation on the cluster. Setting this -# to any non-nil value should only be done when explicitly directed to do so by a cluster admin!*** -# @section -- Advanced Options -schedulerName: - -# -- (string) Service account to be used for running the Job -# @section -- Advanced Options -# @default -- the default service account for the namespace will be used. 
-serviceAccountName: # service account name - -############################ -# Fault Tolerance -############################ - -# -- (string) Customize the admissionGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ -# @section -- Fault Tolerance -# @default -- The AppWrapper defaults will be used -admissionGracePeriodDuration: - -# -- (string) Customize the warmupGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ -# @section -- Fault Tolerance -# @default -- The AppWrapper defaults will be used -warmupGracePeriodDuration: - -# -- (string) Customize the failureGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ -# @section -- Fault Tolerance -# @default -- The AppWrapper defaults will be used -failureGracePeriodDuration: - -# -- (string) Customize the retryPausePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ -# @section -- Fault Tolerance -# @default -- The AppWrapper defaults will be used -retryPausePeriodDuration: - -# -- (integer) Customize the retryLimit; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ -# @section -- Fault Tolerance -# @default -- The AppWrapper defaults will be used -retryLimit: - -# -- (string) Customize the forcefulDeletionGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ -# @section -- Fault Tolerance -# @default -- The AppWrapper defaults will be used -forcefulDeletionGracePeriodDuration: - -# -- (string) Customize the deletionOnFailureGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ -# @section -- Fault Tolerance -# @default -- The AppWrapper defaults will be used -deletionOnFailureGracePeriodDuration: - -# -- (string) Customize the successTTL; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ -# @section -- Fault Tolerance -# @default -- The AppWrapper defaults will be used -successTTLDuration: - -# -- (string) Set Kubernetes policy for restarting failed containers "in place" (without restarting the Pod). -# @section -- Fault Tolerance -restartPolicy: "Never" - -# -- (integer) Set a non-default pod termination grace period (in seconds). 
-# @default -- Kubernetes's default value is used -# @section -- Fault Tolerance -terminationGracePeriodSeconds: diff --git a/tools/pytorchjob-generator/examples/helloworld-sakkara.settings.yaml b/tools/pytorchjob-generator/examples/helloworld-sakkara.settings.yaml deleted file mode 100644 index 67c83cc..0000000 --- a/tools/pytorchjob-generator/examples/helloworld-sakkara.settings.yaml +++ /dev/null @@ -1,26 +0,0 @@ -jobName: my-job # name of the generated AppWrapper and PyTorchJob objects (required) -queueName: default-queue # local queue to submit to (default: default-queue) - -schedulerName: sakkara -# If additional constraints are used, specify the configmap here: -#customLabels: -# - key: sakkara.group.name -# value: my-topogrp-0 - -numPods: 4 # total pod count including master and worker pods (default: 1) -numCpusPerPod: 500m # requested number of cpus per pod (default: 1) -numGpusPerPod: 8 # requested number of gpus per pod (default: 0) -totalMemoryPerPod: 1Gi # requested amount of memory per pod (default: 1Gi) - -priority: default-priority # default-priority (default), low-priority, or high-priority - -# container image for the pods (required) -containerImage: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - -# setup commands to run in each pod (optional) -setupCommands: -- git clone https://github.com/dbarnett/python-helloworld -- cd python-helloworld - -# main program to invoke via torchrun (optional) -mainProgram: helloworld.py diff --git a/tools/pytorchjob-generator/examples/helloworld.settings.yaml b/tools/pytorchjob-generator/examples/helloworld.settings.yaml deleted file mode 100644 index a027d91..0000000 --- a/tools/pytorchjob-generator/examples/helloworld.settings.yaml +++ /dev/null @@ -1,20 +0,0 @@ -jobName: my-job # name of the generated AppWrapper and PyTorchJob objects (required) -queueName: default-queue # local queue to submit to (default: default-queue) - -numPods: 4 # total pod count including master and worker pods (default: 1) -numCpusPerPod: 500m # requested number of cpus per pod (default: 1) -numGpusPerPod: 8 # requested number of gpus per pod (default: 0) -totalMemoryPerPod: 1Gi # requested amount of memory per pod (default: 1Gi) - -priority: default-priority # default-priority (default), low-priority, or high-priority - -# container image for the pods (required) -containerImage: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126 - -# setup commands to run in each pod (optional) -setupCommands: -- git clone https://github.com/dbarnett/python-helloworld -- cd python-helloworld - -# main program to invoke via torchrun (optional) -mainProgram: helloworld.py diff --git a/tools/pytorchjob-generator/release-instructions.md b/tools/pytorchjob-generator/release-instructions.md deleted file mode 100644 index fb2a0ea..0000000 --- a/tools/pytorchjob-generator/release-instructions.md +++ /dev/null @@ -1,20 +0,0 @@ -## Release Instructions - -1. Create a release prep branch - -2. Update the version number in chart/Chart.yaml - -3. Do a `helm unittest -u chart` and then run precommit to - regenerate the helmdocs. Inspect the diff and make sure - the only changes are the Chart version - -4. Update the chart version number in the example - of `helm repo search` in the main README.md - -5. Submit & merge a PR with these changes - -6. Manually trigger the `Release Charts` workflow in the Actions - tab of the MLBatch GitHub project. 
This action will automatically - generate and push tags for the newly released chart and trigger an - update of the GH Pages (which contains the helm repo). - diff --git a/tools/sakkara-deploy/README.md b/tools/sakkara-deploy/README.md deleted file mode 100644 index ea531f0..0000000 --- a/tools/sakkara-deploy/README.md +++ /dev/null @@ -1,3 +0,0 @@ -The helm/chart-installer-action does not understand git submodules. - -Therfore we maintain a copy of https://github.com/atantawi/sakkara-deploy/tree/main/install/ here. \ No newline at end of file diff --git a/tools/sakkara-deploy/release-instructions.md b/tools/sakkara-deploy/release-instructions.md deleted file mode 100644 index fb2a0ea..0000000 --- a/tools/sakkara-deploy/release-instructions.md +++ /dev/null @@ -1,20 +0,0 @@ -## Release Instructions - -1. Create a release prep branch - -2. Update the version number in chart/Chart.yaml - -3. Do a `helm unittest -u chart` and then run precommit to - regenerate the helmdocs. Inspect the diff and make sure - the only changes are the Chart version - -4. Update the chart version number in the example - of `helm repo search` in the main README.md - -5. Submit & merge a PR with these changes - -6. Manually trigger the `Release Charts` workflow in the Actions - tab of the MLBatch GitHub project. This action will automatically - generate and push tags for the newly released chart and trigger an - update of the GH Pages (which contains the helm repo). - diff --git a/tools/sakkara-deploy/sakkara-scheduler/.helmignore b/tools/sakkara-deploy/sakkara-scheduler/.helmignore deleted file mode 100644 index 0e8a0eb..0000000 --- a/tools/sakkara-deploy/sakkara-scheduler/.helmignore +++ /dev/null @@ -1,23 +0,0 @@ -# Patterns to ignore when building packages. -# This supports shell glob matching, relative path matching, and -# negation (prefixed with !). Only one pattern per line. 
-.DS_Store -# Common VCS dirs -.git/ -.gitignore -.bzr/ -.bzrignore -.hg/ -.hgignore -.svn/ -# Common backup files -*.swp -*.bak -*.tmp -*.orig -*~ -# Various IDEs -.project -.idea/ -*.tmproj -.vscode/ diff --git a/tools/sakkara-deploy/sakkara-scheduler/Chart.yaml b/tools/sakkara-deploy/sakkara-scheduler/Chart.yaml deleted file mode 100644 index 347ee24..0000000 --- a/tools/sakkara-deploy/sakkara-scheduler/Chart.yaml +++ /dev/null @@ -1,6 +0,0 @@ -apiVersion: v2 -appVersion: v0.29.7 -description: Deploy sakkara group and topology aware scheduler plugin in a cluster -name: sakkara-scheduler -type: application -version: 0.0.1 diff --git a/tools/sakkara-deploy/sakkara-scheduler/README.md b/tools/sakkara-deploy/sakkara-scheduler/README.md deleted file mode 100644 index caec5b5..0000000 --- a/tools/sakkara-deploy/sakkara-scheduler/README.md +++ /dev/null @@ -1,46 +0,0 @@ -# sakkara-scheduler - -![Version: 0.0.1](https://img.shields.io/badge/Version-0.0.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v0.29.7](https://img.shields.io/badge/AppVersion-v0.29.7-informational?style=flat-square) - -Deploy sakkara group and topology aware scheduler plugin in a cluster - -## Values - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| fullnameOverride | string | `""` | | -| image.repository | string | `"quay.io"` | repository to fetch images from | -| image.tag | string | `"v0.0.1"` | default is the chart appVersion | -| nameOverride | string | `"sakkara"` | | -| nodeSelector | object | `{}` | | -| pluginConfig[0].args.topologyConfigMapNameSpace | string | `"sakkara-scheduler"` | | -| pluginConfig[0].name | string | `"ClusterTopologyPlacementGroup"` | | -| plugins.permit.enabled[0].name | string | `"ClusterTopologyPlacementGroup"` | | -| plugins.postBind.enabled[0].name | string | `"ClusterTopologyPlacementGroup"` | | -| plugins.postFilter.enabled[0].name | string | `"ClusterTopologyPlacementGroup"` | | -| plugins.preEnqueue.enabled[0].name | string | `"ClusterTopologyPlacementGroup"` | | -| plugins.preScore.enabled[0].name | string | `"ClusterTopologyPlacementGroup"` | | -| plugins.queueSort.disabled[0].name | string | `"*"` | | -| plugins.queueSort.enabled[0].name | string | `"ClusterTopologyPlacementGroup"` | | -| plugins.reserve.enabled[0].name | string | `"ClusterTopologyPlacementGroup"` | | -| plugins.score.disabled[0].name | string | `"*"` | | -| plugins.score.enabled[0].name | string | `"ClusterTopologyPlacementGroup"` | | -| plugins.score.enabled[0].weight | int | `10` | | -| podAnnotations | object | `{}` | | -| priorityClassName | string | `"system-node-critical"` | | -| scheduler.affinity | object | `{}` | affinity for deployment's pods | -| scheduler.enabled | bool | `true` | deploy second scheduler as deployment | -| scheduler.image | string | `"ibm/sakkara-scheduler"` | path to scheduler image from repository | -| scheduler.imagePullPolicy | string | `"IfNotPresent"` | | -| scheduler.leaderElect | bool | `false` | enable for HA mode | -| scheduler.replicaCount | int | `1` | increase for HA mode | -| scheduler.resources | object | `{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"200m","memory":"512Mi"}}` | requests/limits for scheduler deployment resources: {} | -| scheduler.strategy.type | string | `"RollingUpdate"` | Deployment update strategy type | -| scheduler.verbosity | int | `6` | Log level from 1 to 9 | -| schedulerConfig.apiVersion | 
string | `"kubescheduler.config.k8s.io/v1"` | scheduler config apiversion (ref: https://kubernetes.io/docs/reference/scheduling/config/) | -| securityContext.privileged | bool | `false` | | -| tolerations | list | `[]` | | -| useForKubeSchedulerUser | bool | `false` | allow User system:kube-scheduler to work with metrics and CRDs. primary usage is to replace default-scheduler with custom one | - ----------------------------------------------- -Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2) diff --git a/tools/sakkara-deploy/sakkara-scheduler/crds/scheduling.x-k8s.io_podgroups.yaml b/tools/sakkara-deploy/sakkara-scheduler/crds/scheduling.x-k8s.io_podgroups.yaml deleted file mode 100644 index a0790dc..0000000 --- a/tools/sakkara-deploy/sakkara-scheduler/crds/scheduling.x-k8s.io_podgroups.yaml +++ /dev/null @@ -1,97 +0,0 @@ ---- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - api-approved.kubernetes.io: https://github.com/kubernetes-sigs/scheduler-plugins/pull/50 - controller-gen.kubebuilder.io/version: v0.11.1 - creationTimestamp: null - name: podgroups.scheduling.x-k8s.io -spec: - group: scheduling.x-k8s.io - names: - kind: PodGroup - listKind: PodGroupList - plural: podgroups - shortNames: - - pg - - pgs - singular: podgroup - scope: Namespaced - versions: - - name: v1alpha1 - schema: - openAPIV3Schema: - description: PodGroup is a collection of Pod; used for batch workload. - properties: - apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' - type: string - kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - metadata: - type: object - spec: - description: Specification of the desired behavior of the pod group. - properties: - minMember: - description: MinMember defines the minimal number of members/tasks - to run the pod group; if there's not enough resources to start all - tasks, the scheduler will not start anyone. - format: int32 - type: integer - minResources: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: MinResources defines the minimal resource of members/tasks - to run the pod group; if there's not enough resources to start all - tasks, the scheduler will not start anyone. - type: object - scheduleTimeoutSeconds: - description: ScheduleTimeoutSeconds defines the maximal time of members/tasks - to wait before run the pod group; - format: int32 - type: integer - type: object - status: - description: Status represents the current information about a pod group. - This data may not be up to date. - properties: - failed: - description: The number of pods which reached phase Failed. - format: int32 - type: integer - occupiedBy: - description: OccupiedBy marks the workload (e.g., deployment, statefulset) - UID that occupy the podgroup. 
It is empty if not initialized. - type: string - phase: - description: Current phase of PodGroup. - type: string - running: - description: The number of actively running pods. - format: int32 - type: integer - scheduleStartTime: - description: ScheduleStartTime of the group - format: date-time - type: string - succeeded: - description: The number of pods which reached phase Succeeded. - format: int32 - type: integer - type: object - type: object - served: true - storage: true - subresources: - status: {} diff --git a/tools/sakkara-deploy/sakkara-scheduler/templates/_helpers.tpl b/tools/sakkara-deploy/sakkara-scheduler/templates/_helpers.tpl deleted file mode 100644 index 8edc98d..0000000 --- a/tools/sakkara-deploy/sakkara-scheduler/templates/_helpers.tpl +++ /dev/null @@ -1,51 +0,0 @@ -{{/* -Expand the name of the chart. -*/}} -{{- define "scheduler-plugins.name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Create a default fully qualified app name. -We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -If release name contains chart name it will be used as a full name. -*/}} -{{- define "scheduler-plugins.fullname" -}} -{{- if .Values.fullnameOverride }} -{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- $name := default .Chart.Name .Values.nameOverride }} -{{- if contains $name .Release.Name }} -{{- .Release.Name | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} -{{- end }} -{{- end }} -{{- end }} - -{{/* -Create chart name and version as used by the chart label. -*/}} -{{- define "scheduler-plugins.chart" -}} -{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Common labels -*/}} -{{- define "scheduler-plugins.labels" -}} -helm.sh/chart: {{ include "scheduler-plugins.chart" . }} -{{ include "scheduler-plugins.selectorLabels" . }} -{{- if .Chart.AppVersion }} -app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} -{{- end }} -app.kubernetes.io/managed-by: {{ .Release.Service }} -{{- end }} - -{{/* -Selector labels -*/}} -{{- define "scheduler-plugins.selectorLabels" -}} -app.kubernetes.io/name: {{ include "scheduler-plugins.name" . }} -app.kubernetes.io/instance: {{ .Release.Name }} -{{- end }} diff --git a/tools/sakkara-deploy/sakkara-scheduler/templates/configmap.yaml b/tools/sakkara-deploy/sakkara-scheduler/templates/configmap.yaml deleted file mode 100644 index 5adb1a8..0000000 --- a/tools/sakkara-deploy/sakkara-scheduler/templates/configmap.yaml +++ /dev/null @@ -1,22 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "scheduler-plugins.fullname" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "scheduler-plugins.labels" . | nindent 4 }} -data: - scheduler-config.yaml: | - apiVersion: {{ .Values.schedulerConfig.apiVersion }} - kind: KubeSchedulerConfiguration - leaderElection: - leaderElect: {{ .Values.scheduler.leaderElect }} - resourceName: {{ include "scheduler-plugins.fullname" . }} - profiles: - # Compose all plugins in one profile - - schedulerName: {{ include "scheduler-plugins.fullname" . 
}} - plugins: - {{- toYaml $.Values.plugins | nindent 8 }} - {{- if $.Values.pluginConfig }} - pluginConfig: {{ toYaml $.Values.pluginConfig | nindent 6 }} -{{- end }} diff --git a/tools/sakkara-deploy/sakkara-scheduler/templates/deployment.yaml b/tools/sakkara-deploy/sakkara-scheduler/templates/deployment.yaml deleted file mode 100644 index 9951018..0000000 --- a/tools/sakkara-deploy/sakkara-scheduler/templates/deployment.yaml +++ /dev/null @@ -1,66 +0,0 @@ -{{- if .Values.scheduler.enabled }} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "scheduler-plugins.fullname" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "scheduler-plugins.labels" . | nindent 4 }} - component: scheduler -spec: - replicas: {{ .Values.scheduler.replicaCount }} - {{- with .Values.scheduler.strategy }} - strategy: - {{- toYaml . | nindent 4 }} - {{- end }} - selector: - matchLabels: - {{- include "scheduler-plugins.selectorLabels" . | nindent 6 }} - component: scheduler - template: - metadata: - annotations: - checksum/configmap: '{{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}' - {{- with .Values.podAnnotations }} - {{- toYaml . | nindent 8 }} - {{- end }} - labels: - {{- include "scheduler-plugins.selectorLabels" . | nindent 8 }} - component: scheduler - spec: - priorityClassName: {{ .Values.priorityClassName }} - serviceAccountName: {{ include "scheduler-plugins.fullname" . }} - containers: - - command: - - /bin/kube-scheduler - - --config=/etc/kubernetes/scheduler-config.yaml - - --v={{ .Values.scheduler.verbosity }} - name: scheduler - image: "{{ .Values.image.repository }}/{{ .Values.scheduler.image }}:{{ .Values.image.tag | default .Chart.AppVersion }}" - imagePullPolicy: {{ .Values.scheduler.imagePullPolicy }} - resources: - {{- toYaml .Values.scheduler.resources | nindent 12 }} - securityContext: - {{- toYaml .Values.securityContext | nindent 12 }} - volumeMounts: - - name: scheduler-config - mountPath: /etc/kubernetes - readOnly: true - {{- with .Values.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.scheduler.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} - volumes: - - name: scheduler-config - configMap: - name: {{ include "scheduler-plugins.fullname" . }} -{{- end }} diff --git a/tools/sakkara-deploy/sakkara-scheduler/templates/rbac.yaml b/tools/sakkara-deploy/sakkara-scheduler/templates/rbac.yaml deleted file mode 100644 index 17b88c6..0000000 --- a/tools/sakkara-deploy/sakkara-scheduler/templates/rbac.yaml +++ /dev/null @@ -1,126 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: {{ include "scheduler-plugins.fullname" . }} - labels: - {{- include "scheduler-plugins.labels" . 
| nindent 4 }} -rules: - - apiGroups: ["", "events.k8s.io"] - resources: ["events"] - verbs: ["create", "patch", "update"] - - apiGroups: ["coordination.k8s.io"] - resources: ["leases"] - verbs: ["create"] - - apiGroups: ["coordination.k8s.io"] - resourceNames: ["kube-scheduler"] - resources: ["leases"] - verbs: ["get", "update"] - - apiGroups: [""] - resources: ["endpoints"] - verbs: ["create"] - - apiGroups: [""] - resourceNames: ["kube-scheduler"] - resources: ["endpoints"] - verbs: ["get", "update"] - - apiGroups: [""] - resources: ["nodes"] - verbs: ["get", "list", "watch"] - - apiGroups: [""] - resources: ["pods"] - verbs: ["delete", "get", "list", "watch", "patch", "update"] - - apiGroups: [""] - resources: ["bindings", "pods/binding"] - verbs: ["create"] - - apiGroups: [""] - resources: ["pods/status"] - verbs: ["patch", "update"] - - apiGroups: [""] - resources: ["replicationcontrollers", "services"] - verbs: ["get", "list", "watch"] - - apiGroups: ["apps", "extensions"] - resources: ["replicasets"] - verbs: ["get", "list", "watch"] - - apiGroups: ["apps"] - resources: ["statefulsets"] - verbs: ["get", "list", "watch"] - - apiGroups: ["policy"] - resources: ["poddisruptionbudgets"] - verbs: ["get", "list", "watch"] - - apiGroups: [""] - resources: ["persistentvolumeclaims", "persistentvolumes"] - verbs: ["get", "list", "watch", "patch", "update"] - - apiGroups: ["authentication.k8s.io"] - resources: ["tokenreviews"] - verbs: ["create"] - - apiGroups: ["authorization.k8s.io"] - resources: ["subjectaccessreviews"] - verbs: ["create"] - - apiGroups: ["storage.k8s.io"] - resources: ["csinodes", "storageclasses"] - verbs: ["get", "list", "watch"] - - apiGroups: ["scheduling.x-k8s.io"] - resources: ["podgroups", "elasticquotas"] - verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] - - apiGroups: [""] - resources: ["events"] - verbs: ["create", "patch", "update"] - - apiGroups: [""] - resources: ["configmaps", "storageclasses"] - verbs: ["get", "list", "watch", "patch", "update"] - - apiGroups: [""] - resources: ["endpoints"] - verbs: ["create", "get", "list", "watch", "update"] - - apiGroups: [""] - resourceNames: ["kube-scheduler"] - resources: ["endpoints"] - verbs: ["get", "delete", "update", "patch"] - - apiGroups: [""] - resources: ["bindings","pods/binding"] - verbs: ["create"] - - apiGroups: [""] - resources: ["poddisruptionbudgets"] - verbs: ["get", "list", "watch"] - - apiGroups: [""] - resources: ["namespaces"] - verbs: ["get", "list", "watch"] - - apiGroups: [""] - resources: ["csistoragecapacities"] - verbs: ["get", "list"] - - apiGroups: ["storage.k8s.io"] - resources: ["csidrivers", "csistoragecapacities"] - verbs: ["get", "list", "watch"] - - apiGroups: ["metrics.k8s.io"] - resources: ["nodes"] - verbs: ["get", "list", "watch"] ---- -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: {{ include "scheduler-plugins.fullname" . }} - labels: - {{- include "scheduler-plugins.labels" . | nindent 4 }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: {{ include "scheduler-plugins.fullname" . }} -subjects: -- kind: ServiceAccount - name: {{ include "scheduler-plugins.fullname" . }} - namespace: {{ .Release.Namespace }} -{{- if .Values.useForKubeSchedulerUser }} ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - {{- include "scheduler-plugins.labels" . 
| nindent 4 }} - name: scheduler-plugins-kube-scheduler -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: {{ include "scheduler-plugins.fullname" . }} -subjects: -- apiGroup: rbac.authorization.k8s.io - kind: User - name: system:kube-scheduler -{{- end }} diff --git a/tools/sakkara-deploy/sakkara-scheduler/templates/serviceaccount.yaml b/tools/sakkara-deploy/sakkara-scheduler/templates/serviceaccount.yaml deleted file mode 100644 index 8963040..0000000 --- a/tools/sakkara-deploy/sakkara-scheduler/templates/serviceaccount.yaml +++ /dev/null @@ -1,8 +0,0 @@ ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ include "scheduler-plugins.fullname" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "scheduler-plugins.labels" . | nindent 4 }} diff --git a/tools/sakkara-deploy/sakkara-scheduler/values.yaml b/tools/sakkara-deploy/sakkara-scheduler/values.yaml deleted file mode 100644 index f751fe8..0000000 --- a/tools/sakkara-deploy/sakkara-scheduler/values.yaml +++ /dev/null @@ -1,92 +0,0 @@ -nameOverride: "sakkara" -fullnameOverride: "" - -image: - # -- repository to fetch images from - repository: quay.io - # -- default is the chart appVersion - tag: "v0.0.1" - -# -- allow User system:kube-scheduler to work with metrics and CRDs. -# primary usage is to replace default-scheduler with custom one -useForKubeSchedulerUser: false - -scheduler: - # -- deploy second scheduler as deployment - enabled: true - # -- path to scheduler image from repository - image: ibm/sakkara-scheduler - imagePullPolicy: IfNotPresent - # -- increase for HA mode - replicaCount: 1 - # -- enable for HA mode - leaderElect: false - # -- Log level from 1 to 9 - verbosity: 6 - strategy: - # -- Deployment update strategy type - type: RollingUpdate - # -- requests/limits for scheduler deployment - # resources: {} - resources: - requests: - cpu: "200m" - memory: "512Mi" - limits: - cpu: "500m" - memory: "512Mi" - # -- affinity for deployment's pods - affinity: {} - -priorityClassName: system-node-critical - -tolerations: [] - -nodeSelector: {} - -podAnnotations: {} - -securityContext: - privileged: false - -# scheduler framework plugins -plugins: - preEnqueue: - enabled: - - name: ClusterTopologyPlacementGroup - queueSort: - enabled: - - name: ClusterTopologyPlacementGroup - disabled: - - name: "*" - postFilter: - enabled: - - name: ClusterTopologyPlacementGroup - preScore: - enabled: - - name: ClusterTopologyPlacementGroup - score: - enabled: - - name: ClusterTopologyPlacementGroup - weight: 10 - disabled: - - name: "*" - reserve: - enabled: - - name: ClusterTopologyPlacementGroup - permit: - enabled: - - name: ClusterTopologyPlacementGroup - postBind: - enabled: - - name: ClusterTopologyPlacementGroup - -# plugin specific args -pluginConfig: -- name: ClusterTopologyPlacementGroup - args: - topologyConfigMapNameSpace: sakkara-scheduler - -schedulerConfig: - # -- scheduler config apiversion (ref: https://kubernetes.io/docs/reference/scheduling/config/) - apiVersion: kubescheduler.config.k8s.io/v1