Skip to content

Commit

Permalink
Merge branch 'gdrcopy' into 'master'
Browse files Browse the repository at this point in the history
Add support for deploying GDRCopy driver in the driver daemonset

See merge request nvidia/kubernetes/gpu-operator!997
  • Loading branch information
cdesiniotis committed Jan 23, 2024
2 parents 0917d1f + 86ea0b5 commit 17a2236
Show file tree
Hide file tree
Showing 10 changed files with 419 additions and 4 deletions.
65 changes: 63 additions & 2 deletions api/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ type ClusterPolicySpec struct {
Validator ValidatorSpec `json:"validator,omitempty"`
// GPUDirectStorage defines the spec for GDS components(Experimental)
GPUDirectStorage *GPUDirectStorageSpec `json:"gds,omitempty"`
// GDRCopy component spec
GDRCopy *GDRCopySpec `json:"gdrcopy,omitempty"`
// SandboxWorkloads defines the spec for handling sandbox workloads (i.e. Virtual Machines)
SandboxWorkloads SandboxWorkloadsSpec `json:"sandboxWorkloads,omitempty"`
// VFIOManager for configuration to deploy VFIO-PCI Manager
Expand Down Expand Up @@ -1258,6 +1260,53 @@ type GPUDirectStorageSpec struct {
Env []EnvVar `json:"env,omitempty"`
}

// GDRCopySpec defines the properties for NVIDIA GDRCopy driver (gdrdrv) deployment.
//
// Every field carries a JSON tag with omitempty. Enabled is a *bool, so an
// unset value stays distinguishable from an explicit false; IsEnabled reports
// the unset case as disabled. Repository, Image and Version are the three
// values ImagePath passes to imagePath, together with the "GDRCOPY_IMAGE"
// key, to resolve the container image.
// NOTE(review): "GDRCOPY_IMAGE" is presumably an environment-variable
// fallback for the image path - confirm in imagePath.
type GDRCopySpec struct {
// Enabled indicates if GDRCopy is enabled through GPU Operator
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable GDRCopy through GPU operator"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
Enabled *bool `json:"enabled,omitempty"`

// NVIDIA GDRCopy driver image repository
// +kubebuilder:validation:Optional
Repository string `json:"repository,omitempty"`

// NVIDIA GDRCopy driver image name
// +kubebuilder:validation:Pattern=[a-zA-Z0-9\-]+
Image string `json:"image,omitempty"`

// NVIDIA GDRCopy driver image tag
// +kubebuilder:validation:Optional
Version string `json:"version,omitempty"`

// Image pull policy
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Image Pull Policy"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:imagePullPolicy"
ImagePullPolicy string `json:"imagePullPolicy,omitempty"`

// Image pull secrets
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Image pull secrets"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:io.kubernetes:Secret"
ImagePullSecrets []string `json:"imagePullSecrets,omitempty"`

// Optional: List of arguments
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Arguments"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text"
Args []string `json:"args,omitempty"`

// Optional: List of environment variables
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Environment Variables"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text"
Env []EnvVar `json:"env,omitempty"`
}

// MIGPartedConfigSpec defines custom mig-parted config for NVIDIA MIG Manager container
type MIGPartedConfigSpec struct {
// ConfigMap name
Expand Down Expand Up @@ -1703,6 +1752,9 @@ func ImagePath(spec interface{}) (string, error) {
case *GPUDirectStorageSpec:
config := spec.(*GPUDirectStorageSpec)
return imagePath(config.Repository, config.Image, config.Version, "GDS_IMAGE")
case *GDRCopySpec:
config := spec.(*GDRCopySpec)
return imagePath(config.Repository, config.Image, config.Version, "GDRCOPY_IMAGE")
case *VFIOManagerSpec:
config := spec.(*VFIOManagerSpec)
return imagePath(config.Repository, config.Image, config.Version, "VFIO_MANAGER_IMAGE")
Expand Down Expand Up @@ -1891,7 +1943,7 @@ func (m *NodeStatusExporterSpec) IsEnabled() bool {
return *m.Enabled
}

// IsEnabled returns true if GPUDirect RDMA are enabled through gpu-perator
// IsEnabled returns true if GPUDirect RDMA are enabled through gpu-operator
func (g *GPUDirectRDMASpec) IsEnabled() bool {
if g.Enabled == nil {
// GPUDirectRDMA is disabled by default
Expand All @@ -1900,7 +1952,7 @@ func (g *GPUDirectRDMASpec) IsEnabled() bool {
return *g.Enabled
}

// IsEnabled returns true if GPUDirect Storage are enabled through gpu-perator
// IsEnabled returns true if GPUDirect Storage are enabled through gpu-operator
func (gds *GPUDirectStorageSpec) IsEnabled() bool {
if gds.Enabled == nil {
// GPUDirectStorage is disabled by default
Expand Down Expand Up @@ -1931,6 +1983,15 @@ func (gds *GPUDirectStorageSpec) IsOpenKernelModulesRequired() bool {
return false
}

// IsEnabled returns true if GDRCopy is enabled through gpu-operator.
//
// The method is safe to call on a nil receiver: ClusterPolicySpec.GDRCopy is
// an optional pointer field, so it is nil whenever the gdrcopy section is
// omitted from the ClusterPolicy. Both a nil spec and a nil Enabled flag are
// reported as disabled.
func (gdrcopy *GDRCopySpec) IsEnabled() bool {
	if gdrcopy == nil || gdrcopy.Enabled == nil {
		// GDRCopy is disabled by default
		return false
	}
	return *gdrcopy.Enabled
}

// IsEnabled returns true if DCGM hostengine as a separate Pod is enabled through gpu-operator
func (dcgm *DCGMSpec) IsEnabled() bool {
if dcgm.Enabled == nil {
Expand Down
40 changes: 40 additions & 0 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

36 changes: 36 additions & 0 deletions assets/state-driver/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,42 @@ spec:
failureThreshold: 1
successThreshold: 1
timeoutSeconds: 10
# GDRCopy driver container (nvidia-gdrcopy-ctr). The image value below is a
# placeholder that the operator overwrites.
# NOTE(review): presumably this container is dropped when gdrcopy is not
# enabled in the ClusterPolicy - confirm in the operator's daemonset transform.
- image: "FILLED BY THE OPERATOR"
imagePullPolicy: IfNotPresent
name: nvidia-gdrcopy-ctr
command: [bash, -xc]
# Poll every 10s until /run/nvidia/driver/usr/src exists and lsmod output
# contains "nvidia", then replace the shell with
# `nvidia-gdrcopy-driver install`.
args: ["until [ -d /run/nvidia/driver/usr/src ] && lsmod | grep nvidia; do echo Waiting for nvidia-driver to be installed...; sleep 10; done; exec nvidia-gdrcopy-driver install"]
securityContext:
privileged: true
seLinuxOptions:
level: "s0"
volumeMounts:
# HostToContainer propagation makes mounts created under /run/nvidia on the
# host after this container starts visible inside it.
- name: run-nvidia
mountPath: /run/nvidia
mountPropagation: HostToContainer
- name: var-log
mountPath: /var/log
- name: dev-log
mountPath: /dev/log
readOnly: true
# Startup succeeds once lsmod lists gdrdrv. With a 10s initial delay, a 10s
# period and failureThreshold 120, the module has roughly 20 minutes to load.
startupProbe:
exec:
command:
[sh, -c, 'lsmod | grep gdrdrv']
initialDelaySeconds: 10
failureThreshold: 120
successThreshold: 1
periodSeconds: 10
timeoutSeconds: 10
# Liveness repeats the same gdrdrv check every 30s; failureThreshold 1 means
# a single failed check is enough to fail the probe.
livenessProbe:
exec:
command:
[sh, -c, 'lsmod | grep gdrdrv']
periodSeconds: 30
initialDelaySeconds: 30
failureThreshold: 1
successThreshold: 1
timeoutSeconds: 10
# Only kept when OpenShift DriverToolkit side-car is enabled.
- image: "FILLED BY THE OPERATOR"
imagePullPolicy: IfNotPresent
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,9 @@ metadata:
},
"gds": {
"enabled": false
},
"gdrcopy": {
"enabled": false
}
}
},
Expand Down
47 changes: 47 additions & 0 deletions bundle/manifests/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -960,6 +960,53 @@ spec:
type: string
type: object
type: object
gdrcopy:
description: GDRCopy component spec
properties:
args:
description: 'Optional: List of arguments'
items:
type: string
type: array
enabled:
description: Enabled indicates if GDRCopy is enabled through GPU
Operator
type: boolean
env:
description: 'Optional: List of environment variables'
items:
description: EnvVar represents an environment variable present
in a Container.
properties:
name:
description: Name of the environment variable.
type: string
value:
description: Value of the environment variable.
type: string
required:
- name
type: object
type: array
image:
description: NVIDIA GDRCopy driver image name
pattern: '[a-zA-Z0-9\-]+'
type: string
imagePullPolicy:
description: Image pull policy
type: string
imagePullSecrets:
description: Image pull secrets
items:
type: string
type: array
repository:
description: NVIDIA GDRCopy driver image repository
type: string
version:
description: NVIDIA GDRCopy driver image tag
type: string
type: object
gds:
description: GPUDirectStorage defines the spec for GDS components(Experimental)
properties:
Expand Down
47 changes: 47 additions & 0 deletions config/crd/bases/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -960,6 +960,53 @@ spec:
type: string
type: object
type: object
gdrcopy:
description: GDRCopy component spec
properties:
args:
description: 'Optional: List of arguments'
items:
type: string
type: array
enabled:
description: Enabled indicates if GDRCopy is enabled through GPU
Operator
type: boolean
env:
description: 'Optional: List of environment variables'
items:
description: EnvVar represents an environment variable present
in a Container.
properties:
name:
description: Name of the environment variable.
type: string
value:
description: Value of the environment variable.
type: string
required:
- name
type: object
type: array
image:
description: NVIDIA GDRCopy driver image name
pattern: '[a-zA-Z0-9\-]+'
type: string
imagePullPolicy:
description: Image pull policy
type: string
imagePullSecrets:
description: Image pull secrets
items:
type: string
type: array
repository:
description: NVIDIA GDRCopy driver image repository
type: string
version:
description: NVIDIA GDRCopy driver image tag
type: string
type: object
gds:
description: GPUDirectStorage defines the spec for GDS components(Experimental)
properties:
Expand Down
Loading

0 comments on commit 17a2236

Please sign in to comment.