Skip to content

Commit

Permalink
Add support for deploying GDRCopy driver in the driver daemonset
Browse files Browse the repository at this point in the history
  • Loading branch information
cdesiniotis committed Jan 23, 2024
1 parent 0917d1f commit 86ea0b5
Show file tree
Hide file tree
Showing 10 changed files with 419 additions and 4 deletions.
65 changes: 63 additions & 2 deletions api/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ type ClusterPolicySpec struct {
Validator ValidatorSpec `json:"validator,omitempty"`
// GPUDirectStorage defines the spec for GDS components(Experimental)
GPUDirectStorage *GPUDirectStorageSpec `json:"gds,omitempty"`
// GDRCopy component spec
GDRCopy *GDRCopySpec `json:"gdrcopy,omitempty"`
// SandboxWorkloads defines the spec for handling sandbox workloads (i.e. Virtual Machines)
SandboxWorkloads SandboxWorkloadsSpec `json:"sandboxWorkloads,omitempty"`
// VFIOManager for configuration to deploy VFIO-PCI Manager
Expand Down Expand Up @@ -1258,6 +1260,53 @@ type GPUDirectStorageSpec struct {
Env []EnvVar `json:"env,omitempty"`
}

// GDRCopySpec defines the properties for NVIDIA GDRCopy driver (gdrdrv) deployment
//
// It is the type of the optional ClusterPolicySpec.GDRCopy field, serialized
// under the JSON key "gdrcopy". ImagePath passes Repository, Image and Version
// to imagePath together with the name "GDRCOPY_IMAGE" (presumably an
// environment-variable fallback for the image -- confirm against imagePath).
//
// NOTE(review): the per-field comments below appear to be emitted verbatim as
// field descriptions in the CRD manifests, and the "+kubebuilder" /
// "+operator-sdk" lines are generator markers rather than prose; regenerate
// the manifests after editing any of them.
type GDRCopySpec struct {
	// Enabled indicates if GDRCopy is enabled through GPU Operator
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable GDRCopy through GPU operator"
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
	Enabled *bool `json:"enabled,omitempty"`

	// NVIDIA GDRCopy driver image repository
	// +kubebuilder:validation:Optional
	Repository string `json:"repository,omitempty"`

	// NVIDIA GDRCopy driver image name
	// +kubebuilder:validation:Pattern=[a-zA-Z0-9\-]+
	Image string `json:"image,omitempty"`

	// NVIDIA GDRCopy driver image tag
	// +kubebuilder:validation:Optional
	Version string `json:"version,omitempty"`

	// Image pull policy
	// +kubebuilder:validation:Optional
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Image Pull Policy"
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:imagePullPolicy"
	ImagePullPolicy string `json:"imagePullPolicy,omitempty"`

	// Image pull secrets
	// +kubebuilder:validation:Optional
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Image pull secrets"
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:io.kubernetes:Secret"
	ImagePullSecrets []string `json:"imagePullSecrets,omitempty"`

	// Optional: List of arguments
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Arguments"
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text"
	Args []string `json:"args,omitempty"`

	// Optional: List of environment variables
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Environment Variables"
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text"
	Env []EnvVar `json:"env,omitempty"`
}

// MIGPartedConfigSpec defines custom mig-parted config for NVIDIA MIG Manager container
type MIGPartedConfigSpec struct {
// ConfigMap name
Expand Down Expand Up @@ -1703,6 +1752,9 @@ func ImagePath(spec interface{}) (string, error) {
case *GPUDirectStorageSpec:
config := spec.(*GPUDirectStorageSpec)
return imagePath(config.Repository, config.Image, config.Version, "GDS_IMAGE")
case *GDRCopySpec:
config := spec.(*GDRCopySpec)
return imagePath(config.Repository, config.Image, config.Version, "GDRCOPY_IMAGE")
case *VFIOManagerSpec:
config := spec.(*VFIOManagerSpec)
return imagePath(config.Repository, config.Image, config.Version, "VFIO_MANAGER_IMAGE")
Expand Down Expand Up @@ -1891,7 +1943,7 @@ func (m *NodeStatusExporterSpec) IsEnabled() bool {
return *m.Enabled
}

// IsEnabled returns true if GPUDirect RDMA are enabled through gpu-perator
// IsEnabled returns true if GPUDirect RDMA is enabled through gpu-operator
func (g *GPUDirectRDMASpec) IsEnabled() bool {
if g.Enabled == nil {
// GPUDirectRDMA is disabled by default
Expand All @@ -1900,7 +1952,7 @@ func (g *GPUDirectRDMASpec) IsEnabled() bool {
return *g.Enabled
}

// IsEnabled returns true if GPUDirect Storage are enabled through gpu-perator
// IsEnabled returns true if GPUDirect Storage is enabled through gpu-operator
func (gds *GPUDirectStorageSpec) IsEnabled() bool {
if gds.Enabled == nil {
// GPUDirectStorage is disabled by default
Expand Down Expand Up @@ -1931,6 +1983,15 @@ func (gds *GPUDirectStorageSpec) IsOpenKernelModulesRequired() bool {
return false
}

// IsEnabled returns true if GDRCopy is enabled through gpu-operator
//
// GDRCopy is disabled by default, so false is returned when the Enabled field
// is unset. The method is also safe to call on a nil *GDRCopySpec (the
// ClusterPolicySpec.GDRCopy field is an optional pointer and is nil when the
// "gdrcopy" section is omitted); in that case it returns false instead of
// panicking on a nil dereference.
func (gdrcopy *GDRCopySpec) IsEnabled() bool {
	if gdrcopy == nil || gdrcopy.Enabled == nil {
		// GDRCopy is disabled by default
		return false
	}
	return *gdrcopy.Enabled
}

// IsEnabled returns true if DCGM hostengine as a separate Pod is enabled through gpu-operator
func (dcgm *DCGMSpec) IsEnabled() bool {
if dcgm.Enabled == nil {
Expand Down
40 changes: 40 additions & 0 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

36 changes: 36 additions & 0 deletions assets/state-driver/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,42 @@ spec:
failureThreshold: 1
successThreshold: 1
timeoutSeconds: 10
- image: "FILLED BY THE OPERATOR"
imagePullPolicy: IfNotPresent
name: nvidia-gdrcopy-ctr
command: [bash, -xc]
args: ["until [ -d /run/nvidia/driver/usr/src ] && lsmod | grep nvidia; do echo Waiting for nvidia-driver to be installed...; sleep 10; done; exec nvidia-gdrcopy-driver install"]
securityContext:
privileged: true
seLinuxOptions:
level: "s0"
volumeMounts:
- name: run-nvidia
mountPath: /run/nvidia
mountPropagation: HostToContainer
- name: var-log
mountPath: /var/log
- name: dev-log
mountPath: /dev/log
readOnly: true
startupProbe:
exec:
command:
[sh, -c, 'lsmod | grep gdrdrv']
initialDelaySeconds: 10
failureThreshold: 120
successThreshold: 1
periodSeconds: 10
timeoutSeconds: 10
livenessProbe:
exec:
command:
[sh, -c, 'lsmod | grep gdrdrv']
periodSeconds: 30
initialDelaySeconds: 30
failureThreshold: 1
successThreshold: 1
timeoutSeconds: 10
# Only kept when OpenShift DriverToolkit side-car is enabled.
- image: "FILLED BY THE OPERATOR"
imagePullPolicy: IfNotPresent
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,9 @@ metadata:
},
"gds": {
"enabled": false
},
"gdrcopy": {
"enabled": false
}
}
},
Expand Down
47 changes: 47 additions & 0 deletions bundle/manifests/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -960,6 +960,53 @@ spec:
type: string
type: object
type: object
gdrcopy:
description: GDRCopy component spec
properties:
args:
description: 'Optional: List of arguments'
items:
type: string
type: array
enabled:
description: Enabled indicates if GDRCopy is enabled through GPU
Operator
type: boolean
env:
description: 'Optional: List of environment variables'
items:
description: EnvVar represents an environment variable present
in a Container.
properties:
name:
description: Name of the environment variable.
type: string
value:
description: Value of the environment variable.
type: string
required:
- name
type: object
type: array
image:
description: NVIDIA GDRCopy driver image name
pattern: '[a-zA-Z0-9\-]+'
type: string
imagePullPolicy:
description: Image pull policy
type: string
imagePullSecrets:
description: Image pull secrets
items:
type: string
type: array
repository:
description: NVIDIA GDRCopy driver image repository
type: string
version:
description: NVIDIA GDRCopy driver image tag
type: string
type: object
gds:
description: GPUDirectStorage defines the spec for GDS components(Experimental)
properties:
Expand Down
47 changes: 47 additions & 0 deletions config/crd/bases/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -960,6 +960,53 @@ spec:
type: string
type: object
type: object
gdrcopy:
description: GDRCopy component spec
properties:
args:
description: 'Optional: List of arguments'
items:
type: string
type: array
enabled:
description: Enabled indicates if GDRCopy is enabled through GPU
Operator
type: boolean
env:
description: 'Optional: List of environment variables'
items:
description: EnvVar represents an environment variable present
in a Container.
properties:
name:
description: Name of the environment variable.
type: string
value:
description: Value of the environment variable.
type: string
required:
- name
type: object
type: array
image:
description: NVIDIA GDRCopy driver image name
pattern: '[a-zA-Z0-9\-]+'
type: string
imagePullPolicy:
description: Image pull policy
type: string
imagePullSecrets:
description: Image pull secrets
items:
type: string
type: array
repository:
description: NVIDIA GDRCopy driver image repository
type: string
version:
description: NVIDIA GDRCopy driver image tag
type: string
type: object
gds:
description: GPUDirectStorage defines the spec for GDS components(Experimental)
properties:
Expand Down
Loading

0 comments on commit 86ea0b5

Please sign in to comment.