Skip to content

Commit

Permalink
Add GDRCopy to NVIDIADriver API
Browse files Browse the repository at this point in the history
Signed-off-by: Christopher Desiniotis <[email protected]>
  • Loading branch information
cdesiniotis committed Jan 25, 2024
1 parent 17a2236 commit 4c08bb9
Show file tree
Hide file tree
Showing 13 changed files with 861 additions and 3 deletions.
84 changes: 84 additions & 0 deletions api/v1alpha1/nvidiadriver_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ type NVIDIADriverSpec struct {
// GPUDirectStorage defines the spec for GDS driver
GPUDirectStorage *GPUDirectStorageSpec `json:"gds,omitempty"`

// GDRCopy defines the spec for GDRCopy driver
GDRCopy *GDRCopySpec `json:"gdrcopy,omitempty"`

// NVIDIA Driver repository
// +kubebuilder:validation:Optional
Repository string `json:"repository,omitempty"`
Expand Down Expand Up @@ -323,6 +326,53 @@ type GPUDirectRDMASpec struct {
UseHostMOFED *bool `json:"useHostMofed,omitempty"`
}

// GDRCopySpec defines the properties for NVIDIA GDRCopy driver deployment.
// The Repository, Image, and Version fields are combined by GetImagePath to
// build the GDRCopy driver image reference.
type GDRCopySpec struct {
// Enabled indicates if GDRCopy is enabled through GPU operator
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable GDRCopy through GPU operator"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
Enabled *bool `json:"enabled,omitempty"`

// GDRCopy driver image repository
// +kubebuilder:validation:Optional
Repository string `json:"repository,omitempty"`

// GDRCopy driver image name
// +kubebuilder:validation:Pattern=[a-zA-Z0-9\-]+
Image string `json:"image,omitempty"`

// GDRCopy driver image tag
// +kubebuilder:validation:Optional
Version string `json:"version,omitempty"`

// Image pull policy
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Image Pull Policy"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:imagePullPolicy"
ImagePullPolicy string `json:"imagePullPolicy,omitempty"`

// Image pull secrets
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Image pull secrets"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:io.kubernetes:Secret"
ImagePullSecrets []string `json:"imagePullSecrets,omitempty"`

// Optional: List of arguments
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Arguments"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text"
Args []string `json:"args,omitempty"`

// Optional: List of environment variables
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Environment Variables"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text"
Env []EnvVar `json:"env,omitempty"`
}

// KernelModuleConfigSpec defines custom configuration parameters for the NVIDIA Driver
type KernelModuleConfigSpec struct {
// +kubebuilder:validation:Optional
Expand Down Expand Up @@ -515,6 +565,31 @@ func (d *GPUDirectStorageSpec) GetImagePath(osVersion string) (string, error) {
return image, nil
}

// GetImagePath builds the GDRCopy driver image path from the Repository,
// Image, and Version fields of the GDRCopySpec together with the osVersion
// argument.
//
// When the path produced from the spec does not contain a "sha256:" digest,
// '-<osVersion>' is appended so that the result has the form
// <repository>/<image>:<driver-ver>-<os-ver>. A path that contains a digest
// is returned unchanged.
//
// An error is returned when the image path cannot be derived from the spec,
// or when the final path is rejected by ref.New.
func (d *GDRCopySpec) GetImagePath(osVersion string) (string, error) {
	// The local is deliberately not called "image" so that it does not
	// shadow the image package used on the right-hand side.
	imagePath, err := image.ImagePath(d.Repository, d.Image, d.Version, "")
	if err != nil {
		return "", fmt.Errorf("failed to get image path from crd: %w", err)
	}

	// Digest references are used as-is; tag references get the
	// '-<osVersion>' suffix appended to the driver tag.
	if usesDigest := strings.Contains(imagePath, "sha256:"); !usesDigest {
		imagePath = imagePath + "-" + osVersion
	}

	// Make sure the assembled path parses as an image reference before
	// handing it back to the caller.
	if _, err := ref.New(imagePath); err != nil {
		return "", fmt.Errorf("failed to parse driver image path: %w", err)
	}

	return imagePath, nil
}

// GetPrecompiledImagePath returns the precompiled driver image path for a
// given os version and kernel version. Precompiled driver images follow
// the following format:
Expand Down Expand Up @@ -554,6 +629,15 @@ func (d *NVIDIADriverSpec) IsGDSEnabled() bool {
return *d.GPUDirectStorage.Enabled
}

// IsGDRCopyEnabled reports whether GDRCopy is enabled through gpu-operator.
// It returns false when the GDRCopy spec, or its Enabled flag, is not set.
func (d *NVIDIADriverSpec) IsGDRCopyEnabled() bool {
	if spec := d.GDRCopy; spec != nil && spec.Enabled != nil {
		return *spec.Enabled
	}
	// default is false if not specified by user
	return false
}

// IsOpenKernelModulesEnabled returns true if NVIDIA OpenRM drivers are enabled
func (d *NVIDIADriverSpec) IsOpenKernelModulesEnabled() bool {
if d.UseOpenKernelModules == nil || !*d.UseOpenKernelModules {
Expand Down
69 changes: 69 additions & 0 deletions api/v1alpha1/nvidiadriver_types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -333,3 +333,72 @@ func TestGDSGetImagePath(t *testing.T) {
})
}
}

// TestGDRCopyGetImagePath exercises GDRCopySpec.GetImagePath with a table of
// specs and checks, for each case, whether an error is returned and which
// image path is produced.
func TestGDRCopyGetImagePath(t *testing.T) {
	testCases := []struct {
		description   string
		spec          *GDRCopySpec
		osVersion     string
		errorExpected bool
		expectedImage string
	}{
		{
			description: "malformed repository",
			spec: &GDRCopySpec{
				Repository: "malformed?/repo",
			},
			errorExpected: true,
			expectedImage: "",
		},
		{
			description: "malformed image",
			spec: &GDRCopySpec{
				Image: "malformed?image",
			},
			errorExpected: true,
			expectedImage: "",
		},
		{
			description: "valid image",
			spec: &GDRCopySpec{
				Repository: "nvcr.io/nvidia/cloud-native",
				Image:      "gdrdrv",
				Version:    "v2.4.1",
			},
			osVersion:     "ubuntu20.04",
			errorExpected: false,
			expectedImage: "nvcr.io/nvidia/cloud-native/gdrdrv:v2.4.1-ubuntu20.04",
		},
		{
			description: "only image provided with no tag or digest",
			spec: &GDRCopySpec{
				Image: "nvcr.io/nvidia/cloud-native",
			},
			errorExpected: true,
			expectedImage: "",
		},
		{
			description: "repository, image, and version set; version is a digest",
			spec: &GDRCopySpec{
				Repository: "nvcr.io/nvidia/cloud-native",
				Image:      "gdrdrv",
				Version:    "sha256:" + testDigest,
			},
			osVersion:     "ubuntu22.04",
			errorExpected: false,
			expectedImage: "nvcr.io/nvidia/cloud-native/gdrdrv@sha256:10d1df8034373061366d4fb17b364b3b28d766b54d5a0b700c1a5a75378cf125",
		},
	}

	for _, tc := range testCases {
		t.Run(tc.description, func(t *testing.T) {
			imagePath, err := tc.spec.GetImagePath(tc.osVersion)
			if tc.errorExpected {
				require.Error(t, err)
			} else {
				require.NoError(t, err)
			}
			// require.Equal takes (t, expected, actual); passing them in
			// that order keeps the failure output labelled correctly.
			require.Equal(t, tc.expectedImage, imagePath)
		})
	}
}
40 changes: 40 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

47 changes: 47 additions & 0 deletions bundle/manifests/nvidia.com_nvidiadrivers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,53 @@ spec:
- name
type: object
type: array
gdrcopy:
description: GDRCopy defines the spec for GDRCopy driver
properties:
args:
description: 'Optional: List of arguments'
items:
type: string
type: array
enabled:
description: Enabled indicates if GDRCopy is enabled through GPU
operator
type: boolean
env:
description: 'Optional: List of environment variables'
items:
description: EnvVar represents an environment variable present
in a Container.
properties:
name:
description: Name of the environment variable.
type: string
value:
description: Value of the environment variable.
type: string
required:
- name
type: object
type: array
image:
description: GDRCopy driver image name
pattern: '[a-zA-Z0-9\-]+'
type: string
imagePullPolicy:
description: Image pull policy
type: string
imagePullSecrets:
description: Image pull secrets
items:
type: string
type: array
repository:
description: GDRCopy driver image repository
type: string
version:
description: GDRCopy driver image tag
type: string
type: object
gds:
description: GPUDirectStorage defines the spec for GDS driver
properties:
Expand Down
47 changes: 47 additions & 0 deletions config/crd/bases/nvidia.com_nvidiadrivers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,53 @@ spec:
- name
type: object
type: array
gdrcopy:
description: GDRCopy defines the spec for GDRCopy driver
properties:
args:
description: 'Optional: List of arguments'
items:
type: string
type: array
enabled:
description: Enabled indicates if GDRCopy is enabled through GPU
operator
type: boolean
env:
description: 'Optional: List of environment variables'
items:
description: EnvVar represents an environment variable present
in a Container.
properties:
name:
description: Name of the environment variable.
type: string
value:
description: Value of the environment variable.
type: string
required:
- name
type: object
type: array
image:
description: GDRCopy driver image name
pattern: '[a-zA-Z0-9\-]+'
type: string
imagePullPolicy:
description: Image pull policy
type: string
imagePullSecrets:
description: Image pull secrets
items:
type: string
type: array
repository:
description: GDRCopy driver image repository
type: string
version:
description: GDRCopy driver image tag
type: string
type: object
gds:
description: GPUDirectStorage defines the spec for GDS driver
properties:
Expand Down
4 changes: 2 additions & 2 deletions controllers/nvidiadriver_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,8 +146,8 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
return reconcile.Result{}, nil
}

if instance.Spec.UsePrecompiledDrivers() && instance.Spec.IsGDSEnabled() {
err = fmt.Errorf("GPUDirect Storage driver (nvidia-fs) is not supported along with pre-compiled NVIDIA drivers")
if instance.Spec.UsePrecompiledDrivers() && (instance.Spec.IsGDSEnabled() || instance.Spec.IsGDRCopyEnabled()) {
err = fmt.Errorf("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers")
logger.V(consts.LogLevelError).Error(nil, err.Error())
instance.Status.State = nvidiav1alpha1.NotReady
condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
Expand Down
Loading

0 comments on commit 4c08bb9

Please sign in to comment.