
Commit

Avoid using hostPort for DCGM communication in favor of ClusterIP service with local internal traffic policy
shivamerla committed Apr 29, 2024
1 parent 015e29c commit 19bd3c0
Showing 11 changed files with 171 additions and 40 deletions.
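In short, the standalone DCGM host engine is no longer exposed through hostNetwork and a host port; the operator now creates a ClusterIP Service named nvidia-dcgm with internalTrafficPolicy: Local, and dcgm-exporter reaches it through the DCGM_REMOTE_HOSTENGINE_INFO environment variable. A minimal post-upgrade check is sketched below; the "gpu-operator" namespace is an assumption, substitute whatever namespace the operator runs in.

    # Confirm the new Service and its traffic policy (namespace is an assumption)
    kubectl get svc nvidia-dcgm -n gpu-operator \
      -o jsonpath='{.spec.type}{" "}{.spec.internalTrafficPolicy}{"\n"}'
    # expected output: ClusterIP Local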
2 changes: 1 addition & 1 deletion api/v1/clusterpolicy_types.go
@@ -980,7 +980,7 @@ type DCGMSpec struct {
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text"
Env []EnvVar `json:"env,omitempty"`

// HostPort represents host port that needs to be bound for DCGM engine (Default: 5555)
// Deprecated: HostPort represents host port that needs to be bound for DCGM engine (Default: 5555)
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Host port to bind for DCGM engine"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:number"
HostPort int32 `json:"hostPort,omitempty"`
2 changes: 0 additions & 2 deletions assets/state-dcgm/0400_dcgm.yml
@@ -35,7 +35,6 @@ spec:
- name: run-nvidia
mountPath: /run/nvidia
mountPropagation: HostToContainer
hostNetwork: true
containers:
- image: "FILLED BY THE OPERATOR"
name: nvidia-dcgm-ctr
@@ -44,7 +43,6 @@ spec:
ports:
- name: "dcgm"
containerPort: 5555
hostPort: 5555
volumes:
- name: run-nvidia
hostPath:
16 changes: 16 additions & 0 deletions assets/state-dcgm/0500_service.yaml
@@ -0,0 +1,16 @@
apiVersion: v1
kind: Service
metadata:
  labels:
    app: nvidia-dcgm
  name: nvidia-dcgm
  namespace: "FILLED BY THE OPERATOR"
spec:
  internalTrafficPolicy: Local
  ports:
  - name: dcgm
    port: 5555
    protocol: TCP
  selector:
    app: nvidia-dcgm
  type: ClusterIP
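internalTrafficPolicy: Local makes kube-proxy route traffic sent to this Service only to endpoints on the same node as the client. Since nvidia-dcgm runs as a DaemonSet, each dcgm-exporter pod keeps talking to the DCGM engine on its own node, just as it did with hostNetwork and a localhost host port, but now through cluster DNS. A rough reachability probe is sketched below; it assumes bash is available in the exporter image, so adjust to whatever tooling your pods actually carry.

    # Probe the node-local DCGM engine through the Service name
    # (assumptions: namespace "gpu-operator", bash present in the exporter image)
    kubectl exec -n gpu-operator ds/nvidia-dcgm-exporter -- \
      bash -c 'exec 3<>/dev/tcp/nvidia-dcgm/5555 && echo reachable'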
4 changes: 2 additions & 2 deletions bundle/manifests/nvidia.com_clusterpolicies.yaml
@@ -248,8 +248,8 @@ spec:
type: object
type: array
hostPort:
description: 'HostPort represents host port that needs to be bound
for DCGM engine (Default: 5555)'
description: 'Deprecated: HostPort represents host port that needs
to be bound for DCGM engine (Default: 5555)'
format: int32
type: integer
image:
4 changes: 2 additions & 2 deletions config/crd/bases/nvidia.com_clusterpolicies.yaml
@@ -248,8 +248,8 @@ spec:
type: object
type: array
hostPort:
description: 'HostPort represents host port that needs to be bound
for DCGM engine (Default: 5555)'
description: 'Deprecated: HostPort represents host port that needs
to be bound for DCGM engine (Default: 5555)'
format: int32
type: integer
image:
31 changes: 4 additions & 27 deletions controllers/object_controls.go
@@ -112,8 +112,8 @@ const (
MigDefaultGPUClientsConfigMapName = "default-gpu-clients"
// DCGMRemoteEngineEnvName indicates env name to specify remote DCGM host engine ip:port
DCGMRemoteEngineEnvName = "DCGM_REMOTE_HOSTENGINE_INFO"
// DCGMDefaultHostPort indicates default host port bound to DCGM host engine
DCGMDefaultHostPort = 5555
// DCGMDefaultPort indicates default port bound to DCGM host engine
DCGMDefaultPort = 5555
// GPUDirectRDMAEnabledEnvName indicates if GPU direct RDMA is enabled through GPU operator
GPUDirectRDMAEnabledEnvName = "GPU_DIRECT_RDMA_ENABLED"
// UseHostMOFEDEnvName indicates if MOFED driver is pre-installed on the host
@@ -1461,14 +1461,7 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe

// check if DCGM hostengine is enabled as a separate Pod and setup env accordingly
if config.DCGM.IsEnabled() {
// enable hostNetwork for communication with external DCGM using NODE_IP(localhost)
obj.Spec.Template.Spec.HostNetwork = true
// set DCGM host engine env. localhost will be substituted during pod runtime
dcgmHostPort := int32(DCGMDefaultHostPort)
if config.DCGM.HostPort != 0 {
dcgmHostPort = config.DCGM.HostPort
}
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DCGMRemoteEngineEnvName, fmt.Sprintf("localhost:%d", dcgmHostPort))
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DCGMRemoteEngineEnvName, fmt.Sprintf("nvidia-dcgm:%d", DCGMDefaultPort))
} else {
// case for DCGM running on the host itself(DGX BaseOS)
remoteEngine := getContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DCGMRemoteEngineEnvName)
@@ -1477,6 +1470,7 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
obj.Spec.Template.Spec.HostNetwork = true
}
}

// set RuntimeClass for supported runtimes
setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
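With the standalone engine enabled, the exporter is now pointed at the Service DNS name instead of a localhost:hostPort pair, and the DaemonSet no longer needs hostNetwork for this path. One way to confirm the rendered value after reconciliation, again assuming the operator runs in the "gpu-operator" namespace:

    kubectl get ds nvidia-dcgm-exporter -n gpu-operator \
      -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="DCGM_REMOTE_HOSTENGINE_INFO")].value}{"\n"}'
    # expected output: nvidia-dcgm:5555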

@@ -1594,18 +1588,6 @@ func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n Clu
}
}

// set host port to bind for DCGM engine
for i, port := range obj.Spec.Template.Spec.Containers[0].Ports {
if port.Name == "dcgm" {
obj.Spec.Template.Spec.Containers[0].Ports[i].HostPort = DCGMDefaultHostPort
if config.DCGM.HostPort != 0 {
obj.Spec.Template.Spec.Containers[0].Ports[i].HostPort = config.DCGM.HostPort
// We set the containerPort to the same value as the hostPort as hostNetwork is set to true
obj.Spec.Template.Spec.Containers[0].Ports[i].ContainerPort = config.DCGM.HostPort
}
}
}

// set RuntimeClass for supported runtimes
setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)

@@ -4326,11 +4308,6 @@ func SecurityContextConstraints(n ClusterPolicyController) (gpuv1.State, error)
obj.Users[idx] = fmt.Sprintf("system:serviceaccount:%s:%s", obj.Namespace, obj.Name)
}

// Allow hostNetwork only when a separate standalone DCGM engine is deployed for communication
if obj.Name == "nvidia-dcgm-exporter" && n.singleton.Spec.DCGM.IsEnabled() {
obj.AllowHostNetwork = true
}

if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
return gpuv1.NotReady, err
}
137 changes: 137 additions & 0 deletions controllers/object_controls_test.go
@@ -34,6 +34,7 @@ import (
nodev1 "k8s.io/api/node/v1"
rbacv1 "k8s.io/api/rbac/v1"
schedv1 "k8s.io/api/scheduling/v1beta1"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
@@ -55,6 +56,7 @@ const (
vGPUManagerAssetsPath = "assets/state-vgpu-manager/"
sandboxDevicePluginAssetsPath = "assets/state-sandbox-device-plugin"
devicePluginAssetsPath = "assets/state-device-plugin/"
dcgmExporterAssetsPath = "assets/state-dcgm-exporter/"
nfdNvidiaPCILabelKey = "feature.node.kubernetes.io/pci-10de.present"
upgradedKernel = "5.4.135-generic"
)
@@ -164,6 +166,9 @@ func setup() error {
if err := promv1.AddToScheme(s); err != nil {
return fmt.Errorf("unable to add promv1 schema: %v", err)
}
if err := apiextensionsv1.AddToScheme(s); err != nil {
return fmt.Errorf("unable to add apiextensionsv1 schema: %v", err)
}
if err := secv1.Install(s); err != nil {
return fmt.Errorf("unable to add secv1 schema: %v", err)
}
@@ -386,6 +391,24 @@ func testDaemonsetCommon(t *testing.T, cp *gpuv1.ClusterPolicy, component string
if err != nil {
return nil, fmt.Errorf("unable to get mainCtrImage for sandbox-device-plugin: %v", err)
}
case "DCGMExporter":
spec = commonDaemonsetSpec{
repository: cp.Spec.DCGMExporter.Repository,
image: cp.Spec.DCGMExporter.Image,
version: cp.Spec.DCGMExporter.Version,
imagePullPolicy: cp.Spec.DCGMExporter.ImagePullPolicy,
imagePullSecrets: getImagePullSecrets(cp.Spec.DCGMExporter.ImagePullSecrets),
args: cp.Spec.DCGMExporter.Args,
env: cp.Spec.DCGMExporter.Env,
resources: cp.Spec.DCGMExporter.Resources,
}
dsLabel = "nvidia-dcgm-exporter"
mainCtrName = "nvidia-dcgm-exporter"
manifestFile = filepath.Join(cfg.root, dcgmExporterAssetsPath)
mainCtrImage, err = gpuv1.ImagePath(&cp.Spec.DCGMExporter)
if err != nil {
return nil, fmt.Errorf("unable to get mainCtrImage for dcgm-exporter: %v", err)
}
default:
return nil, fmt.Errorf("invalid component for testDaemonsetCommon(): %s", component)
}
@@ -996,3 +1019,117 @@ func TestIsOpenKernelModulesRequired(t *testing.T) {
})
}
}

// getDCGMExporterTestInput returns a ClusterPolicy instance for a particular
// dcgm-exporter test case.
func getDCGMExporterTestInput(testCase string) *gpuv1.ClusterPolicy {
cp := clusterPolicy.DeepCopy()

// Set some default values
cp.Spec.DCGMExporter.Repository = "nvcr.io/nvidia/k8s"
cp.Spec.DCGMExporter.Image = "dcgm-exporter"
cp.Spec.DCGMExporter.Version = "3.3.0-3.2.0-ubuntu22.04"
cp.Spec.DCGMExporter.ImagePullSecrets = []string{"ngc-secret"}

cp.Spec.Validator.Repository = "nvcr.io/nvidia/cloud-native"
cp.Spec.Validator.Image = "gpu-operator-validator"
cp.Spec.Validator.Version = "v23.9.2"
cp.Spec.Validator.ImagePullSecrets = []string{"ngc-secret"}

switch testCase {
case "default":
// Do nothing
case "standalone-dcgm":
dcgmEnabled := true
cp.Spec.DCGM.Enabled = &dcgmEnabled
default:
return nil
}

return cp
}

// getDCGMExporterTestOutput returns a map containing expected output for
// a dcgm-exporter test case.
func getDCGMExporterTestOutput(testCase string) map[string]interface{} {
// default output
output := map[string]interface{}{
"numDaemonsets": 1,
"dcgmExporterImage": "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.0-3.2.0-ubuntu22.04",
"imagePullSecret": "ngc-secret",
}

switch testCase {
case "default":
output["env"] = map[string]string{}
case "standalone-dcgm":
output["env"] = map[string]string{
"DCGM_REMOTE_HOSTENGINE_INFO": "nvidia-dcgm:5555",
}
default:
return nil
}

return output
}

// TestDCGMExporter tests that the GPU Operator correctly deploys the dcgm-exporter daemonset
// under various scenarios/config options
func TestDCGMExporter(t *testing.T) {
testCases := []struct {
description string
clusterPolicy *gpuv1.ClusterPolicy
output map[string]interface{}
}{
{
"Default",
getDCGMExporterTestInput("default"),
getDCGMExporterTestOutput("default"),
},
{
"StandalongDCGM",
getDCGMExporterTestInput("standalone-dcgm"),
getDCGMExporterTestOutput("standalone-dcgm"),
},
}

for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
ds, err := testDaemonsetCommon(t, tc.clusterPolicy, "DCGMExporter", tc.output["numDaemonsets"].(int))
if err != nil {
t.Fatalf("error in testDaemonsetCommon(): %v", err)
}
if ds == nil {
return
}

dcgmExporterImage := ""
for _, container := range ds.Spec.Template.Spec.Containers {
if container.Name == "nvidia-dcgm-exporter" {
dcgmExporterImage = container.Image
break
}
}
for key, value := range tc.output["env"].(map[string]string) {
envFound := false
for _, envVar := range ds.Spec.Template.Spec.Containers[0].Env {
if envVar.Name == key && envVar.Value == value {
envFound = true
}
}
if !envFound {
t.Fatalf("Expected env is not set for daemonset nvidia-dcgm-exporter %s->%s", key, value)
}
}

require.Equal(t, tc.output["dcgmExporterImage"], dcgmExporterImage, "Unexpected configuration for dcgm-exporter image")

// cleanup by deleting all kubernetes objects
err = removeState(&clusterPolicyController, clusterPolicyController.idx-1)
if err != nil {
t.Fatalf("error removing state %v:", err)
}
clusterPolicyController.idx--
})
}
}
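The new test reuses the existing testDaemonsetCommon scaffolding and can be run in isolation; the invocation below assumes a local checkout of the repository and a working Go toolchain.

    go test ./controllers/ -run TestDCGMExporter -v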
@@ -248,8 +248,8 @@ spec:
type: object
type: array
hostPort:
description: 'HostPort represents host port that needs to be bound
for DCGM engine (Default: 5555)'
description: 'Deprecated: HostPort represents host port that needs
to be bound for DCGM engine (Default: 5555)'
format: int32
type: integer
image:
3 changes: 0 additions & 3 deletions deployments/gpu-operator/templates/clusterpolicy.yaml
@@ -482,9 +482,6 @@ spec:
{{- if .Values.dcgm.args }}
args: {{ toYaml .Values.dcgm.args | nindent 6 }}
{{- end }}
{{- if .Values.dcgm.hostPort }}
hostPort: {{ .Values.dcgm.hostPort }}
{{- end }}
dcgmExporter:
enabled: {{ .Values.dcgmExporter.enabled }}
{{- if .Values.dcgmExporter.repository }}
1 change: 0 additions & 1 deletion deployments/gpu-operator/values.yaml
@@ -290,7 +290,6 @@ dcgm:
image: dcgm
version: 3.3.5-1-ubuntu22.04
imagePullPolicy: IfNotPresent
hostPort: 5555
args: []
env: []
resources: {}
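Since the chart no longer renders dcgm.hostPort into the ClusterPolicy, any existing override of that value becomes a no-op after upgrading. The sketch below checks whether a release still carries the stale override; the release name and namespace "gpu-operator" are assumptions.

    helm get values gpu-operator -n gpu-operator | grep -n hostPort \
      || echo "no hostPort override set"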
7 changes: 7 additions & 0 deletions tests/scripts/update-clusterpolicy.sh
@@ -112,6 +112,13 @@ test_enable_dcgm() {
# Verify that standalone nvidia-dcgm and exporter pods are running successfully after update
check_pod_ready "nvidia-dcgm"
check_pod_ready "nvidia-dcgm-exporter"

# Test that the nvidia-dcgm service is created with internalTrafficPolicy set to "Local"
trafficPolicy=$(kubectl get service nvidia-dcgm -n $TEST_NAMESPACE -o json | jq -r '.spec.internalTrafficPolicy')
if [ "$trafficPolicy" != "Local" ]; then
echo "service nvidia-dcgm is missing or internal traffic policy is not set to local"
exit 1
fi
}

test_gpu_sharing() {
