
Commit

Avoid using hostPort for DCGM communication in favor of ClusterIP service with local internal traffic policy
shivamerla committed Apr 29, 2024
1 parent 015e29c commit 19bd3c0
Showing 11 changed files with 171 additions and 40 deletions.
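In short, the standalone DCGM host engine is no longer exposed through hostNetwork and a host port; the operator now creates a ClusterIP Service named nvidia-dcgm with internalTrafficPolicy: Local, and dcgm-exporter reaches it through the DCGM_REMOTE_HOSTENGINE_INFO environment variable. A minimal post-upgrade check is sketched below; the "gpu-operator" namespace is an assumption, substitute whatever namespace the operator runs in.

    # Confirm the new Service and its traffic policy (namespace is an assumption)
    kubectl get svc nvidia-dcgm -n gpu-operator \
      -o jsonpath='{.spec.type}{" "}{.spec.internalTrafficPolicy}{"\n"}'
    # expected output: ClusterIP Local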
2 changes: 1 addition & 1 deletion api/v1/clusterpolicy_types.go
@@ -980,7 +980,7 @@ type DCGMSpec struct {
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text"
Env []EnvVar `json:"env,omitempty"`

// HostPort represents host port that needs to be bound for DCGM engine (Default: 5555)
// Deprecated: HostPort represents host port that needs to be bound for DCGM engine (Default: 5555)
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Host port to bind for DCGM engine"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:number"
HostPort int32 `json:"hostPort,omitempty"`
2 changes: 0 additions & 2 deletions assets/state-dcgm/0400_dcgm.yml
@@ -35,7 +35,6 @@ spec:
- name: run-nvidia
mountPath: /run/nvidia
mountPropagation: HostToContainer
hostNetwork: true
containers:
- image: "FILLED BY THE OPERATOR"
name: nvidia-dcgm-ctr
@@ -44,7 +43,6 @@ spec:
ports:
- name: "dcgm"
containerPort: 5555
hostPort: 5555
volumes:
- name: run-nvidia
hostPath:
16 changes: 16 additions & 0 deletions assets/state-dcgm/0500_service.yaml
@@ -0,0 +1,16 @@
apiVersion: v1
kind: Service
metadata:
  labels:
    app: nvidia-dcgm
  name: nvidia-dcgm
  namespace: "FILLED BY THE OPERATOR"
spec:
  internalTrafficPolicy: Local
  ports:
  - name: dcgm
    port: 5555
    protocol: TCP
  selector:
    app: nvidia-dcgm
  type: ClusterIP
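internalTrafficPolicy: Local makes kube-proxy route traffic sent to this Service only to endpoints on the same node as the client. Since nvidia-dcgm runs as a DaemonSet, each dcgm-exporter pod keeps talking to the DCGM engine on its own node, just as it did with hostNetwork and a localhost host port, but now through cluster DNS. A rough reachability probe is sketched below; it assumes bash is available in the exporter image, so adjust to whatever tooling your pods actually carry.

    # Probe the node-local DCGM engine through the Service name
    # (assumptions: namespace "gpu-operator", bash present in the exporter image)
    kubectl exec -n gpu-operator ds/nvidia-dcgm-exporter -- \
      bash -c 'exec 3<>/dev/tcp/nvidia-dcgm/5555 && echo reachable'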
4 changes: 2 additions & 2 deletions bundle/manifests/nvidia.com_clusterpolicies.yaml
@@ -248,8 +248,8 @@ spec:
type: object
type: array
hostPort:
description: 'HostPort represents host port that needs to be bound
for DCGM engine (Default: 5555)'
description: 'Deprecated: HostPort represents host port that needs
to be bound for DCGM engine (Default: 5555)'
format: int32
type: integer
image:
4 changes: 2 additions & 2 deletions config/crd/bases/nvidia.com_clusterpolicies.yaml
@@ -248,8 +248,8 @@ spec:
type: object
type: array
hostPort:
description: 'HostPort represents host port that needs to be bound
for DCGM engine (Default: 5555)'
description: 'Deprecated: HostPort represents host port that needs
to be bound for DCGM engine (Default: 5555)'
format: int32
type: integer
image:
31 changes: 4 additions & 27 deletions controllers/object_controls.go
@@ -112,8 +112,8 @@ const (
MigDefaultGPUClientsConfigMapName = "default-gpu-clients"
// DCGMRemoteEngineEnvName indicates env name to specify remote DCGM host engine ip:port
DCGMRemoteEngineEnvName = "DCGM_REMOTE_HOSTENGINE_INFO"
// DCGMDefaultHostPort indicates default host port bound to DCGM host engine
DCGMDefaultHostPort = 5555
// DCGMDefaultPort indicates default port bound to DCGM host engine
DCGMDefaultPort = 5555
// GPUDirectRDMAEnabledEnvName indicates if GPU direct RDMA is enabled through GPU operator
GPUDirectRDMAEnabledEnvName = "GPU_DIRECT_RDMA_ENABLED"
// UseHostMOFEDEnvName indicates if MOFED driver is pre-installed on the host
@@ -1461,14 +1461,7 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe

// check if DCGM hostengine is enabled as a separate Pod and setup env accordingly
if config.DCGM.IsEnabled() {
// enable hostNetwork for communication with external DCGM using NODE_IP(localhost)
obj.Spec.Template.Spec.HostNetwork = true
// set DCGM host engine env. localhost will be substituted during pod runtime
dcgmHostPort := int32(DCGMDefaultHostPort)
if config.DCGM.HostPort != 0 {
dcgmHostPort = config.DCGM.HostPort
}
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DCGMRemoteEngineEnvName, fmt.Sprintf("localhost:%d", dcgmHostPort))
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DCGMRemoteEngineEnvName, fmt.Sprintf("nvidia-dcgm:%d", DCGMDefaultPort))
} else {
// case for DCGM running on the host itself(DGX BaseOS)
remoteEngine := getContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DCGMRemoteEngineEnvName)
@@ -1477,6 +1470,7 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
obj.Spec.Template.Spec.HostNetwork = true
}
}

// set RuntimeClass for supported runtimes
setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)
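With the standalone engine enabled, the exporter is now pointed at the Service DNS name instead of a localhost:hostPort pair, and the DaemonSet no longer needs hostNetwork for this path. One way to confirm the rendered value after reconciliation, again assuming the operator runs in the "gpu-operator" namespace:

    kubectl get ds nvidia-dcgm-exporter -n gpu-operator \
      -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="DCGM_REMOTE_HOSTENGINE_INFO")].value}{"\n"}'
    # expected output: nvidia-dcgm:5555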

@@ -1594,18 +1588,6 @@ func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n Clu
}
}

// set host port to bind for DCGM engine
for i, port := range obj.Spec.Template.Spec.Containers[0].Ports {
if port.Name == "dcgm" {
obj.Spec.Template.Spec.Containers[0].Ports[i].HostPort = DCGMDefaultHostPort
if config.DCGM.HostPort != 0 {
obj.Spec.Template.Spec.Containers[0].Ports[i].HostPort = config.DCGM.HostPort
// We set the containerPort to the same value as the hostPort as hostNetwork is set to true
obj.Spec.Template.Spec.Containers[0].Ports[i].ContainerPort = config.DCGM.HostPort
}
}
}

// set RuntimeClass for supported runtimes
setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)

@@ -4326,11 +4308,6 @@ func SecurityContextConstraints(n ClusterPolicyController) (gpuv1.State, error)
obj.Users[idx] = fmt.Sprintf("system:serviceaccount:%s:%s", obj.Namespace, obj.Name)
}

// Allow hostNetwork only when a separate standalone DCGM engine is deployed for communication
if obj.Name == "nvidia-dcgm-exporter" && n.singleton.Spec.DCGM.IsEnabled() {
obj.AllowHostNetwork = true
}

if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
return gpuv1.NotReady, err
}
137 changes: 137 additions & 0 deletions controllers/object_controls_test.go
@@ -34,6 +34,7 @@ import (
nodev1 "k8s.io/api/node/v1"
rbacv1 "k8s.io/api/rbac/v1"
schedv1 "k8s.io/api/scheduling/v1beta1"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
@@ -55,6 +56,7 @@ const (
vGPUManagerAssetsPath = "assets/state-vgpu-manager/"
sandboxDevicePluginAssetsPath = "assets/state-sandbox-device-plugin"
devicePluginAssetsPath = "assets/state-device-plugin/"
dcgmExporterAssetsPath = "assets/state-dcgm-exporter/"
nfdNvidiaPCILabelKey = "feature.node.kubernetes.io/pci-10de.present"
upgradedKernel = "5.4.135-generic"
)
@@ -164,6 +166,9 @@ func setup() error {
if err := promv1.AddToScheme(s); err != nil {
return fmt.Errorf("unable to add promv1 schema: %v", err)
}
if err := apiextensionsv1.AddToScheme(s); err != nil {
return fmt.Errorf("unable to add apiextensionsv1 schema: %v", err)
}
if err := secv1.Install(s); err != nil {
return fmt.Errorf("unable to add secv1 schema: %v", err)
}
@@ -386,6 +391,24 @@ func testDaemonsetCommon(t *testing.T, cp *gpuv1.ClusterPolicy, component string
if err != nil {
return nil, fmt.Errorf("unable to get mainCtrImage for sandbox-device-plugin: %v", err)
}
case "DCGMExporter":
spec = commonDaemonsetSpec{
repository: cp.Spec.DCGMExporter.Repository,
image: cp.Spec.DCGMExporter.Image,
version: cp.Spec.DCGMExporter.Version,
imagePullPolicy: cp.Spec.DCGMExporter.ImagePullPolicy,
imagePullSecrets: getImagePullSecrets(cp.Spec.DCGMExporter.ImagePullSecrets),
args: cp.Spec.DCGMExporter.Args,
env: cp.Spec.DCGMExporter.Env,
resources: cp.Spec.DCGMExporter.Resources,
}
dsLabel = "nvidia-dcgm-exporter"
mainCtrName = "nvidia-dcgm-exporter"
manifestFile = filepath.Join(cfg.root, dcgmExporterAssetsPath)
mainCtrImage, err = gpuv1.ImagePath(&cp.Spec.DCGMExporter)
if err != nil {
return nil, fmt.Errorf("unable to get mainCtrImage for dcgm-exporter: %v", err)
}
default:
return nil, fmt.Errorf("invalid component for testDaemonsetCommon(): %s", component)
}
@@ -996,3 +1019,117 @@ func TestIsOpenKernelModulesRequired(t *testing.T) {
})
}
}

// getDCGMExporterTestInput returns a ClusterPolicy instance for a particular
// dcgm-exporter test case.
func getDCGMExporterTestInput(testCase string) *gpuv1.ClusterPolicy {
cp := clusterPolicy.DeepCopy()

// Set some default values
cp.Spec.DCGMExporter.Repository = "nvcr.io/nvidia/k8s"
cp.Spec.DCGMExporter.Image = "dcgm-exporter"
cp.Spec.DCGMExporter.Version = "3.3.0-3.2.0-ubuntu22.04"
cp.Spec.DCGMExporter.ImagePullSecrets = []string{"ngc-secret"}

cp.Spec.Validator.Repository = "nvcr.io/nvidia/cloud-native"
cp.Spec.Validator.Image = "gpu-operator-validator"
cp.Spec.Validator.Version = "v23.9.2"
cp.Spec.Validator.ImagePullSecrets = []string{"ngc-secret"}

switch testCase {
case "default":
// Do nothing
case "standalone-dcgm":
dcgmEnabled := true
cp.Spec.DCGM.Enabled = &dcgmEnabled
default:
return nil
}

return cp
}

// getDCGMExporterTestOutput returns a map containing expected output for
// a dcgm-exporter test case.
func getDCGMExporterTestOutput(testCase string) map[string]interface{} {
// default output
output := map[string]interface{}{
"numDaemonsets": 1,
"dcgmExporterImage": "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.0-3.2.0-ubuntu22.04",
"imagePullSecret": "ngc-secret",
}

switch testCase {
case "default":
output["env"] = map[string]string{}
case "standalone-dcgm":
output["env"] = map[string]string{
"DCGM_REMOTE_HOSTENGINE_INFO": "nvidia-dcgm:5555",
}
default:
return nil
}

return output
}

// TestDCGMExporter tests that the GPU Operator correctly deploys the dcgm-exporter daemonset
// under various scenarios/config options
func TestDCGMExporter(t *testing.T) {
testCases := []struct {
description string
clusterPolicy *gpuv1.ClusterPolicy
output map[string]interface{}
}{
{
"Default",
getDCGMExporterTestInput("default"),
getDCGMExporterTestOutput("default"),
},
{
"StandalongDCGM",
getDCGMExporterTestInput("standalone-dcgm"),
getDCGMExporterTestOutput("standalone-dcgm"),
},
}

for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
ds, err := testDaemonsetCommon(t, tc.clusterPolicy, "DCGMExporter", tc.output["numDaemonsets"].(int))
if err != nil {
t.Fatalf("error in testDaemonsetCommon(): %v", err)
}
if ds == nil {
return
}

dcgmExporterImage := ""
for _, container := range ds.Spec.Template.Spec.Containers {
if container.Name == "nvidia-dcgm-exporter" {
dcgmExporterImage = container.Image
break
}
}
for key, value := range tc.output["env"].(map[string]string) {
envFound := false
for _, envVar := range ds.Spec.Template.Spec.Containers[0].Env {
if envVar.Name == key && envVar.Value == value {
envFound = true
}
}
if !envFound {
t.Fatalf("Expected env is not set for daemonset nvidia-dcgm-exporter %s->%s", key, value)
}
}

require.Equal(t, tc.output["dcgmExporterImage"], dcgmExporterImage, "Unexpected configuration for dcgm-exporter image")

// cleanup by deleting all kubernetes objects
err = removeState(&clusterPolicyController, clusterPolicyController.idx-1)
if err != nil {
t.Fatalf("error removing state %v:", err)
}
clusterPolicyController.idx--
})
}
}
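The new test reuses the existing testDaemonsetCommon scaffolding and can be run in isolation; the invocation below assumes a local checkout of the repository and a working Go toolchain.

    go test ./controllers/ -run TestDCGMExporter -v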
@@ -248,8 +248,8 @@ spec:
type: object
type: array
hostPort:
description: 'HostPort represents host port that needs to be bound
for DCGM engine (Default: 5555)'
description: 'Deprecated: HostPort represents host port that needs
to be bound for DCGM engine (Default: 5555)'
format: int32
type: integer
image:
3 changes: 0 additions & 3 deletions deployments/gpu-operator/templates/clusterpolicy.yaml
@@ -482,9 +482,6 @@ spec:
{{- if .Values.dcgm.args }}
args: {{ toYaml .Values.dcgm.args | nindent 6 }}
{{- end }}
{{- if .Values.dcgm.hostPort }}
hostPort: {{ .Values.dcgm.hostPort }}
{{- end }}
dcgmExporter:
enabled: {{ .Values.dcgmExporter.enabled }}
{{- if .Values.dcgmExporter.repository }}
1 change: 0 additions & 1 deletion deployments/gpu-operator/values.yaml
@@ -290,7 +290,6 @@ dcgm:
image: dcgm
version: 3.3.5-1-ubuntu22.04
imagePullPolicy: IfNotPresent
hostPort: 5555
args: []
env: []
resources: {}
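Since the chart no longer renders dcgm.hostPort into the ClusterPolicy, any existing override of that value becomes a no-op after upgrading. The sketch below checks whether a release still carries the stale override; the release name and namespace "gpu-operator" are assumptions.

    helm get values gpu-operator -n gpu-operator | grep -n hostPort \
      || echo "no hostPort override set"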
7 changes: 7 additions & 0 deletions tests/scripts/update-clusterpolicy.sh
@@ -112,6 +112,13 @@ test_enable_dcgm() {
# Verify that standalone nvidia-dcgm and exporter pods are running successfully after update
check_pod_ready "nvidia-dcgm"
check_pod_ready "nvidia-dcgm-exporter"

# Test that the nvidia-dcgm service is created with internalTrafficPolicy set to "Local"
trafficPolicy=$(kubectl get service nvidia-dcgm -n $TEST_NAMESPACE -o json | jq -r '.spec.internalTrafficPolicy')
if [ "$trafficPolicy" != "Local" ]; then
echo "service nvidia-dcgm is missing or internal traffic policy is not set to local"
exit 1
fi
}

test_gpu_sharing() {
