Skip to content

Commit

Permalink
FC support: Allow per node plugin configuration and provide support f…
Browse files Browse the repository at this point in the history
…or CM creation during install.
  • Loading branch information
shivamerla committed Oct 28, 2022
1 parent 1efa960 commit 309014d
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 20 deletions.
18 changes: 7 additions & 11 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -632,7 +632,7 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol
setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)

// update env required for MIG support
applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy, true)
applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy)

return nil
}
Expand Down Expand Up @@ -1028,7 +1028,7 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass)

// update env required for MIG support
applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy, false)
applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy)

return nil
}
Expand Down Expand Up @@ -1760,18 +1760,14 @@ func setRuntimeClass(podSpec *corev1.PodSpec, runtime gpuv1.Runtime, runtimeClas
}

// applies MIG related configuration env to container spec
func applyMIGConfiguration(c *corev1.Container, strategy gpuv1.MIGStrategy, isGFD bool) {
// if not set then default to "none" strategy
func applyMIGConfiguration(c *corev1.Container, strategy gpuv1.MIGStrategy) {
// if not set then let plugin decide this per node(default: none)
if strategy == "" {
strategy = gpuv1.MIGStrategyNone
setContainerEnv(c, "NVIDIA_MIG_MONITOR_DEVICES", "all")
return
}

if isGFD {
// this is temporary until we align env name for GFD with device-plugin
setContainerEnv(c, "GFD_MIG_STRATEGY", string(strategy))
} else {
setContainerEnv(c, "MIG_STRATEGY", string(strategy))
}
setContainerEnv(c, "MIG_STRATEGY", string(strategy))
if strategy != gpuv1.MIGStrategyNone {
setContainerEnv(c, "NVIDIA_MIG_MONITOR_DEVICES", "all")
}
Expand Down
11 changes: 7 additions & 4 deletions deployments/gpu-operator/templates/clusterpolicy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,6 @@ spec:
{{- if .Values.operator.defaultGPUMode }}
defaultGPUMode: {{ .Values.operator.defaultGPUMode }}
{{- end }}
{{- if .Values.operator.migStrategy }}
migStrategy: {{ .Values.operator.migStrategy }}
{{- end }}
{{- if .Values.operator.initContainer }}
initContainer:
{{- if .Values.operator.initContainer.repository }}
Expand Down Expand Up @@ -353,7 +350,13 @@ spec:
args: {{ toYaml .Values.devicePlugin.args | nindent 6 }}
{{- end }}
{{- if .Values.devicePlugin.config }}
config: {{ toYaml .Values.devicePlugin.config | nindent 6 }}
config:
{{- if .Values.devicePlugin.config.name }}
name: {{ .Values.devicePlugin.config.name }}
{{- end }}
{{- if .Values.devicePlugin.config.default }}
default: {{ .Values.devicePlugin.config.default }}
{{- end }}
{{- end }}
dcgm:
enabled: {{ .Values.dcgm.enabled }}
Expand Down
9 changes: 9 additions & 0 deletions deployments/gpu-operator/templates/plugin_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{{- /*
Device-plugin configuration ConfigMap.

Rendered only when devicePlugin.config.create is true and
devicePlugin.config.data is non-empty. Each key under "data" becomes one
entry in the ConfigMap.

devicePlugin.config.name is mandatory in that case: fail rendering with a
clear message instead of emitting a ConfigMap with an empty metadata.name.
*/ -}}
{{- if and (.Values.devicePlugin.config.create) (not (empty .Values.devicePlugin.config.data)) }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ required "devicePlugin.config.name must be set when devicePlugin.config.create is true" .Values.devicePlugin.config.name }}
  namespace: {{ .Release.Namespace }}
data: {{ toYaml .Values.devicePlugin.config.data | nindent 2 }}
{{- end }}

25 changes: 24 additions & 1 deletion deployments/gpu-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -179,11 +179,34 @@ devicePlugin:
value: all
resources: {}
# Plugin configuration
# Use "name" to either point to an existing ConfigMap or to create a new one with a list of configurations (i.e. with create=true).
# Use "data" to build an integrated ConfigMap from a set of configurations as
# part of this helm chart. An example of setting "data" might be:
# config:
# name: device-plugin-config
# create: true
# data:
# default: |-
# version: v1
# flags:
# migStrategy: none
# mig-single: |-
# version: v1
# flags:
# migStrategy: single
# mig-mixed: |-
# version: v1
# flags:
# migStrategy: mixed
config:
# ConfigMap name
# Create a ConfigMap (default: false)
create: false
# ConfigMap name (either an existing ConfigMap, or the one to create when create=true above)
name: ""
# Default config name within the ConfigMap
default: ""
# Data section for the ConfigMap to create (i.e. only applies when create=true)
data: {}

# standalone dcgm hostengine
dcgm:
Expand Down
6 changes: 3 additions & 3 deletions tests/scripts/checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@ check_pod_ready() {
kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE}

echo "Checking $pod_label pod readiness"
is_pod_ready=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -ojsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}')
is_pod_ready=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -ojsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' 2>/dev/null || echo "terminated")

if [ "${is_pod_ready}" = "True" ]; then
# Check if the pod is not in terminating state
is_pod_terminating=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -o jsonpath='{.items[0].metadata.deletionGracePeriodSeconds}')
if [ "${is_pod_terminating}" = "30" ]; then
is_pod_terminating=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -o jsonpath='{.items[0].metadata.deletionGracePeriodSeconds}' 2>/dev/null || echo "terminated")
if [ "${is_pod_terminating}" != "" ]; then
echo "pod $pod_label is in terminating state..."
else
echo "Pod $pod_label is ready"
Expand Down
2 changes: 1 addition & 1 deletion tests/scripts/update-clusterpolicy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ test_mig_strategy_updates() {
sleep 10

# Validate that MIG strategy value is applied to both GFD and Device-Plugin Daemonsets
kubectl get daemonsets -lapp=gpu-feature-discovery -n $TEST_NAMESPACE -o=jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.template.spec.containers[*].env[?(@.name=="GFD_MIG_STRATEGY")]}{"\n"}{end}' | grep GFD_MIG_STRATEGY.*$MIG_STRATEGY
kubectl get daemonsets -lapp=gpu-feature-discovery -n $TEST_NAMESPACE -o=jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.template.spec.containers[*].env[?(@.name=="MIG_STRATEGY")]}{"\n"}{end}' | grep MIG_STRATEGY.*$MIG_STRATEGY
if [ "$?" -ne 0 ]; then
echo "cannot update MIG strategy to value $MIG_STRATEGY with GFD Daemonset"
exit 1
Expand Down

0 comments on commit 309014d

Please sign in to comment.