From 5d4e11ff437fffd3cb91c893b829bb3c3738beb0 Mon Sep 17 00:00:00 2001
From: guptaNswati
Date: Wed, 11 Mar 2020 18:28:40 -0700
Subject: [PATCH] Configure Prometheus to access DCGM metrics

---
Reviewer notes (text in this region is not part of the commit message):
  * 0400_prom_role / 0500_prom_rolebinding: a Role named prometheus-k8s in
    gpu-operator-resources allows get/list/watch on services, endpoints and
    pods, and is bound to the prometheus-k8s ServiceAccount that lives in
    the openshift-monitoring namespace.
  * 0700_service_monitor: a ServiceMonitor selects objects labelled
    app=nvidia-dcgm-exporter in gpu-operator-resources and names the
    endpoint port gpu-metrics.
  * The service, configmap and daemonset assets are renamed only (100%
    similarity); their numeric prefixes move to 0600/0800/0900.
  * 0010_namespace: the gpu-operator-resources Namespace gains the label
    openshift.io/cluster-monitoring: "true".

 assets/state-monitoring/0400_prom_role.yaml      | 16 ++++++++++++++++
 .../state-monitoring/0500_prom_rolebinding.yaml  | 13 +++++++++++++
 .../{0400_service.yaml => 0600_service.yaml}     |  0
 .../state-monitoring/0700_service_monitor.yaml   | 15 +++++++++++++++
 .../{0500_configmap.yaml => 0800_configmap.yaml} |  0
 .../{0600_daemonset.yaml => 0900_daemonset.yaml} |  0
 .../gpu-operator/templates/0010_namespace.yaml   |  2 ++
 7 files changed, 46 insertions(+)
 create mode 100644 assets/state-monitoring/0400_prom_role.yaml
 create mode 100644 assets/state-monitoring/0500_prom_rolebinding.yaml
 rename assets/state-monitoring/{0400_service.yaml => 0600_service.yaml} (100%)
 create mode 100644 assets/state-monitoring/0700_service_monitor.yaml
 rename assets/state-monitoring/{0500_configmap.yaml => 0800_configmap.yaml} (100%)
 rename assets/state-monitoring/{0600_daemonset.yaml => 0900_daemonset.yaml} (100%)

diff --git a/assets/state-monitoring/0400_prom_role.yaml b/assets/state-monitoring/0400_prom_role.yaml
new file mode 100644
index 000000000..5988aadcf
--- /dev/null
+++ b/assets/state-monitoring/0400_prom_role.yaml
@@ -0,0 +1,16 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: prometheus-k8s
+  namespace: gpu-operator-resources
+rules:
+- apiGroups:
+  - ""
+  resources:
+  - services
+  - endpoints
+  - pods
+  verbs:
+  - get
+  - list
+  - watch
diff --git a/assets/state-monitoring/0500_prom_rolebinding.yaml b/assets/state-monitoring/0500_prom_rolebinding.yaml
new file mode 100644
index 000000000..7b2467780
--- /dev/null
+++ b/assets/state-monitoring/0500_prom_rolebinding.yaml
@@ -0,0 +1,13 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: prometheus-k8s
+  namespace: gpu-operator-resources
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: prometheus-k8s
+subjects:
+- kind: ServiceAccount
+  name: prometheus-k8s
+  namespace: openshift-monitoring
diff --git a/assets/state-monitoring/0400_service.yaml b/assets/state-monitoring/0600_service.yaml
similarity index 100%
rename from assets/state-monitoring/0400_service.yaml
rename to assets/state-monitoring/0600_service.yaml
diff --git a/assets/state-monitoring/0700_service_monitor.yaml b/assets/state-monitoring/0700_service_monitor.yaml
new file mode 100644
index 000000000..13a1858ac
--- /dev/null
+++ b/assets/state-monitoring/0700_service_monitor.yaml
@@ -0,0 +1,15 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: nvidia-dcgm-exporter
+  namespace: gpu-operator-resources
+spec:
+  endpoints:
+  - port: gpu-metrics
+  jobLabel: app
+  namespaceSelector:
+    matchNames:
+    - gpu-operator-resources
+  selector:
+    matchLabels:
+      app: nvidia-dcgm-exporter
diff --git a/assets/state-monitoring/0500_configmap.yaml b/assets/state-monitoring/0800_configmap.yaml
similarity index 100%
rename from assets/state-monitoring/0500_configmap.yaml
rename to assets/state-monitoring/0800_configmap.yaml
diff --git a/assets/state-monitoring/0600_daemonset.yaml b/assets/state-monitoring/0900_daemonset.yaml
similarity index 100%
rename from assets/state-monitoring/0600_daemonset.yaml
rename to assets/state-monitoring/0900_daemonset.yaml
diff --git a/deployments/gpu-operator/templates/0010_namespace.yaml b/deployments/gpu-operator/templates/0010_namespace.yaml
index 8e65a7d28..ac2a14be9 100644
--- a/deployments/gpu-operator/templates/0010_namespace.yaml
+++ b/deployments/gpu-operator/templates/0010_namespace.yaml
@@ -2,3 +2,5 @@ apiVersion: v1
 kind: Namespace
 metadata:
   name: gpu-operator-resources
+  labels:
+    openshift.io/cluster-monitoring: "true"