KUB-64 - Added support for k8s statefulsets (#158)

* Added support for k8s statefulsets Signed-off-by: nileshbhadana <[email protected]>
grofers · Jan 24, 2022 · b841055 · b841055
1 parent bbc4fab
commit b841055
Show file tree

Hide file tree

Showing 6 changed files with 188 additions and 7 deletions.
diff --git a/kubernetes/tests/basic_test.yaml b/kubernetes/tests/basic_test.yaml
@@ -23,4 +23,4 @@ spec:
     components:
       playframework:
         dimensions:
-          - service: sample-play-service
+          - service: sample-play-service
diff --git a/kubernetes/tests/metric_test.yaml b/kubernetes/tests/metric_test.yaml
@@ -20,8 +20,8 @@ spec:
       documentation: https://github.com/grofers/legend/tree/master/docs
       metrics_definition: https://github.com/grofers/legend
     tags:
-    - prod
-    - infra
+      - prod
+      - infra
     components:
 
     # Application frameworks
@@ -52,8 +52,8 @@ spec:
             - host: sample-ec2-host
           db:
             - masters:
-              - host: sample-mysql-host-master
-                db_name: sample-db
+                - host: sample-mysql-host-master
+                  db_name: sample-db
               slaves:
                 - host: sample-mysql-host-slave
                   db_name: sample-db
@@ -92,6 +92,9 @@ spec:
       platform_k8s_deployment:
         dimensions:
           - deployment_name: sample-deployment-name
+      platform_k8s_statefulset:
+        dimensions:
+          - statefulset_name: sample-statefulset-name
       platform_k8s_ingress:
         dimensions:
           - namespace: sample-namespace
@@ -132,7 +135,7 @@ spec:
           - job: sample-starlette-service
             path: sample-starlette-path
 
-    # Applications 
+    # Applications
       consul:
         dimensions:
           - region: sample-aws-region
@@ -158,4 +161,4 @@ spec:
               region: sample-region
             filters:
               - "err"
-              - "500"
+              - "500"
diff --git a/legend/metrics_library/metrics/platform_k8s_statefulset_metrics.j2 b/legend/metrics_library/metrics/platform_k8s_statefulset_metrics.j2
@@ -0,0 +1,158 @@
+# Panel definitions for the platform_k8s_statefulset component.
+# Panel titles carry a (U), (S) or (E) prefix: utilisation, saturation, errors.
+# Each targets list is rendered once per entry of the template variable data
+# and reads that entry's statefulset_name.
+# NOTE(review): the selectors are prefix matches on statefulset_name, so any
+# other workload whose pod names share the prefix is selected too - confirm
+# this is intended.
+component: platform_k8s_statefulset
+description: Kubernetes Statefulset
+data_source_type: Prometheus
+metrics_source: https://github.com/kubernetes/kube-state-metrics
+reference: https://blog.freshtracks.io/a-deep-dive-into-kubernetes-metrics-part-3-container-resource-metrics-361c5ee46e66
+panels:
+  # CPU usage rate (5m window) of each container as a percentage of its CPU
+  # limit, rounded to the nearest 0.1. Series whose container label is POD are
+  # excluded from the usage side.
+  - title: (U) CPU Utilisation
+    description: current cpu utilisation per container
+    type: Graph
+    formatY1: percent
+    targets:
+      {% for dimension in data %}
+      - metric: round(((sum(rate(container_cpu_usage_seconds_total{container!~"POD", pod=~"^{{ dimension.statefulset_name }}.*"}[5m])) by (container) / sum(kube_pod_container_resource_limits{pod=~"^{{ dimension.statefulset_name }}.*", resource="cpu", unit="core"}) by (container)) * 100), 0.1)
+        ref_no: 1
+        legend: '{{ '{{container}}' }}'
+      {% endfor %}
+    # Presumably alerts when the 5m average of the ref_no 1 target is above 80;
+    # confirm against the condition_query parser.
+    alert_config:
+      message: High CPU Utilisation
+      priority: P3
+      rule:
+        evaluate_every: 1m
+        for_duration: 5m
+      condition_query:
+        - OR,avg,1,now,5m,gt,80
+
+  # Per-container rate (5m window) of CFS-throttled CPU seconds, i.e. seconds
+  # of throttling accumulated per second. POD-labelled series are excluded.
+  - title: (S) CPU Saturation
+    description: Amount of time the container was throttled
+    type: Graph
+    formatY1: s
+    targets:
+      {% for dimension in data %}
+      - metric: sum(rate(container_cpu_cfs_throttled_seconds_total{container!~"POD", pod=~"^{{ dimension.statefulset_name }}.*"}[5m])) by (container)
+        legend: '{{ '{{container}}' }}'
+      {% endfor %}
+
+  # Working-set memory of each container as a percentage of its memory limit,
+  # rounded to the nearest 0.1. POD-labelled series are excluded from the
+  # usage side.
+  - title: (U) Memory Utilisation
+    description: Current memory usage per container
+    type: Graph
+    formatY1: percent
+    targets:
+      {% for dimension in data %}
+      - metric: round(((sum(container_memory_working_set_bytes{container!~"POD", pod=~"^{{ dimension.statefulset_name }}.*"}) by (container) / sum(kube_pod_container_resource_limits{pod=~"^{{ dimension.statefulset_name }}.*", resource="memory", unit="byte"}) by (container)) * 100), 0.1)
+        ref_no: 1
+        legend: '{{ '{{container}}' }}'
+      {% endfor %}
+    # Presumably alerts when the 5m average of the ref_no 1 target is above 80;
+    # confirm against the condition_query parser.
+    alert_config:
+      message: High Memory Utilisation
+      priority: P3
+      rule:
+        evaluate_every: 1m
+        for_duration: 5m
+      condition_query:
+        - OR,avg,1,now,5m,gt,80
+
+  # Memory headroom per container: summed memory limit minus summed working
+  # set, in bytes. The previous query divided usage by the limit, which gives
+  # a unitless ratio that contradicted both the description and the bytes axis
+  # and merely repeated the Memory Utilisation panel.
+  # NOTE(review): confirm that headroom in bytes (not a ratio) is what this
+  # panel is meant to show.
+  - title: (S) Memory Saturation
+    type: Graph
+    description: Amount of available memory from the limit
+    targets:
+      {% for dimension in data %}
+      - metric: (sum(kube_pod_container_resource_limits{pod=~"^{{ dimension.statefulset_name }}.*", resource="memory", unit="byte"}) by (container) - sum(container_memory_working_set_bytes{pod=~"^{{ dimension.statefulset_name }}.*"}) by (container))
+        legend: '{{ '{{container}}' }}'
+      {% endfor %}
+    formatY1: bytes
+
+  # Filesystem write and read throughput (5m rate) per container and device;
+  # writes and reads are emitted as two separate targets.
+  # NOTE(review): the values are per-second rates while formatY1 is bytes -
+  # confirm whether a rate unit is intended.
+  - title: (U) Disk Utilisation
+    description: bytes read/written
+    type: Graph
+    formatY1: bytes
+    targets:
+      {% for dimension in data %}
+      - metric: sum(rate(container_fs_writes_bytes_total{pod=~"^{{ dimension.statefulset_name }}.*"}[5m])) by (container,device)
+        legend: '{{ '{{container}} {{device}} Writes' }}'
+      - metric: sum(rate(container_fs_reads_bytes_total{pod=~"^{{ dimension.statefulset_name }}.*"}[5m])) by (container,device)
+        legend: '{{ '{{container}} {{device}} Reads' }}'
+      {% endfor %}
+
+  # Network receive and transmit throughput (5m rate) per pod and interface.
+  # The queries group by (pod, interface), so the legend names both labels;
+  # with the pod alone, a pod with several interfaces produced several series
+  # that could not be told apart.
+  - title: (U) Network Utilisation
+    type: Graph
+    description: bytes received/transmitted
+    targets:
+      {% for dimension in data %}
+      - metric: sum(rate(container_network_receive_bytes_total{pod=~"^{{ dimension.statefulset_name }}.*"}[5m])) by (pod, interface)
+        legend: '{{ '{{pod}} {{interface}} rx' }}'
+      - metric: sum(rate(container_network_transmit_bytes_total{pod=~"^{{ dimension.statefulset_name }}.*"}[5m])) by (pod, interface)
+        legend: '{{ '{{pod}} {{interface}} tx' }}'
+      {% endfor %}
+
+  # Rate (5m window) of network receive and transmit errors, summed per pod;
+  # rx and tx are emitted as two separate targets.
+  - title: (E) Network Errors
+    description: Number of network errors
+    type: Graph
+    targets:
+      {% for dimension in data %}
+      - metric: sum(rate(container_network_receive_errors_total{pod=~"^{{ dimension.statefulset_name }}.*"}[5m])) by (pod)
+        legend: '{{ '{{pod}} rx' }}'
+      - metric: sum(rate(container_network_transmit_errors_total{pod=~"^{{ dimension.statefulset_name }}.*"}[5m])) by (pod)
+        legend: '{{ '{{pod}} tx' }}'
+      {% endfor %}
+
+  # Share of a statefulset's replicas that are not ready:
+  # (replicas - ready replicas) / replicas * 100, rounded to a whole number.
+  - title: (E) Unavailable Replica Percentage
+    description: Percentage of replicas not available in the statefulset
+    type: Graph
+    formatY1: percent
+    targets:
+      {% for dimension in data %}
+      - metric: round((((kube_statefulset_replicas{statefulset=~"^{{ dimension.statefulset_name }}.*"}-kube_statefulset_status_replicas_ready{statefulset=~"^{{ dimension.statefulset_name }}.*"})/kube_statefulset_replicas{statefulset=~"^{{ dimension.statefulset_name }}.*"}) * 100), 1)
+        ref_no: 1
+        legend: '{{ '{{statefulset}}' }}'
+      {% endfor %}
+    # Presumably alerts when the 5m average of the ref_no 1 target is above 60;
+    # confirm against the condition_query parser.
+    alert_config:
+      message: High Unavailable Replica Percentage
+      priority: P2
+      rule:
+        evaluate_every: 1m
+        for_duration: 5m
+      condition_query:
+        - OR,avg,1,now,5m,gt,60
+
+  # Ready-replica count reported for each matching statefulset. The title says
+  # "Running", but the series plotted is the ready-replica gauge.
+  - title: (E) Running replicas
+    description: Running replicas
+    type: Graph
+    targets:
+      {% for dimension in data %}
+      - metric: kube_statefulset_status_replicas_ready{statefulset=~"^{{ dimension.statefulset_name }}.*"}
+        ref_no: 1
+        legend: '{{ '{{statefulset}}' }}'
+      {% endfor %}
+
+  # CPU usage rate (5m window) of each container as a percentage of its CPU
+  # request (rather than its limit), rounded to the nearest 0.1. POD-labelled
+  # series are excluded from the usage side.
+  - title: (U) CPU Utilisation(Request)
+    description: current cpu utilisation per container from the request
+    type: Graph
+    formatY1: percent
+    targets:
+      {% for dimension in data %}
+      - metric: round(((sum(rate(container_cpu_usage_seconds_total{container!~"POD", pod=~"^{{ dimension.statefulset_name }}.*"}[5m])) by (container) / sum(kube_pod_container_resource_requests{pod=~"^{{ dimension.statefulset_name }}.*", resource="cpu", unit="core"}) by (container)) * 100), 0.1)
+        ref_no: 1
+        legend: '{{ '{{container}}' }}'
+      {% endfor %}
+
+  # Memory headroom per container relative to its request: summed memory
+  # request minus summed working set, in bytes. A negative value means the
+  # container is using more than it requested. The previous query divided
+  # usage by the request, which gives a unitless ratio that contradicted both
+  # the description and the bytes axis.
+  # NOTE(review): confirm that headroom in bytes (not a ratio) is what this
+  # panel is meant to show.
+  - title: (S) Memory Saturation(Request)
+    type: Graph
+    description: Amount of available memory from the request
+    targets:
+      {% for dimension in data %}
+      - metric: (sum(kube_pod_container_resource_requests{pod=~"^{{ dimension.statefulset_name }}.*", resource="memory", unit="byte"}) by (container) - sum(container_memory_working_set_bytes{pod=~"^{{ dimension.statefulset_name }}.*"}) by (container))
+        legend: '{{ '{{container}}' }}'
+      {% endfor %}
+    formatY1: bytes
+
+  # Working-set memory of each container as a percentage of its memory
+  # request, rounded to the nearest 0.1. POD-labelled series are excluded from
+  # the usage side. The description now names the request as the baseline; it
+  # was previously identical to the limit-based Memory Utilisation panel's.
+  - title: (U) Memory Utilisation(Request)
+    type: Graph
+    description: current memory utilisation per container from the request
+    targets:
+      {% for dimension in data %}
+      - metric: round(((sum(container_memory_working_set_bytes{container!~"POD", pod=~"^{{ dimension.statefulset_name }}.*"}) by (container) / sum(kube_pod_container_resource_requests{pod=~"^{{ dimension.statefulset_name }}.*", resource="memory", unit="byte"}) by (container)) * 100), 0.1)
+        legend: '{{ '{{container}}' }}'
+        ref_no: 1
+      {% endfor %}
+    formatY1: percent
diff --git a/legend/metrics_library/metrics_schema.py b/legend/metrics_library/metrics_schema.py
@@ -73,6 +73,17 @@
     },
 }
 
+platform_k8s_statefulset_schema = {
+    "data_source": {"type": "string", "required": False},
+    "dimensions": {
+        "type": "list",
+        "schema": {
+            "type": "dict",
+            "schema": {"statefulset_name": {"type": "string", "required": True},},
+        },
+    },
+}
+
 platform_k8s_cronjob_schema = {
     "data_source": {"type": "string", "required": False},
     "dimensions": {

diff --git a/legend/metrics_library/schema.py b/legend/metrics_library/schema.py
@@ -20,6 +20,7 @@
     promtail_schema,
     celery_schema,
     platform_k8s_deployment_schema,
+    platform_k8s_statefulset_schema,
     platform_k8s_ingress_schema,
     redis_schema,
     redis_elasticache_schema,
@@ -208,6 +209,11 @@ def md(x, y):
                 "schema": md(default_panels_schema, platform_k8s_deployment_schema),
                 "required": False,
             },
+            "platform_k8s_statefulset": {
+                "type": "dict",
+                "schema": md(default_panels_schema, platform_k8s_statefulset_schema),
+                "required": False,
+            },
             "platform_k8s_hpa": {
                 "type": "dict",
                 "schema": md(default_panels_schema, platform_k8s_hpa_schema),

diff --git a/sample_input.yaml b/sample_input.yaml
@@ -99,6 +99,9 @@ components:
   platform_k8s_deployment:
     dimensions:
       - deployment_name: sample-deployment-name
+  platform_k8s_statefulset:
+    dimensions:
+      - statefulset_name: sample-statefulset-name
   platform_k8s_ingress:
     dimensions:
       - namespace: sample-namespace