Commit cb3cd63: add redis and elasticsearch
qjpoo committed Nov 1, 2021 (parent e5b7802). 12 changed files with 2,573 additions and 97 deletions.

24-kube-prometheus-0.4.0/manifests/README.md
----------------------------------------------------------------------------------------------
...
3. alertmanager on NodePort 30093
kubectl patch svc alertmanager-main -n monitoring -p '{"spec":{"type":"NodePort","ports":[{"name":"web","port":9093,"protocol":"TCP","targetPort":"web","nodePort":30093}]}}'
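To confirm the patch took effect, a quick check (not part of the original steps):
k get svc -n monitoring
# alertmanager-main should now show TYPE NodePort with PORT(S) 9093:30093/TCP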
I'm using traefik here; see the installation steps in the earlier traefik chapter, so I won't repeat them.
1. grafana ingressroute
root@<master|192.168.1.23|~/demo/system/traefik/prod>:#cat g.yaml
apiVersion: traefik.containo.us/v1alpha1
kind: IngressRoute
metadata:
  ...
    services:
    - name: grafana
      port: 3000
Add a middleware to do basic auth.
Install the tool:
yum -y install httpd-tools
Generate a file named auth with user admin and password dg-mall.com:
htpasswd -bc auth admin dg-mall.com
First create a secret from the username and password:
kubectl create secret generic traefik-ui-auth --from-file=auth -n monitoring
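The auth file htpasswd writes is a single user:hash line, and the secret should carry it as one data key; a quick sanity check (the hash on your machine will differ):
cat auth                                          # admin:$apr1$...
kubectl get secret traefik-ui-auth -n monitoring  # DATA column should be 1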
The middleware yaml file:
cat middleware.yaml
apiVersion: traefik.containo.us/v1alpha1
kind: Middleware
metadata:
  name: traefik-exposed-dashboard-basic-auth
  namespace: monitoring
spec:
  basicAuth:
    secret: traefik-ui-auth
The prometheus ingressroute file:
cat p.dg.local.yaml
apiVersion: traefik.containo.us/v1alpha1
kind: IngressRoute
metadata:
  name: prometheus-route
  namespace: monitoring
spec:
  entryPoints:
  - web
  routes:
  - kind: Rule
    match: Host(`p.dg.local`)
    middlewares:
    - name: traefik-exposed-dashboard-basic-auth
      namespace: monitoring
    services:
    - name: prometheus-k8s
      port: 9090
The alertmanager ingressroute file:
cat a.dg.local.yaml
apiVersion: traefik.containo.us/v1alpha1
kind: IngressRoute
metadata:
  name: alertmanager-route
  namespace: monitoring
spec:
  entryPoints:
  - web
  routes:
  - match: Host(`a.dg.local`)
    kind: Rule
    middlewares:
    - name: traefik-exposed-dashboard-basic-auth
      namespace: monitoring
    services:
    - name: alertmanager-main
      port: 9093
The source files are all under 13-baisc-auth.
k create -f g.yaml
k create -f p.dg.local.yaml
k create -f a.dg.local.yaml
k get ingressroute -n monitoring
NAME                 AGE
alertmanager-route   17h
grafana-route        9d
prometheus-route     22h
root@<master|192.168.1.23|~/demo/system/kube-prometheus-0.4.0/manifests/bug>:#k get po
NAME                  READY   STATUS    RESTARTS   AGE
alertmanager-main-0   2/2     Running   0          22m
alertmanager-main-1   2/2     Running   0          22m
...
First create the additional-configs secret, then run the following two steps:
k create -f 00-blackbox-exporter-cm.yaml
k create -f prometheus-additional.yaml
Import the blackbox exporter grafana dashboard 9965.
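The probe job inside prometheus-additional.yaml is collapsed in this diff; a typical blackbox HTTP-probe job looks roughly like this sketch (module name, exporter address, and targets are assumptions):
- job_name: 'blackbox-http'
  metrics_path: /probe
  params:
    module: [http_2xx]                      # module defined in the blackbox exporter config
  static_configs:
  - targets:
    - http://example.com                    # URLs to probe
  relabel_configs:
  - source_labels: [__address__]
    target_label: __param_target
  - source_labels: [__param_target]
    target_label: instance
  - target_label: __address__
    replacement: blackbox-exporter:9115     # the exporter service, not the probed target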
14. endpoints auto-discovery: add this job to prometheus-additional.yaml (a sketch follows).
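The job itself is collapsed in this diff; a minimal sketch of annotation-driven endpoints discovery, assuming the conventional prometheus.io/* service annotations:
- job_name: 'kubernetes-endpoints'
  kubernetes_sd_configs:
  - role: endpoints
  relabel_configs:
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
    action: keep                            # only scrape services annotated prometheus.io/scrape: "true"
    regex: true
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
    action: replace
    target_label: __metrics_path__
    regex: (.+)
  - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
    action: replace
    regex: ([^:]+)(?::\d+)?;(\d+)           # swap in the annotated port
    replacement: $1:$2
    target_label: __address__
  - source_labels: [__meta_kubernetes_namespace]
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_service_name]
    target_label: kubernetes_name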
...
Apply the rules in that directory:
k create -f bug/12-warning/rules/
Tune the alert thresholds in these rules to the values you observe in your own workloads.
alertmanager.yaml.ok uses routes that match on different labels to send alerts to different platforms; a sketch follows.
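A minimal sketch of such label-based routing (receiver names and label values are assumptions, not the actual contents of alertmanager.yaml.ok):
route:
  receiver: 'default-email'        # fallback when nothing more specific matches
  group_by: ['alertname']
  routes:
  - match:
      severity: critical           # e.g. critical alerts to an IM webhook
    receiver: 'wechat'
  - match:
      team: dba                    # e.g. database alerts to the DBA mailbox
    receiver: 'dba-email'
# every receiver named above must be defined under receivers: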
17. Monitoring a single redis
Go into bug/13-outside-redis-single-exporter.
Edit the address in the file:
- addresses:
  - ip: 192.168.1.17
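For context, outside-redis.yaml presumably pairs a Service with manual Endpoints so Prometheus can scrape a redis exporter running outside the cluster; a minimal sketch, where the object names and the 9121 redis_exporter port are assumptions:
apiVersion: v1
kind: Service
metadata:
  name: outside-redis          # assumed name
  namespace: monitoring
  labels:
    k8s-app: outside-redis
spec:
  ports:
  - name: metrics
    port: 9121                 # default redis_exporter port
    targetPort: 9121
---
apiVersion: v1
kind: Endpoints
metadata:
  name: outside-redis          # must match the Service name
  namespace: monitoring
subsets:
- addresses:
  - ip: 192.168.1.17           # the external host from the step above
  ports:
  - name: metrics
    port: 9121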
k create -f outside-redis.yaml
Import the redis grafana dashboard: https://grafana.com/grafana/dashboards/763
18. Monitoring a redis cluster
Go into bug/14-outside-redis-cluster-exporter and edit the redis cluster address; a sketch of the scrape job follows.
k create -f prometheus-additional.yaml
In grafana, first import the 763 dashboard above, then import the redis-cluster集群.json template file from the directory.
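A common pattern for a cluster is one redis_exporter probing every node through its /scrape endpoint; a sketch of that job (the node addresses and the exporter address are assumptions):
- job_name: 'redis-cluster'
  static_configs:
  - targets:                             # one redis:// URI per cluster node
    - redis://192.168.1.17:7000
    - redis://192.168.1.17:7001
    - redis://192.168.1.17:7002
  metrics_path: /scrape
  relabel_configs:
  - source_labels: [__address__]
    target_label: __param_target
  - source_labels: [__param_target]
    target_label: instance
  - target_label: __address__
    replacement: 192.168.1.17:9121       # where redis_exporter itself listens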
19. node_exporter
First install and start node_exporter on each target node you want to monitor (see the sketch after this step).
Go into bug/11-outside-exporter and edit the IPs of the nodes to monitor.
k create -f .
Import the node_exporter grafana dashboard: 8919
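A minimal way to get node_exporter onto a plain Linux host (the version and paths here are assumptions; pick the current release for your platform):
# download, unpack and start; verify afterwards with: curl 127.0.0.1:9100/metrics
wget https://github.com/prometheus/node_exporter/releases/download/v1.2.2/node_exporter-1.2.2.linux-amd64.tar.gz
tar xf node_exporter-1.2.2.linux-amd64.tar.gz -C /usr/local/
cd /usr/local/node_exporter-1.2.2.linux-amd64
nohup ./node_exporter --web.listen-address=":9100" &>/dev/null &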
20. Monitoring an elasticsearch cluster
First download elasticsearch-exporter and run the following commands; in my setup a single node runs three elasticsearch instances:
nohup ./elasticsearch_exporter --web.listen-address ":9700" --es.uri http://192.168.1.15:9200 &>>/dev/null &
nohup ./elasticsearch_exporter --web.listen-address ":9701" --es.uri http://192.168.1.15:9201 &>>/dev/null &
nohup ./elasticsearch_exporter --web.listen-address ":9702" --es.uri http://192.168.1.15:9202 &>>/dev/null &
Check that it returns data; if it does, the exporter is working:
curl 127.0.0.1:9700/metrics
Go into the bug/15-outside-elasearch-exporter directory and run:
k delete secret additional-configs
kubectl create secret generic additional-configs --from-file=prometheus-additional.yaml -n monitoring
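The matching scrape job in prometheus-additional.yaml would look roughly like this (the job name is an assumption; the targets are the three exporters started above):
- job_name: 'elasticsearch'
  static_configs:
  - targets:
    - 192.168.1.15:9700
    - 192.168.1.15:9701
    - 192.168.1.15:9702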
Import the elasticsearch_exporter grafana dashboard 6483:
https://grafana.com/grafana/dashboards/6483
If elasticsearch requires a username and password:
/usr/local/elasticsearch_exporter/elasticsearch_exporter --web.listen-address ":9308" --es.uri=http://username:[email protected]:9200
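Processes started with nohup do not survive a reboot; a small systemd unit is the sturdier way to run the exporter (unit name and paths are assumptions):
# /etc/systemd/system/elasticsearch_exporter.service
[Unit]
Description=elasticsearch_exporter
After=network.target

[Service]
ExecStart=/usr/local/elasticsearch_exporter/elasticsearch_exporter --web.listen-address=":9700" --es.uri=http://192.168.1.15:9200
Restart=on-failure

[Install]
WantedBy=multi-user.target
# enable with: systemctl daemon-reload && systemctl enable --now elasticsearch_exporter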
----------------------------------------------------------------------------------------------
troubleshooting:
I'll try to recall as many of the problems I hit as I can, because there were a lot of pitfalls along the way.
----------------------------------------------------------------------------------------------
# Global configuration
global:
  resolve_timeout: 5m                       # how long to wait before declaring an alert resolved; default 5m
  smtp_smarthost: 'smtp.exmail.qq.com:587'  # 587 is the SMTP submission port with STARTTLS, hence require_tls: true
  smtp_from: '[email protected]'
  smtp_auth_username: '[email protected]'
  smtp_auth_password: 'OPcoYoqHGYuiqwd'
  smtp_require_tls: true

# Template files
templates:
----------------------------------------------------------------------------------------------
24-kube-prometheus-0.4.0/manifests/bug/12-warning/email.tmpl
...
Alert level: {{ .Labels.severity }}<br>
Alert type: {{ .Labels.alertname }}<br>
Alert summary: {{ .Annotations.summary }}<br>
Alert time: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }} <br>
</pre>
{{ end }}{{ end -}}
{{- if gt (len .Alerts.Resolved) 0 -}}{{ range .Alerts }}
...
Alert level: {{ .Labels.severity }}<br>
Alert type: {{ .Labels.alertname }}<br>
Recovery summary: {{ .Annotations.summary }}<br>
Alert time: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
Recovery time: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }} <br>
</pre>
{{ end }}{{ end -}}
{{- end }}
(The .Add 28800e9 shifts the UTC timestamps by 28800e9 nanoseconds, i.e. 8 hours, so the mail shows UTC+8 times.)
----------------------------------------------------------------------------------------------
...
rules:
- alert: NodeFilesystemUsage
  expr: |
    100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot"} / node_filesystem_size_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot"}) * 100 > 90
  for: 1m
  labels:
    severity: warning
  ...

# Based on disk growth over the last hour, predict whether the disk will fill within 4 hours
- alert: HostDiskWillFillIn4Hours
  #expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs"}[1h], 4 * 3600) < 0
  expr: predict_linear(node_filesystem_free_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot"}[1h], 4 * 3600) < 0
  for: 5m
  labels:
    severity: warning
  ...

- alert: NodeMemoryUsage
  expr: |
    #100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 90
    node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: "Instance {{ $labels.instance }} memory usage too high"
    description: "{{ $labels.instance }} memory usage is above 90% (current value: {{ $value }})"

- alert: HostCpuLoadAverage
  expr: sum(node_load5) by (instance) > 5
  for: 1m
  annotations:
    summary: "CPU load too high over the last 5 minutes"
    description: "Host {{ $labels.instance }}: 5-minute load average above 5 (current value: {{ $value }})"
  labels:
    severity: 'warning'

- alert: NodeCPUUsage
  expr: |
    100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 90
  ...
    message: '{{ $labels.node }} has been NotReady for more than 10 minutes.'

- alert: UnusualDiskReadRate
  expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 25
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Unusual disk read rate (instance {{ $labels.instance }})"
    description: "Disk is probably reading too much data (> 25 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

- alert: UnusualDiskWriteRate
  expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 20
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Unusual disk write rate (instance {{ $labels.instance }})"
    description: "Disk is probably writing too much data (> 20 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

- alert: UnusualDiskReadLatency
  expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.3 and rate(node_disk_reads_completed_total[1m]) > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Unusual disk read latency (instance {{ $labels.instance }})"
    description: "Disk latency is growing (read operations > 300ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"


- alert: UnusualDiskWriteLatency
  expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.3 and rate(node_disk_writes_completed_total[1m]) > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Unusual disk write latency (instance {{ $labels.instance }})"
    description: "Disk latency is growing (write operations > 300ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

- alert: DiskReadIOPS
  expr: rate(node_disk_reads_completed_total[3m]) > 750
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "disk read iops too high (instance {{ $labels.instance }})"
    description: "Disk read iops is growing (read iops operations > 750)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

- alert: DiskWriteIOPS
  expr: rate(node_disk_writes_completed_total[3m]) > 350
  ...
  annotations:
    summary: "disk write iops too high (instance {{ $labels.instance }})"
    description: "Disk write iops is growing (write iops operations > 350)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

- alert: HostIOWait
  expr: ((sum(increase(node_cpu_seconds_total{mode="iowait"}[5m])) by (instance))/(sum(increase(node_cpu_seconds_total[5m])) by (instance)))*100 > 10
  for: 1m
  annotations:
    summary: "Disk I/O wait too high"
    description: "Host {{ $labels.instance }}: iowait above 10% over the last 5 minutes (current value: {{ $value }})"
  labels:
    severity: 'warning'


- alert: HostOomKillDetected
  expr: increase(node_vmstat_oom_kill[5m]) > 0
  ...
  annotations:
    summary: Host OOM kill detected (instance {{ $labels.instance }})
    description: "OOM kill detected VALUE = {{ $value }} LABELS: {{ $labels }}"

- alert: HostNetworkConnection-ESTABLISHED
  expr: sum(node_netstat_Tcp_CurrEstab) by (instance) > 1500
  for: 5m
  labels:
    severity: 'warning'
  annotations:
    summary: "Host ESTABLISHED connection count too high"
    description: "Host {{ $labels.instance }}: ESTABLISHED connections above 1500 (current value: {{ $value }})"

- alert: HostNetworkConnection-TIME_WAIT
  expr: sum(node_sockstat_TCP_tw) by (instance) > 3000
  for: 5m
  labels:
    severity: 'warning'
  annotations:
    summary: "Host TIME_WAIT connection count too high"
    description: "Host {{ $labels.instance }}: TIME_WAIT connections above 3000 (current value: {{ $value }})"

- alert: HostOutOfInodes
  expr: node_filesystem_files_free{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot"} / node_filesystem_files{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot"} * 100 < 10
  for: 2m
  labels:
    severity: 'warning'
  annotations:
    summary: "Host partition running out of inodes"
    description: "Host {{ $labels.instance }}: partition {{ $labels.mountpoint }} is low on inodes (free: {{ $value }}%)"
----------------------------------------------------------------------------------------------
...
rules:
- alert: PodCPUUsage
  expr: |
    sum by(pod, namespace) (rate(container_cpu_usage_seconds_total{image!=""}[5m]) * 100) > 90
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "namespace: {{ $labels.namespace }} | pod: {{ $labels.pod }} CPU usage above 90% (current value: {{ $value }})"

- alert: PodMemoryUsage
  expr: |
    ...