Commit cb3cd63: add redis and elasticsearch
qjpoo committed Nov 1, 2021 (parent e5b7802). 12 changed files with 2,573 additions and 97 deletions.

24-kube-prometheus-0.4.0/manifests/README.md
----------------------------------------------------------------------------------------------
...
3. alertmanager on NodePort 30093
kubectl patch svc alertmanager-main -n monitoring -p '{"spec":{"type":"NodePort","ports":[{"name":"web","port":9093,"protocol":"TCP","targetPort":"web","nodePort":30093}]}}'
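To confirm the patch took effect, a quick check (not part of the original steps):
k get svc -n monitoring
# alertmanager-main should now show TYPE NodePort with PORT(S) 9093:30093/TCP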
I'm using traefik here; see the installation steps in the earlier traefik chapter, so I won't repeat them.
1. grafana ingressroute
root@<master|192.168.1.23|~/demo/system/traefik/prod>:#cat g.yaml
apiVersion: traefik.containo.us/v1alpha1
kind: IngressRoute
metadata:
  ...
    services:
    - name: grafana
      port: 3000
Add a middleware to do basic auth.
Install the tool:
yum -y install httpd-tools
Generate a file named auth with user admin and password dg-mall.com:
htpasswd -bc auth admin dg-mall.com
First create a secret from the username and password:
kubectl create secret generic traefik-ui-auth --from-file=auth -n monitoring
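The auth file htpasswd writes is a single user:hash line, and the secret should carry it as one data key; a quick sanity check (the hash on your machine will differ):
cat auth                                          # admin:$apr1$...
kubectl get secret traefik-ui-auth -n monitoring  # DATA column should be 1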
The middleware yaml file:
cat middleware.yaml
apiVersion: traefik.containo.us/v1alpha1
kind: Middleware
metadata:
  name: traefik-exposed-dashboard-basic-auth
  namespace: monitoring
spec:
  basicAuth:
    secret: traefik-ui-auth
The prometheus ingressroute file:
cat p.dg.local.yaml
apiVersion: traefik.containo.us/v1alpha1
kind: IngressRoute
metadata:
  name: prometheus-route
  namespace: monitoring
spec:
  entryPoints:
  - web
  routes:
  - kind: Rule
    match: Host(`p.dg.local`)
    middlewares:
    - name: traefik-exposed-dashboard-basic-auth
      namespace: monitoring
    services:
    - name: prometheus-k8s
      port: 9090
The alertmanager ingressroute file:
cat a.dg.local.yaml
apiVersion: traefik.containo.us/v1alpha1
kind: IngressRoute
metadata:
  name: alertmanager-route
  namespace: monitoring
spec:
  entryPoints:
  - web
  routes:
  - match: Host(`a.dg.local`)
    kind: Rule
    middlewares:
    - name: traefik-exposed-dashboard-basic-auth
      namespace: monitoring
    services:
    - name: alertmanager-main
      port: 9093
The source files are all under 13-baisc-auth.
k create -f g.yaml
k create -f p.dg.local.yaml
k create -f a.dg.local.yaml
k get ingressroute -n monitoring
NAME                 AGE
alertmanager-route   17h
grafana-route        9d
prometheus-route     22h
root@<master|192.168.1.23|~/demo/system/kube-prometheus-0.4.0/manifests/bug>:#k get po
NAME                  READY   STATUS    RESTARTS   AGE
alertmanager-main-0   2/2     Running   0          22m
alertmanager-main-1   2/2     Running   0          22m
...
First create the additional-configs secret, then run the following two steps:
k create -f 00-blackbox-exporter-cm.yaml
k create -f prometheus-additional.yaml
Import the blackbox exporter grafana dashboard 9965.
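The probe job inside prometheus-additional.yaml is collapsed in this diff; a typical blackbox HTTP-probe job looks roughly like this sketch (module name, exporter address, and targets are assumptions):
- job_name: 'blackbox-http'
  metrics_path: /probe
  params:
    module: [http_2xx]                      # module defined in the blackbox exporter config
  static_configs:
  - targets:
    - http://example.com                    # URLs to probe
  relabel_configs:
  - source_labels: [__address__]
    target_label: __param_target
  - source_labels: [__param_target]
    target_label: instance
  - target_label: __address__
    replacement: blackbox-exporter:9115     # the exporter service, not the probed target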
14. endpoints auto-discovery: add this job to prometheus-additional.yaml (a sketch follows).
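The job itself is collapsed in this diff; a minimal sketch of annotation-driven endpoints discovery, assuming the conventional prometheus.io/* service annotations:
- job_name: 'kubernetes-endpoints'
  kubernetes_sd_configs:
  - role: endpoints
  relabel_configs:
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
    action: keep                            # only scrape services annotated prometheus.io/scrape: "true"
    regex: true
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
    action: replace
    target_label: __metrics_path__
    regex: (.+)
  - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
    action: replace
    regex: ([^:]+)(?::\d+)?;(\d+)           # swap in the annotated port
    replacement: $1:$2
    target_label: __address__
  - source_labels: [__meta_kubernetes_namespace]
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_service_name]
    target_label: kubernetes_name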
...
Apply the rules in that directory:
k create -f bug/12-warning/rules/
Tune the alert thresholds in these rules to the values you observe in your own workloads.
alertmanager.yaml.ok uses routes that match on different labels to send alerts to different platforms; a sketch follows.
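A minimal sketch of such label-based routing (receiver names and label values are assumptions, not the actual contents of alertmanager.yaml.ok):
route:
  receiver: 'default-email'        # fallback when nothing more specific matches
  group_by: ['alertname']
  routes:
  - match:
      severity: critical           # e.g. critical alerts to an IM webhook
    receiver: 'wechat'
  - match:
      team: dba                    # e.g. database alerts to the DBA mailbox
    receiver: 'dba-email'
# every receiver named above must be defined under receivers: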
17. Monitoring a single redis
Go into bug/13-outside-redis-single-exporter.
Edit the address in the file:
- addresses:
  - ip: 192.168.1.17
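For context, outside-redis.yaml presumably pairs a Service with manual Endpoints so Prometheus can scrape a redis exporter running outside the cluster; a minimal sketch, where the object names and the 9121 redis_exporter port are assumptions:
apiVersion: v1
kind: Service
metadata:
  name: outside-redis          # assumed name
  namespace: monitoring
  labels:
    k8s-app: outside-redis
spec:
  ports:
  - name: metrics
    port: 9121                 # default redis_exporter port
    targetPort: 9121
---
apiVersion: v1
kind: Endpoints
metadata:
  name: outside-redis          # must match the Service name
  namespace: monitoring
subsets:
- addresses:
  - ip: 192.168.1.17           # the external host from the step above
  ports:
  - name: metrics
    port: 9121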
k create -f outside-redis.yaml
Import the redis grafana dashboard: https://grafana.com/grafana/dashboards/763
18. Monitoring a redis cluster
Go into bug/14-outside-redis-cluster-exporter and edit the redis cluster address; a sketch of the scrape job follows.
k create -f prometheus-additional.yaml
In grafana, first import the 763 dashboard above, then import the redis-cluster集群.json template file from the directory.
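A common pattern for a cluster is one redis_exporter probing every node through its /scrape endpoint; a sketch of that job (the node addresses and the exporter address are assumptions):
- job_name: 'redis-cluster'
  static_configs:
  - targets:                             # one redis:// URI per cluster node
    - redis://192.168.1.17:7000
    - redis://192.168.1.17:7001
    - redis://192.168.1.17:7002
  metrics_path: /scrape
  relabel_configs:
  - source_labels: [__address__]
    target_label: __param_target
  - source_labels: [__param_target]
    target_label: instance
  - target_label: __address__
    replacement: 192.168.1.17:9121       # where redis_exporter itself listens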
19. node_exporter
First install and start node_exporter on each target node you want to monitor (see the sketch after this step).
Go into bug/11-outside-exporter and edit the IPs of the nodes to monitor.
k create -f .
Import the node_exporter grafana dashboard: 8919
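A minimal way to get node_exporter onto a plain Linux host (the version and paths here are assumptions; pick the current release for your platform):
# download, unpack and start; verify afterwards with: curl 127.0.0.1:9100/metrics
wget https://github.com/prometheus/node_exporter/releases/download/v1.2.2/node_exporter-1.2.2.linux-amd64.tar.gz
tar xf node_exporter-1.2.2.linux-amd64.tar.gz -C /usr/local/
cd /usr/local/node_exporter-1.2.2.linux-amd64
nohup ./node_exporter --web.listen-address=":9100" &>/dev/null &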
20. Monitoring an elasticsearch cluster
First download elasticsearch-exporter and run the following commands; in my setup a single node runs three elasticsearch instances:
nohup ./elasticsearch_exporter --web.listen-address ":9700" --es.uri http://192.168.1.15:9200 &>>/dev/null &
nohup ./elasticsearch_exporter --web.listen-address ":9701" --es.uri http://192.168.1.15:9201 &>>/dev/null &
nohup ./elasticsearch_exporter --web.listen-address ":9702" --es.uri http://192.168.1.15:9202 &>>/dev/null &
Check that it returns data; if it does, the exporter is working:
curl 127.0.0.1:9700/metrics
Go into the bug/15-outside-elasearch-exporter directory and run:
k delete secret additional-configs
kubectl create secret generic additional-configs --from-file=prometheus-additional.yaml -n monitoring
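The matching scrape job in prometheus-additional.yaml would look roughly like this (the job name is an assumption; the targets are the three exporters started above):
- job_name: 'elasticsearch'
  static_configs:
  - targets:
    - 192.168.1.15:9700
    - 192.168.1.15:9701
    - 192.168.1.15:9702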
Import the elasticsearch_exporter grafana dashboard 6483:
https://grafana.com/grafana/dashboards/6483
If elasticsearch requires a username and password:
/usr/local/elasticsearch_exporter/elasticsearch_exporter --web.listen-address ":9308" --es.uri=http://username:[email protected]:9200
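Processes started with nohup do not survive a reboot; a small systemd unit is the sturdier way to run the exporter (unit name and paths are assumptions):
# /etc/systemd/system/elasticsearch_exporter.service
[Unit]
Description=elasticsearch_exporter
After=network.target

[Service]
ExecStart=/usr/local/elasticsearch_exporter/elasticsearch_exporter --web.listen-address=":9700" --es.uri=http://192.168.1.15:9200
Restart=on-failure

[Install]
WantedBy=multi-user.target
# enable with: systemctl daemon-reload && systemctl enable --now elasticsearch_exporter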
----------------------------------------------------------------------------------------------
troubleshooting:
I'll try to recall as many of the problems I hit as I can, because there were a lot of pitfalls along the way.
----------------------------------------------------------------------------------------------
# Global configuration
global:
  resolve_timeout: 5m                       # how long to wait before declaring an alert resolved; default 5m
  smtp_smarthost: 'smtp.exmail.qq.com:587'  # 587 is the SMTP submission port with STARTTLS, hence require_tls: true
  smtp_from: '[email protected]'
  smtp_auth_username: '[email protected]'
  smtp_auth_password: 'OPcoYoqHGYuiqwd'
  smtp_require_tls: true

# Template files
templates:
----------------------------------------------------------------------------------------------
24-kube-prometheus-0.4.0/manifests/bug/12-warning/email.tmpl
...
Alert level: {{ .Labels.severity }}<br>
Alert type: {{ .Labels.alertname }}<br>
Alert summary: {{ .Annotations.summary }}<br>
Alert time: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }} <br>
</pre>
{{ end }}{{ end -}}
{{- if gt (len .Alerts.Resolved) 0 -}}{{ range .Alerts }}
...
Alert level: {{ .Labels.severity }}<br>
Alert type: {{ .Labels.alertname }}<br>
Recovery summary: {{ .Annotations.summary }}<br>
Alert time: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
Recovery time: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }} <br>
</pre>
{{ end }}{{ end -}}
{{- end }}
(The .Add 28800e9 shifts the UTC timestamps by 28800e9 nanoseconds, i.e. 8 hours, so the mail shows UTC+8 times.)
----------------------------------------------------------------------------------------------
...
rules:
- alert: NodeFilesystemUsage
  expr: |
    100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot"} / node_filesystem_size_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot"}) * 100 > 90
  for: 1m
  labels:
    severity: warning
  ...

# Based on disk growth over the last hour, predict whether the disk will fill within 4 hours
- alert: HostDiskWillFillIn4Hours
  #expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs"}[1h], 4 * 3600) < 0
  expr: predict_linear(node_filesystem_free_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot"}[1h], 4 * 3600) < 0
  for: 5m
  labels:
    severity: warning
  ...

- alert: NodeMemoryUsage
  expr: |
    #100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 90
    node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: "Instance {{ $labels.instance }} memory usage too high"
    description: "{{ $labels.instance }} memory usage is above 90% (current value: {{ $value }})"

- alert: HostCpuLoadAverage
  expr: sum(node_load5) by (instance) > 5
  for: 1m
  annotations:
    summary: "CPU load too high over the last 5 minutes"
    description: "Host {{ $labels.instance }}: 5-minute load average above 5 (current value: {{ $value }})"
  labels:
    severity: 'warning'

- alert: NodeCPUUsage
  expr: |
    100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 90
  ...
    message: '{{ $labels.node }} has been NotReady for more than 10 minutes.'

- alert: UnusualDiskReadRate
  expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 25
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Unusual disk read rate (instance {{ $labels.instance }})"
    description: "Disk is probably reading too much data (> 25 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

- alert: UnusualDiskWriteRate
  expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 20
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Unusual disk write rate (instance {{ $labels.instance }})"
    description: "Disk is probably writing too much data (> 20 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

- alert: UnusualDiskReadLatency
  expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.3 and rate(node_disk_reads_completed_total[1m]) > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Unusual disk read latency (instance {{ $labels.instance }})"
    description: "Disk latency is growing (read operations > 300ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"


- alert: UnusualDiskWriteLatency
  expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.3 and rate(node_disk_writes_completed_total[1m]) > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Unusual disk write latency (instance {{ $labels.instance }})"
    description: "Disk latency is growing (write operations > 300ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

- alert: DiskReadIOPS
  expr: rate(node_disk_reads_completed_total[3m]) > 750
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "disk read iops too high (instance {{ $labels.instance }})"
    description: "Disk read iops is growing (read iops operations > 750)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

- alert: DiskWriteIOPS
  expr: rate(node_disk_writes_completed_total[3m]) > 350
  ...
  annotations:
    summary: "disk write iops too high (instance {{ $labels.instance }})"
    description: "Disk write iops is growing (write iops operations > 350)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

- alert: HostIOWait
  expr: ((sum(increase(node_cpu_seconds_total{mode="iowait"}[5m])) by (instance))/(sum(increase(node_cpu_seconds_total[5m])) by (instance)))*100 > 10
  for: 1m
  annotations:
    summary: "Disk I/O wait too high"
    description: "Host {{ $labels.instance }}: iowait above 10% over the last 5 minutes (current value: {{ $value }})"
  labels:
    severity: 'warning'


- alert: HostOomKillDetected
  expr: increase(node_vmstat_oom_kill[5m]) > 0
  ...
  annotations:
    summary: Host OOM kill detected (instance {{ $labels.instance }})
    description: "OOM kill detected VALUE = {{ $value }} LABELS: {{ $labels }}"

- alert: HostNetworkConnection-ESTABLISHED
  expr: sum(node_netstat_Tcp_CurrEstab) by (instance) > 1500
  for: 5m
  labels:
    severity: 'warning'
  annotations:
    summary: "Host ESTABLISHED connection count too high"
    description: "Host {{ $labels.instance }}: ESTABLISHED connections above 1500 (current value: {{ $value }})"

- alert: HostNetworkConnection-TIME_WAIT
  expr: sum(node_sockstat_TCP_tw) by (instance) > 3000
  for: 5m
  labels:
    severity: 'warning'
  annotations:
    summary: "Host TIME_WAIT connection count too high"
    description: "Host {{ $labels.instance }}: TIME_WAIT connections above 3000 (current value: {{ $value }})"

- alert: HostOutOfInodes
  expr: node_filesystem_files_free{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot"} / node_filesystem_files{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot"} * 100 < 10
  for: 2m
  labels:
    severity: 'warning'
  annotations:
    summary: "Host partition running out of inodes"
    description: "Host {{ $labels.instance }}: partition {{ $labels.mountpoint }} is low on inodes (free: {{ $value }}%)"
----------------------------------------------------------------------------------------------
...
rules:
- alert: PodCPUUsage
  expr: |
    sum by(pod, namespace) (rate(container_cpu_usage_seconds_total{image!=""}[5m]) * 100) > 90
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "namespace: {{ $labels.namespace }} | pod: {{ $labels.pod }} CPU usage above 90% (current value: {{ $value }})"

- alert: PodMemoryUsage
  expr: |
    ...