diff --git a/kustomize/monitoring/grafana/dashboards/postgresql_details.json b/kustomize/monitoring/grafana/dashboards/postgresql_details.json index 90866777..1df1621b 100644 --- a/kustomize/monitoring/grafana/dashboards/postgresql_details.json +++ b/kustomize/monitoring/grafana/dashboards/postgresql_details.json @@ -151,7 +151,7 @@ "pluginVersion": "7.4.5", "targets": [ { - "expr": "min(ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} < ccp_backrest_last_diff_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} < ccp_backrest_last_full_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"}) ", + "expr": "min(ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"}, ccp_backrest_last_diff_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"}, ccp_backrest_last_full_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"}) ", "format": "time_series", "interval": "", "intervalFactor": 1, diff --git a/kustomize/monitoring/grafana/dashboards/postgresql_overview.json b/kustomize/monitoring/grafana/dashboards/postgresql_overview.json index b965463f..f9bf2e94 100644 --- a/kustomize/monitoring/grafana/dashboards/postgresql_overview.json +++ b/kustomize/monitoring/grafana/dashboards/postgresql_overview.json @@ -163,7 +163,7 @@ "targets": [ { "$hashKey": "object:243", - "expr": "sum(pg_up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"}) or sum(up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"})", + "expr": "sum(pg_up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"}) or sum(patroni_postgres_running{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"})", "format": "time_series", "interval": "", "intervalFactor": 1, diff --git a/kustomize/monitoring/grafana/dashboards/prometheus_alerts.json b/kustomize/monitoring/grafana/dashboards/prometheus_alerts.json index f41aa481..e0090cf8 100644 --- a/kustomize/monitoring/grafana/dashboards/prometheus_alerts.json +++ b/kustomize/monitoring/grafana/dashboards/prometheus_alerts.json @@ -136,7 +136,7 @@ "pluginVersion": "7.4.5", "targets": [ { - "expr": "count(count by (kubernetes_namespace) (pg_up)) or count(count by (kubernetes_namespace) (up))", + "expr": "sum(count by (kubernetes_namespace) (pg_up{pg_cluster!=''})) + sum(count by (kubernetes_namespace) (patroni_postgres_running{pg_cluster!=''}))", "format": "time_series", "instant": true, "interval": "", @@ -208,7 +208,7 @@ "pluginVersion": "7.4.5", "targets": [ { - "expr": "count(count by (pg_cluster) (pg_up)) or count(count by (pg_cluster) (up))", + "expr": "sum(count by (pg_cluster) (pg_up{pg_cluster!=''})) + sum(count by (pg_cluster) (patroni_postgres_running{pg_cluster!=''}))", "format": "time_series", "instant": true, "interval": "", @@ -280,7 +280,7 @@ "pluginVersion": "7.4.5", "targets": [ { - "expr": "count(pg_up) or count(up)", + "expr": "sum(count(pg_up{pg_cluster!=''})) + sum(count(patroni_postgres_running{pg_cluster!=''}))", "format": "time_series", "instant": true, "interval": "", diff --git a/kustomize/monitoring/prometheus/config/crunchy-alert-rules-pg.yml b/kustomize/monitoring/prometheus/config/crunchy-alert-rules-pg.yml index 83f666e4..b68a197e 100644 --- a/kustomize/monitoring/prometheus/config/crunchy-alert-rules-pg.yml +++ b/kustomize/monitoring/prometheus/config/crunchy-alert-rules-pg.yml @@ -22,7 +22,7 @@ groups: ########## SYSTEM RULES ########## - alert: ExporterDown - expr: avg_over_time(up[5m]) < 0.5 + expr: avg_over_time(up{job=~"crunchy-otel-collector|crunchy-postgres-exporter",exported_job!="patroni"}[5m]) < 0.5 for: 10s labels: service: system @@ -35,15 +35,35 @@ groups: ########## POSTGRESQL RULES ########## - alert: PGIsUp - expr: "pg_up < 1 or up < 1" + expr: "pg_up < 1 or patroni_postgres_running < 1" for: 60s labels: service: postgresql severity: critical severity_num: 300 annotations: - summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database' + summary: 'Metrics exporter running on {{ $labels.job }} is unable to communicate with the configured database' + - alert: PGNoPrimary + expr: max by (cluster_name) (ccp_is_in_recovery_status) < 2 + for: 30s + labels: + service: postgresql + severity: critical + severity_num: 300 + annotations: + summary: 'cluster {{ $labels.cluster_name }} does not have a primary instance' + +# Alert on missing or absent replicas +# - alert: PGNoReplica +# expr: min by (cluster_name) (ccp_is_in_recovery_status) > 1 +# for: 30s +# labels: +# service: postgresql +# severity: critical +# severity_num: 300 +# annotations: +# summary: 'cluster {{ $labels.cluster_name }} does not have a replica instance' # Example to check for current version of PostgreSQL. Metric returns the version that the exporter is running on, so you can set a rule to check for the minimum version you'd like all systems to be on. Number returned is the 6 digit integer representation contained in the setting "server_version_num". #