Skip to content

Commit

Permalink
monitoring: remove instance label from ceph-cluster.json completely
Browse files Browse the repository at this point in the history
The `instance` label is only useful if

- the exporter returns only data about its node or instance
- the exporter provides an instance label and then may return data about
  other nodes

In this case, it's about the Prometheus mgr module, which is a single
exporter providing data about the whole cluster, not only data related
to the node (or instance) the mgr module is running on.  It is
completely irrelevant which node the exporter runs on; the data
provided doesn't change.  The exporter also doesn't provide `instance`
labels (which Prometheus wouldn't change due to our configuration, see
"honor_labels" setting).

(Actually there's one exception where `instance` labels are provided by
the Ceph mgr module, but that doesn't affect the Ceph Cluster
dashboard.)

Note that keeping the instance label on this particular dashboard would
enable the user to switch between the data collected from a previously
failed mgr instance and the data from the currently running mgr instance
(on which the Prometheus mgr module runs).  That'd split the data, which
I don't think is a useful feature; rather, it looks broken.

Fixes: https://tracker.ceph.com/issues/51212

Signed-off-by: Patrick Seidensal <[email protected]>
  • Loading branch information
p-se committed Jun 16, 2021
1 parent 4270a13 commit 0374107
Showing 1 changed file with 20 additions and 43 deletions.
63 changes: 20 additions & 43 deletions monitoring/grafana/dashboards/ceph-cluster.json
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@
"tableColumn": "",
"targets": [
{
"expr": "ceph_health_status{instance=~'$instance'}",
"expr": "ceph_health_status",
"format": "time_series",
"instant": true,
"interval": "$interval",
Expand Down Expand Up @@ -175,7 +175,7 @@
"displayAliasType": "Always",
"displayType": "Regular",
"displayValueWithAlias": "When Alias Displayed",
"expr": "count(ceph_osd_metadata{instance=~\"$instance\"})",
"expr": "count(ceph_osd_metadata)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "All",
Expand All @@ -190,7 +190,7 @@
"displayAliasType": "Always",
"displayType": "Regular",
"displayValueWithAlias": "When Alias Displayed",
"expr": "sum(ceph_osds_in{instance=~\"$instance\"})",
"expr": "sum(ceph_osds_in)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "In",
Expand All @@ -205,7 +205,7 @@
"displayAliasType": "Warning / Critical",
"displayType": "Regular",
"displayValueWithAlias": "When Alias Displayed",
"expr": "sum(ceph_osd_in{instance=~\"$instance\"} == bool 0)",
"expr": "sum(ceph_osd_in == bool 0)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
Expand All @@ -222,7 +222,7 @@
"displayAliasType": "Always",
"displayType": "Regular",
"displayValueWithAlias": "When Alias Displayed",
"expr": "sum(ceph_osd_up{instance=~\"$instance\"})",
"expr": "sum(ceph_osd_up)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Up",
Expand All @@ -238,7 +238,7 @@
"displayAliasType": "Warning / Critical",
"displayType": "Regular",
"displayValueWithAlias": "When Alias Displayed",
"expr": "sum(ceph_osd_up{instance=~\"$instance\"} == bool 0)",
"expr": "sum(ceph_osd_up == bool 0)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Down",
Expand Down Expand Up @@ -313,7 +313,7 @@
"tableColumn": "",
"targets": [
{
"expr": "sum(ceph_osd_stat_bytes_used{instance=~\"$instance\"})/sum(ceph_osd_stat_bytes{instance=~\"$instance\"})",
"expr": "sum(ceph_osd_stat_bytes_used)/sum(ceph_osd_stat_bytes)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Used",
Expand Down Expand Up @@ -531,28 +531,28 @@
"steppedLine": false,
"targets": [
{
"expr": "quantile(0.95, ceph_osd_apply_latency_ms{instance=~\"$instance\"})",
"expr": "quantile(0.95, ceph_osd_apply_latency_ms)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Apply Latency P_95",
"refId": "A"
},
{
"expr": "quantile(0.95, ceph_osd_commit_latency_ms{instance=~\"$instance\"})",
"expr": "quantile(0.95, ceph_osd_commit_latency_ms)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Commit Latency P_95",
"refId": "B"
},
{
"expr": "avg(ceph_osd_apply_latency_ms{instance=~\"$instance\"})",
"expr": "avg(ceph_osd_apply_latency_ms)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Avg Apply Latency",
"refId": "C"
},
{
"expr": "avg(ceph_osd_commit_latency_ms{instance=~\"$instance\"})",
"expr": "avg(ceph_osd_commit_latency_ms)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Avg Commit Latency",
Expand Down Expand Up @@ -630,7 +630,7 @@
"displayAliasType": "Always",
"displayType": "Regular",
"displayValueWithAlias": "When Alias Displayed",
"expr": "sum(ceph_mon_quorum_status{instance=~\"$instance\"})",
"expr": "sum(ceph_mon_quorum_status)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
Expand All @@ -647,7 +647,7 @@
"displayAliasType": "Always",
"displayType": "Regular",
"displayValueWithAlias": "When Alias Displayed",
"expr": "count(ceph_mon_quorum_status{instance=~\"$instance\"})",
"expr": "count(ceph_mon_quorum_status)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Total",
Expand All @@ -664,7 +664,7 @@
"displayAliasType": "Warning / Critical",
"displayType": "Annotation",
"displayValueWithAlias": "Never",
"expr": "count(ceph_mon_quorum_status{instance=~\"$instance\"}) / sum(ceph_mon_quorum_status{instance=~\"$instance\"})",
"expr": "count(ceph_mon_quorum_status) / sum(ceph_mon_quorum_status)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "MONs out of Quorum",
Expand Down Expand Up @@ -711,7 +711,7 @@
"displayAliasType": "Always",
"displayType": "Regular",
"displayValueWithAlias": "When Alias Displayed",
"expr": "ceph_mds_server_handle_client_session{instance=~\"$instance\"}",
"expr": "ceph_mds_server_handle_client_session",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Clients",
Expand Down Expand Up @@ -765,14 +765,14 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(ceph_osd_op_w_in_bytes{instance=~\"$instance\"}[1m]))",
"expr": "sum(irate(ceph_osd_op_w_in_bytes[1m]))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Writes",
"refId": "A"
},
{
"expr": "sum(irate(ceph_osd_op_r_out_bytes{instance=~\"$instance\"}[1m]))",
"expr": "sum(irate(ceph_osd_op_r_out_bytes[1m]))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Reads",
Expand Down Expand Up @@ -852,7 +852,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(deriv(ceph_pool_stored{instance=~\"$instance\"}[1m]))",
"expr": "sum(deriv(ceph_pool_stored[1m]))",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
Expand Down Expand Up @@ -925,7 +925,7 @@
"span": 12,
"targets": [
{
"expr": "ceph_osd_stat_bytes_used{instance=~'$instance'} / ceph_osd_stat_bytes{instance=~'$instance'}",
"expr": "ceph_osd_stat_bytes_used / ceph_osd_stat_bytes",
"format": "time_series",
"interval": "1m",
"intervalFactor": 1,
Expand Down Expand Up @@ -987,7 +987,7 @@
"links": [],
"targets": [
{
"expr": "ceph_osd_numpg{instance=~\"$instance\"}",
"expr": "ceph_osd_numpg",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "#PGs",
Expand Down Expand Up @@ -1191,29 +1191,6 @@
"query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d",
"refresh": 2,
"type": "interval"
},
{
"allFormat": "glob",
"allValue": null,
"current": {},
"datasource": "$datasource",
"hide": 0,
"hideLabel": false,
"includeAll": true,
"label": "Exporter Instance",
"multi": false,
"multiFormat": "glob",
"name": "instance",
"options": [],
"query": "label_values(ceph_health_status, instance)",
"refresh": 1,
"regex": "",
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
Expand Down

0 comments on commit 0374107

Please sign in to comment.