-
Notifications
You must be signed in to change notification settings - Fork 45
/
Copy path240-pg_slot.yml
338 lines (331 loc) · 17 KB
/
240-pg_slot.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
##
# SYNOPSIS
# pg_slot.pg_slot_14_*
#
# DESCRIPTION
# PostgreSQL replication slot metrics v14 with wal safe size and status
#
# OPTIONS
# Tags [cluster, primary]
# TTL 10
# Priority 0
# Timeout 100ms
# Fatal false
# Version 140000 ~ higher
# Source 240-pg_slot.yml
#
# METRICS
# slot_name (LABEL)
# A unique, cluster-wide identifier for the replication slot
# datname (LABEL)
# The name of the database this slot is associated with, or null for physical slot.
# active (GAUGE)
# True(1) if this slot is currently actively being used
# temporary (GAUGE)
# True(1) if this is a temporary replication slot.
# xmin (COUNTER)
# The oldest transaction that this slot needs the database to retain.
# catalog_xmin (COUNTER)
# The oldest transaction affecting the system catalogs that this slot needs the database to retain.
# restart_lsn (COUNTER)
# The address (LSN) of oldest WAL which still might be required by the consumer of this slot
# confirm_lsn (COUNTER)
# The address (LSN) up to which the logical slot's consumer has confirmed receiving data.
# retained_bytes (GAUGE)
# Size of bytes that retained for this slot
# safe_wal_size (GAUGE)
# bytes that can be written to WAL which will not make slot into lost
# wal_status (GAUGE)
# WAL reserve status 0-3 means reserved,extended,unreserved,lost, -1 means other
# spill_txns (COUNTER)
# Xacts that spilled to disk due to logical decode mem exceeding (subtrans included)
# spill_count (COUNTER)
# Xacts that spilled to disk due to logical decode mem exceeding (a xact can be spilled multiple times)
# spill_bytes (COUNTER)
# Bytes that spilled to disk due to logical decode mem exceeding
# stream_txns (COUNTER)
# Xacts that streamed to decoding output plugin after mem exceed
# stream_count (COUNTER)
# Xacts that streamed to decoding output plugin after mem exceed (a xact can be streamed multiple times)
# stream_bytes (COUNTER)
# Bytes that streamed to decoding output plugin after mem exceed
# total_txns (COUNTER)
# Number of decoded xacts sent to the decoding output plugin for this slot
# total_bytes (COUNTER)
# Number of decoded bytes sent to the decoding output plugin for this slot
# reset_time (COUNTER)
# Seconds elapsed since statistics were last reset (now() - stats_reset)
#
# pg_slot_14: replication slot metrics for PostgreSQL 14 and later.
# Every row of pg_replication_slots is reported; the logical-decoding statistics
# come from a LEFT JOIN on pg_stat_replication_slots, so they are NULL for any
# slot that has no matching statistics row.
pg_slot_14:
  name: pg_slot
  desc: PostgreSQL replication slot metrics v14 with wal safe size and status
  # Query notes:
  #   * LSN columns are converted to numbers by subtracting the zero LSN '0/0'.
  #   * xid columns are cast through TEXT to obtain a BIGINT.
  #   * retained_bytes is measured from the replay LSN while in recovery,
  #     otherwise from the current WAL write LSN.
  #   * wal_status is encoded as 0-3 (reserved, extended, unreserved, lost), -1 otherwise.
  #   * reset_time is the number of seconds elapsed since stats_reset, not a timestamp.
  query: |
    SELECT
      s.slot_name,
      s.database AS datname,
      s.active,
      s.temporary,
      s.xmin::TEXT::BIGINT AS xmin,
      s.catalog_xmin::TEXT::BIGINT AS catalog_xmin,
      s.restart_lsn - '0/0' AS restart_lsn,
      s.confirmed_flush_lsn - '0/0' AS confirm_lsn,
      CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - s.restart_lsn AS retained_bytes,
      s.safe_wal_size,
      CASE s.wal_status
        WHEN 'reserved' THEN 0
        WHEN 'extended' THEN 1
        WHEN 'unreserved' THEN 2
        WHEN 'lost' THEN 3
        ELSE -1
      END AS wal_status,
      ss.spill_txns,
      ss.spill_count,
      ss.spill_bytes,
      ss.stream_txns,
      ss.stream_count,
      ss.stream_bytes,
      ss.total_txns,
      ss.total_bytes,
      EXTRACT(EPOCH FROM now() - ss.stats_reset) AS reset_time
    FROM pg_replication_slots AS s
    LEFT JOIN pg_stat_replication_slots AS ss
      ON s.slot_name = ss.slot_name;
  ttl: 10
  min_version: 140000
  tags:
    - cluster
    - primary
  metrics:
    - slot_name:
        usage: LABEL
        description: A unique, cluster-wide identifier for the replication slot
    - datname:
        usage: LABEL
        description: The name of the database this slot is associated with, or null for physical slot.
        # Only logical slots have an associated database.
    - active:
        usage: GAUGE
        description: True(1) if this slot is currently actively being used
    - temporary:
        usage: GAUGE
        description: True(1) if this is a temporary replication slot.
        # Temporary slots are not saved to disk and are dropped automatically on error or when the session ends.
    - xmin:
        usage: COUNTER
        description: The oldest transaction that this slot needs the database to retain.
        # VACUUM cannot remove tuples deleted by any later transaction.
    - catalog_xmin:
        usage: COUNTER
        description: The oldest transaction affecting the system catalogs that this slot needs the database to retain.
        # VACUUM cannot remove catalog tuples deleted by any later transaction.
    - restart_lsn:
        usage: COUNTER
        description: The address (LSN) of oldest WAL which still might be required by the consumer of this slot
        # WAL from this LSN onward is not removed during checkpoints unless the slot falls more than
        # max_slot_wal_keep_size behind the current LSN. NULL if the LSN of this slot has never been reserved.
    - confirm_lsn:
        usage: COUNTER
        description: The address (LSN) up to which the logical slot's consumer has confirmed receiving data.
        # Data older than this is not available anymore. NULL for physical slots.
    - retained_bytes:
        usage: GAUGE
        description: Size of bytes that retained for this slot
    - safe_wal_size:
        usage: GAUGE
        description: bytes that can be written to WAL which will not make slot into lost
    - wal_status:
        usage: GAUGE
        description: WAL reserve status 0-3 means reserved,extended,unreserved,lost, -1 means other
    - spill_txns:
        usage: COUNTER
        description: Xacts that spilled to disk due to logical decode mem exceeding (subtrans included)
        # Transactions spilled to disk once logical decoding memory exceeded logical_decoding_work_mem.
        # Incremented for both toplevel transactions and subtransactions.
    - spill_count:
        usage: COUNTER
        description: Xacts that spilled to disk due to logical decode mem exceeding (a xact can be spilled multiple times)
        # Incremented each time a transaction is spilled; the same transaction may be spilled multiple times.
    - spill_bytes:
        usage: COUNTER
        description: Bytes that spilled to disk due to logical decode mem exceeding
        # Together with the other spill counters this gauges decoding I/O and helps tune logical_decoding_work_mem.
    - stream_txns:
        usage: COUNTER
        description: Xacts that streamed to decoding output plugin after mem exceed
        # In-progress transactions streamed to the output plugin after logical_decoding_work_mem was exceeded.
        # Only toplevel transactions are streamed, so subtransactions are not counted.
    - stream_count:
        usage: COUNTER
        description: Xacts that streamed to decoding output plugin after mem exceed (a xact can be streamed multiple times)
        # Incremented each time a transaction is streamed; the same transaction may be streamed multiple times.
    - stream_bytes:
        usage: COUNTER
        description: Bytes that streamed to decoding output plugin after mem exceed
        # Together with the other streaming counters this helps tune logical_decoding_work_mem.
    - total_txns:
        usage: COUNTER
        description: Number of decoded xacts sent to the decoding output plugin for this slot
        # Counts toplevel transactions only; includes transactions that were streamed and/or spilled.
    - total_bytes:
        usage: COUNTER
        description: Number of decoded bytes sent to the decoding output plugin for this slot
        # Includes data that was streamed and/or spilled.
    - reset_time:
        usage: COUNTER
        description: Seconds elapsed since the slot statistics were last reset
        # The query emits now() - stats_reset as epoch seconds, i.e. an age that grows until the next reset,
        # not the wall-clock time of the reset itself.
##
# SYNOPSIS
# pg_slot.pg_slot_13_*
#
# DESCRIPTION
# PostgreSQL replication slot metrics v13 (wal safe size and status)
#
# OPTIONS
# Tags [cluster, primary]
# TTL 10
# Priority 0
# Timeout 100ms
# Fatal false
# Version 130000 ~ 140000
# Source 240-pg_slot.yml
#
# METRICS
# slot_name (LABEL)
# A unique, cluster-wide identifier for the replication slot
# datname (LABEL)
# The name of the database this slot is associated with, or null for physical slot.
# active (GAUGE)
# True(1) if this slot is currently actively being used
# temporary (GAUGE)
# True(1) if this is a temporary replication slot.
# xmin (COUNTER)
# The oldest transaction that this slot needs the database to retain.
# catalog_xmin (COUNTER)
# The oldest transaction affecting the system catalogs that this slot needs the database to retain.
# restart_lsn (COUNTER)
# The address (LSN) of oldest WAL which still might be required by the consumer of this slot
# confirm_lsn (COUNTER)
# The address (LSN) up to which the logical slot's consumer has confirmed receiving data.
# retained_bytes (GAUGE)
# Size of bytes that retained for this slot
# safe_wal_size (GAUGE)
# bytes that can be written to WAL which will not make slot into lost
# wal_status (GAUGE)
# WAL reserve status 0-3 means reserved,extended,unreserved,lost, -1 means other
#
# pg_slot_13: replication slot metrics for PostgreSQL 13.
# Reads pg_replication_slots only and includes the wal_status / safe_wal_size columns.
pg_slot_13:
  name: pg_slot
  desc: PostgreSQL replication slot metrics v13 (wal safe size and status)
  # Query notes:
  #   * LSN columns are converted to numbers by subtracting the zero LSN '0/0'.
  #   * xid columns are cast through TEXT to obtain a BIGINT.
  #   * retained_bytes is measured from the replay LSN while in recovery,
  #     otherwise from the current WAL write LSN.
  #   * wal_status is encoded as 0-3 (reserved, extended, unreserved, lost), -1 otherwise.
  query: |
    SELECT
      slot_name,
      database AS datname,
      active,
      temporary,
      xmin::TEXT::BIGINT AS xmin,
      catalog_xmin::TEXT::BIGINT AS catalog_xmin,
      restart_lsn - '0/0' AS restart_lsn,
      confirmed_flush_lsn - '0/0' AS confirm_lsn,
      CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - restart_lsn AS retained_bytes,
      safe_wal_size,
      CASE wal_status
        WHEN 'reserved' THEN 0
        WHEN 'extended' THEN 1
        WHEN 'unreserved' THEN 2
        WHEN 'lost' THEN 3
        ELSE -1
      END AS wal_status
    FROM pg_replication_slots;
  ttl: 10
  # Version window for this collector; NOTE(review): upper bound is presumably exclusive — confirm against the exporter.
  min_version: 130000
  max_version: 140000
  tags: [cluster, primary]
  metrics:
    - slot_name:
        usage: LABEL
        description: A unique, cluster-wide identifier for the replication slot
    - datname:
        usage: LABEL
        description: The name of the database this slot is associated with, or null for physical slot.
        # Only logical slots have an associated database.
    - active:
        usage: GAUGE
        description: True(1) if this slot is currently actively being used
    - temporary:
        usage: GAUGE
        description: True(1) if this is a temporary replication slot.
        # Temporary slots are not saved to disk and are dropped automatically on error or when the session ends.
    - xmin:
        usage: COUNTER
        description: The oldest transaction that this slot needs the database to retain.
        # VACUUM cannot remove tuples deleted by any later transaction.
    - catalog_xmin:
        usage: COUNTER
        description: The oldest transaction affecting the system catalogs that this slot needs the database to retain.
        # VACUUM cannot remove catalog tuples deleted by any later transaction.
    - restart_lsn:
        usage: COUNTER
        description: The address (LSN) of oldest WAL which still might be required by the consumer of this slot
        # WAL from this LSN onward is not removed during checkpoints unless the slot falls more than
        # max_slot_wal_keep_size behind the current LSN. NULL if the LSN of this slot has never been reserved.
    - confirm_lsn:
        usage: COUNTER
        description: The address (LSN) up to which the logical slot's consumer has confirmed receiving data.
        # Data older than this is not available anymore. NULL for physical slots.
    - retained_bytes:
        usage: GAUGE
        description: Size of bytes that retained for this slot
    - safe_wal_size:
        usage: GAUGE
        description: bytes that can be written to WAL which will not make slot into lost
    - wal_status:
        usage: GAUGE
        description: WAL reserve status 0-3 means reserved,extended,unreserved,lost, -1 means other
##
# SYNOPSIS
# pg_slot.pg_slot_10_12_*
#
# DESCRIPTION
# PostgreSQL replication slot metrics v10 v11 v12
#
# OPTIONS
# Tags [cluster, primary]
# TTL 10
# Priority 0
# Timeout 100ms
# Fatal false
# Version 100000 ~ 130000
# Source 240-pg_slot.yml
#
# METRICS
# slot_name (LABEL)
# A unique, cluster-wide identifier for the replication slot
# datname (LABEL)
# The name of the database this slot is associated with, or null for physical slot.
# active (GAUGE)
# True(1) if this slot is currently actively being used
# temporary (GAUGE)
# True(1) if this is a temporary replication slot.
# xmin (COUNTER)
# The oldest transaction that this slot needs the database to retain.
# catalog_xmin (COUNTER)
# The oldest transaction affecting the system catalogs that this slot needs the database to retain.
# restart_lsn (COUNTER)
# The address (LSN) of oldest WAL which still might be required by the consumer of this slot
# confirm_lsn (COUNTER)
# The address (LSN) up to which the logical slot's consumer has confirmed receiving data.
# retained_bytes (GAUGE)
# Size of bytes that retained for this slot
#
# pg_slot_10_12: replication slot metrics for PostgreSQL 10, 11 and 12.
# Reads pg_replication_slots only; no wal_status / safe_wal_size columns are selected here.
pg_slot_10_12:
  name: pg_slot
  desc: PostgreSQL replication slot metrics v10 v11 v12
  # Query notes:
  #   * LSN columns are converted to numbers by subtracting the zero LSN '0/0'.
  #   * xid columns are cast through TEXT to obtain a BIGINT.
  #   * retained_bytes is measured from the replay LSN while in recovery,
  #     otherwise from the current WAL write LSN.
  query: |
    SELECT
      slot_name,
      database AS datname,
      active,
      temporary,
      xmin::TEXT::BIGINT AS xmin,
      catalog_xmin::TEXT::BIGINT AS catalog_xmin,
      restart_lsn - '0/0' AS restart_lsn,
      confirmed_flush_lsn - '0/0' AS confirm_lsn,
      CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - restart_lsn AS retained_bytes
    FROM pg_replication_slots;
  ttl: 10
  # Version window for this collector; NOTE(review): upper bound is presumably exclusive — confirm against the exporter.
  min_version: 100000
  max_version: 130000
  # Replication slots cannot be created on a replica for now, hence the primary tag.
  tags: [cluster, primary]
  metrics:
    - slot_name:
        usage: LABEL
        description: A unique, cluster-wide identifier for the replication slot
    - datname:
        usage: LABEL
        description: The name of the database this slot is associated with, or null for physical slot.
        # Only logical slots have an associated database.
    - active:
        usage: GAUGE
        description: True(1) if this slot is currently actively being used
    - temporary:
        usage: GAUGE
        description: True(1) if this is a temporary replication slot.
        # Temporary slots are not saved to disk and are dropped automatically on error or when the session ends.
    - xmin:
        usage: COUNTER
        description: The oldest transaction that this slot needs the database to retain.
        # VACUUM cannot remove tuples deleted by any later transaction.
    - catalog_xmin:
        usage: COUNTER
        description: The oldest transaction affecting the system catalogs that this slot needs the database to retain.
        # VACUUM cannot remove catalog tuples deleted by any later transaction.
    - restart_lsn:
        usage: COUNTER
        description: The address (LSN) of oldest WAL which still might be required by the consumer of this slot
        # WAL from this LSN onward is not removed automatically during checkpoints.
        # NULL if the LSN of this slot has never been reserved.
    - confirm_lsn:
        usage: COUNTER
        description: The address (LSN) up to which the logical slot's consumer has confirmed receiving data.
        # Data older than this is not available anymore. NULL for physical slots.
    - retained_bytes:
        usage: GAUGE
        description: Size of bytes that retained for this slot