-
Notifications
You must be signed in to change notification settings - Fork 45
/
pg_exporter.yml
5721 lines (5488 loc) · 237 KB
/
pg_exporter.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#==============================================================#
# Author: Vonng (rh@vonng.com)
# Desc : pg_exporter metrics collector definition
# Ver : PostgreSQL 10 ~ 17+ and pgbouncer 1.9+
# Ctime : 2019-12-09
# Mtime : 2024-06-19
# Copyright (C) 2019-2024 Ruohang Feng
#==============================================================#
#==============================================================#
# 1. Config File
#==============================================================#
# The configuration file for pg_exporter is a YAML file.
# Default configuration are retrieved via following precedence:
# 1. command line args: --config=<config path>
# 2. environment variables: PG_EXPORTER_CONFIG=<config path>
# 3. pg_exporter.yml (Current directory)
# 4. /etc/pg_exporter.yml (config file)
# 5. /etc/pg_exporter (config dir)
#==============================================================#
# 2. Config Format
#==============================================================#
# pg_exporter config could be a single YAML file, or a directory containing a series of separated YAML files.
# each YAML config file is consist of one or more metrics Collector definition. Which are top-level objects
# If a directory is provided, all YAML in that directory will be merged in alphabetic order.
# Collector definition examples are shown below.
#==============================================================#
# 3. Collector Example
#==============================================================#
# # Here is an example of a metrics collector definition
# pg_primary_only: # Collector branch name. Must be UNIQUE among the entire configuration
# name: pg # Collector namespace, used as METRIC PREFIX, set to branch name by default, can be override
# # the same namespace may contain multiple collector branches. It`s the user`s responsibility
# # to make sure that AT MOST ONE collector is picked for each namespace.
#
# desc: PostgreSQL basic information (on primary) # Collector description
# query: | # Metrics Query SQL
#
# SELECT extract(EPOCH FROM CURRENT_TIMESTAMP) AS timestamp,
# pg_current_wal_lsn() - `0/0` AS lsn,
# pg_current_wal_insert_lsn() - `0/0` AS insert_lsn,
# pg_current_wal_lsn() - `0/0` AS write_lsn,
# pg_current_wal_flush_lsn() - `0/0` AS flush_lsn,
# extract(EPOCH FROM now() - pg_postmaster_start_time()) AS uptime,
# extract(EPOCH FROM now() - pg_conf_load_time()) AS conf_reload_time,
# pg_is_in_backup() AS is_in_backup,
# extract(EPOCH FROM now() - pg_backup_start_time()) AS backup_time;
#
# # [OPTIONAL] metadata fields, control collector behavior
# ttl: 10 # Cache TTL: in seconds, how long will pg_exporter cache this collector`s query result.
# timeout: 0.1 # Query Timeout: in seconds, query that exceed this limit will be canceled.
# min_version: 100000 # minimal supported version, boundary IS included. In server version number format,
# max_version: 130000 # maximal supported version, boundary NOT included, In server version number format
# fatal: false # Collector marked `fatal` fails, the entire scrape will abort immediately and marked as failed
# skip: false # Collector marked `skip` will not be installed during the planning procedure
#
# tags: [cluster, primary] # Collector tags, used for planning and scheduling
#
# # tags are list of strings, which could be:
# # * `cluster` marks this query as cluster level, so it will only execute once for the same PostgreSQL Server
# # * `primary` or `master` mark this query can only run on a primary instance (WILL NOT execute if pg_is_in_recovery())
# # * `standby` or `replica` mark this query can only run on a replica instance (WILL execute if pg_is_in_recovery())
# # some special tag prefix have special interpretation:
# # * `dbname:<dbname>` means this query will ONLY be executed on database with name `<dbname>`
# # * `username:<user>` means this query will only be executed when connect with user `<user>`
# # * `extension:<extname>` means this query will only be executed when extension `<extname>` is installed
# # * `schema:<nspname>` means this query will only by executed when schema `<nspname>` exist
# # * `not:<negtag>` means this query WILL NOT be executed when exporter is tagged with `<negtag>`
# # * `<tag>` means this query WILL be executed when exporter is tagged with `<tag>`
# # ( <tag> could not be cluster,primary,standby,master,replica,etc...)
#
# # One or more "predicate queries" may be defined for a metric query. These
# # are run before the main metric query (after any cache hit check). If all
# # of them, when run sequentially, return a single row with a single column
# # boolean true result, the main metric query is executed. If any of them
# # return false or return zero rows, the main query is skipped. If any
# # predicate query returns more than one row, a non-boolean result, or fails
# # with an error the whole query is marked failed. Predicate queries can be
# # used to check for the presence of specific functions, tables, extensions,
# # settings, vendor-specific postgres features etc before running the main query.
#
# predicate_queries:
# - name: predicate query name
# predicate_query: |
# SELECT EXISTS (SELECT 1 FROM information_schema.routines WHERE routine_schema = 'pg_catalog' AND routine_name = 'pg_backup_start_time');
#
# metrics: # List of returned columns, each column must have a `name` and `usage`, `rename` and `description` are optional
# - timestamp: # Column name, should be exactly the same as returned column name
# usage: GAUGE # Metric type, `usage` could be
# * DISCARD: completely ignoring this field
# * LABEL: use columnName=columnValue as a label in metric
# * GAUGE: Mark column as a gauge metric, full name will be `<query.name>_<column.name>`
# * COUNTER: Same as above, except it is a counter rather than a gauge.
# rename: ts # [OPTIONAL] Alias, optional, the alias will be used instead of the column name
# description: xxxx # [OPTIONAL] Description of the column, will be used as a metric description
# default: 0 # [OPTIONAL] Default value, will be used when column is NULL
# scale: 1000 # [OPTIONAL] Scale the value by this factor
# - lsn:
# usage: COUNTER
# description: log sequence number, current write location (on primary)
# - insert_lsn:
# usage: COUNTER
# description: primary only, location of current wal inserting
# - write_lsn:
# usage: COUNTER
# description: primary only, location of current wal writing
# - flush_lsn:
# usage: COUNTER
# description: primary only, location of current wal syncing
# - uptime:
# usage: GAUGE
# description: seconds since postmaster start
# - conf_reload_time:
# usage: GAUGE
# description: seconds since last configuration reload
# - is_in_backup:
# usage: GAUGE
# description: 1 if backup is in progress
# - backup_time:
# usage: GAUGE
# description: seconds since the current backup start. null if don`t have one
#
# .... # you can also use rename & scale to customize the metric name and value:
# - checkpoint_write_time:
# rename: write_time
# usage: COUNTER
# scale: 1e-3
# description: Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in seconds
#==============================================================#
# 4. Collector Presets
#==============================================================#
# pg_exporter is shipped with a series of preset collectors (already numbered and ordered by filename)
#
# 1xx Basic metrics: basic info, metadata, settings
# 2xx Replication metrics: replication, walreceiver, downstream, sync standby, slots, subscription
# 3xx Persist metrics: size, wal, background writer, checkpointer, ssl, checkpoint, recovery, slru cache, shmem usage
# 4xx Activity metrics: backend count group by state, wait event, locks, xacts, queries
# 5xx Progress metrics: clustering, vacuuming, indexing, basebackup, copy
# 6xx Database metrics: pg_database, publication, subscription
# 7xx Object metrics: pg_class, table, index, function, sequence, default partition
# 8xx Optional metrics: optional metrics collector (disable by default, slow queries)
# 9xx Pgbouncer metrics: metrics from pgbouncer admin database `pgbouncer`
#
# 100-599 Metrics for entire database cluster (scrape once)
# 600-899 Metrics for single database instance (scrape for each database ,except for pg_db itself)
#==============================================================#
# 5. Cache TTL
#==============================================================#
# Cache can be used for reducing query overhead, it can be enabled by setting a non-zero value for `ttl`
# It is highly recommended to use cache to avoid duplicate scrapes. Especially when you got multiple Prometheus
# scraping the same instance with slow monitoring queries. Setting `ttl` to zero or leaving blank will disable
# result caching, which is the default behavior
#
# TTL has to be smaller than your scrape interval. 15s scrape interval and 10s TTL is a good start for
# production environment. Some expensive monitoring queries (such as size/bloat check) will have longer `ttl`
# which can also be used as a mechanism to achieve `different scrape frequency`
#==============================================================#
# 6. Query Timeout
#==============================================================#
# Collectors can be configured with an optional Timeout. If the collector`s query executes more than that
# timeout, it will be canceled immediately. Setting the `timeout` to 0 or leaving blank will reset it to
# default timeout 0.1 (100ms). Setting it to any negative number will disable the query timeout feature.
# All queries have a default timeout of 100ms, if exceeded, the query will be canceled immediately to avoid
# avalanche. You can explicitly overwrite that option. but beware: in some extreme cases, if all your
# timeout sum up greater your scrape/cache interval (usually 15s), the queries may still be jammed.
# or, you can just disable potential slow queries.
#==============================================================#
# 7. Version Compatibility
#==============================================================#
# Each collector has two optional version compatibility parameters: `min_version` and `max_version`.
# These two parameters specify the version compatibility of the collector. If target postgres/pgbouncer
# version is less than `min_version`, or higher than `max_version`, the collector will not be installed.
# These two parameters are using PostgreSQL server version number format, which is a 6-digit integer
# format as <major:2 digit><minor:2 digit>:<release: 2 digit>.
# For example, 090600 stands for 9.6 and 120100 stands for 12.1
# And beware that version compatibility range is left-inclusive right exclusive: [min, max), set to zero or
# leaving blank will affect as -inf or +inf
#==============================================================#
# 8. Fatality
#==============================================================#
# If a collector is marked with `fatal` falls, the entire scrape operation will be marked as fail and key metrics
# `pg_up` / `pgbouncer_up` will be reset to 0. It is always a good practice to set up AT LEAST ONE fatal
# collector for pg_exporter. `pg.pg_primary_only` and `pgbouncer_list` are the default fatal collector.
#
# If a collector without `fatal` flag fails, it will increase global fail counters. But the scrape operation
# will carry on. The entire scrape result will not be marked as faile, thus will not affect the `<xx>_up` metric.
#==============================================================#
# 9. Skip
#==============================================================#
# Collector with `skip` flag set to true will NOT be installed.
# This could be a handy option to disable collectors
#==============================================================#
# 10. Tags and Planning
#==============================================================#
# Tags are designed for collector planning & schedule. It can be handy to customize which queries run
# on which instances. And thus you can use one-single monolith config for multiple environments
#
# Tags are a list of strings, each string could be:
# Pre-defined special tags
# * `cluster` marks this collector as cluster level, so it will ONLY BE EXECUTED ONCE for the same PostgreSQL Server
# * `primary` or `master` mark this collector as primary-only, so it WILL NOT work iff pg_is_in_recovery()
# * `standby` or `replica` mark this collector as replica-only, so it WILL work iff pg_is_in_recovery()
# Special tag prefix which have different interpretation:
# * `dbname:<dbname>` means this collector will ONLY work on database with name `<dbname>`
# * `username:<user>` means this collector will ONLY work when connect with user `<user>`
# * `extension:<extname>` means this collector will ONLY work when extension `<extname>` is installed
# * `schema:<nspname>` means this collector will only work when schema `<nspname>` exists
# Customized positive tags (filter) and negative tags (taint)
# * `not:<negtag>` means this collector WILL NOT work when exporter is tagged with `<negtag>`
# * `<tag>` means this query WILL work if exporter is tagged with `<tag>` (special tags not included)
#
# pg_exporter will trigger the Planning procedure after connecting to the target. It will gather database facts
# and match them with tags and other metadata (such as supported version range). Collector will only
# be installed if and only if it is compatible with the target server.
##
# SYNOPSIS
# pg.pg_primary_only_*
#
# DESCRIPTION
# PostgreSQL basic information (on primary)
#
# OPTIONS
# Tags [cluster, primary]
# TTL 1
# Priority 0
# Timeout 100ms
# Fatal true
# Version 100000 ~ higher
# Source 110-pg.yml
#
# METRICS
# timestamp (GAUGE)
# current database timestamp in unix epoch
# uptime (GAUGE)
# seconds since postmaster start
# boot_time (GAUGE)
# postmaster boot timestamp in unix epoch
# lsn (COUNTER)
# log sequence number, current write location
# insert_lsn (COUNTER)
# primary only, location of current wal inserting
# write_lsn (COUNTER)
# primary only, location of current wal writing
# flush_lsn (COUNTER)
# primary only, location of current wal syncing
# receive_lsn (COUNTER)
# replica only, location of wal synced to disk
# replay_lsn (COUNTER)
# replica only, location of wal applied
# conf_reload_time (GAUGE)
# seconds since last configuration reload
# last_replay_time (GAUGE)
# time when last transaction been replayed
# lag (GAUGE)
# replica only, replication lag in seconds
# is_in_recovery (GAUGE)
# 1 if in recovery mode
# is_wal_replay_paused (GAUGE)
# 1 if wal play is paused
# is_in_backup (GAUGE)
# 1 if backup is in progress
# backup_time (GAUGE)
# seconds since current backup start
#
pg_primary_only:
name: pg
desc: PostgreSQL basic information (on primary)
query: |
SELECT extract(EPOCH FROM CURRENT_TIMESTAMP) AS timestamp,
extract(EPOCH FROM now() - pg_postmaster_start_time()) AS uptime,
extract(EPOCH FROM pg_postmaster_start_time()) AS boot_time,
pg_current_wal_lsn() - '0/0' AS lsn,
pg_current_wal_insert_lsn() - '0/0' AS insert_lsn,
pg_current_wal_lsn() - '0/0' AS write_lsn,
pg_current_wal_flush_lsn() - '0/0' AS flush_lsn,
NULL::BIGINT AS receive_lsn,
NULL::BIGINT AS replay_lsn,
extract(EPOCH FROM now() - pg_conf_load_time()) AS conf_reload_time,
NULL::FLOAT AS last_replay_time,
0::FLOAT AS lag,
pg_is_in_recovery() AS is_in_recovery,
FALSE AS is_wal_replay_paused;
tags:
- cluster
- primary
ttl: 1
# timeout: 0.1
min_version: 100000
# max_version: 0
fatal: true
skip: false
metrics:
- timestamp:
name: timestamp
description: current database timestamp in unix epoch
usage: GAUGE
- uptime:
name: uptime
description: seconds since postmaster start
usage: GAUGE
- boot_time:
name: boot_time
description: postmaster boot timestamp in unix epoch
usage: GAUGE
- lsn:
name: lsn
description: log sequence number, current write location
usage: COUNTER
- insert_lsn:
name: insert_lsn
description: primary only, location of current wal inserting
usage: COUNTER
- write_lsn:
name: write_lsn
description: primary only, location of current wal writing
usage: COUNTER
- flush_lsn:
name: flush_lsn
description: primary only, location of current wal syncing
usage: COUNTER
- receive_lsn:
name: receive_lsn
description: replica only, location of wal synced to disk
usage: COUNTER
- replay_lsn:
name: replay_lsn
description: replica only, location of wal applied
usage: COUNTER
- conf_reload_time:
name: conf_reload_time
description: seconds since last configuration reload
usage: GAUGE
- last_replay_time:
name: last_replay_time
description: time when last transaction been replayed
usage: GAUGE
- lag:
name: lag
description: replica only, replication lag in seconds
usage: GAUGE
- is_in_recovery:
name: is_in_recovery
description: 1 if in recovery mode
usage: GAUGE
- is_wal_replay_paused:
name: is_wal_replay_paused
description: 1 if wal play is paused
usage: GAUGE
##
# SYNOPSIS
# pg.pg_replica_only_*
#
# DESCRIPTION
# PostgreSQL basic information (on replica)
#
# OPTIONS
# Tags [cluster, replica]
# TTL 1
# Priority 0
# Timeout 100ms
# Fatal true
# Version 100000 ~ higher
# Source 110-pg.yml
#
# METRICS
# timestamp (GAUGE)
# database current timestamp
# uptime (GAUGE)
# seconds since postmaster start
# boot_time (GAUGE)
# unix timestamp when postmaster boot
# lsn (COUNTER)
# log sequence number, current write location
# insert_lsn (COUNTER)
# primary only, location of current wal inserting
# write_lsn (COUNTER)
# primary only, location of current wal writing
# flush_lsn (COUNTER)
# primary only, location of current wal syncing
# receive_lsn (COUNTER)
# replica only, location of wal synced to disk
# replay_lsn (COUNTER)
# replica only, location of wal applied
# conf_reload_time (GAUGE)
# seconds since last configuration reload
# last_replay_time (GAUGE)
# time when last transaction been replayed
# lag (GAUGE)
# replica only, replication lag in seconds
# is_in_recovery (GAUGE)
# 1 if in recovery mode
# is_wal_replay_paused (GAUGE)
# 1 if wal play paused
# is_in_backup (GAUGE)
# 1 if backup is in progress
# backup_time (GAUGE)
# seconds since current backup start
#
pg_replica_only:
name: pg
desc: PostgreSQL basic information (on replica)
query: |
SELECT extract(EPOCH FROM CURRENT_TIMESTAMP) AS timestamp,
extract(EPOCH FROM now() - pg_postmaster_start_time()) AS uptime,
extract(EPOCH FROM pg_postmaster_start_time()) AS boot_time,
pg_last_wal_replay_lsn() - '0/0' AS lsn,
NULL::BIGINT AS insert_lsn,
NULL::BIGINT AS write_lsn,
NULL::BIGINT AS flush_lsn,
pg_last_wal_receive_lsn() - '0/0' AS receive_lsn,
pg_last_wal_replay_lsn() - '0/0' AS replay_lsn,
extract(EPOCH FROM now() - pg_conf_load_time()) AS conf_reload_time,
extract(EPOCH FROM pg_last_xact_replay_timestamp()) AS last_replay_time,
CASE
WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
ELSE EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp()) END AS lag,
pg_is_in_recovery() AS is_in_recovery,
pg_is_wal_replay_paused() AS is_wal_replay_paused;
tags:
- cluster
- replica
ttl: 1
# timeout: 0.1
min_version: 100000
# max_version: 0
fatal: true
skip: false
metrics:
- timestamp:
name: timestamp
description: database current timestamp
usage: GAUGE
- uptime:
name: uptime
description: seconds since postmaster start
usage: GAUGE
- boot_time:
name: boot_time
description: unix timestamp when postmaster boot
usage: GAUGE
- lsn:
name: lsn
description: log sequence number, current write location
usage: COUNTER
- insert_lsn:
name: insert_lsn
description: primary only, location of current wal inserting
usage: COUNTER
- write_lsn:
name: write_lsn
description: primary only, location of current wal writing
usage: COUNTER
- flush_lsn:
name: flush_lsn
description: primary only, location of current wal syncing
usage: COUNTER
- receive_lsn:
name: receive_lsn
description: replica only, location of wal synced to disk
usage: COUNTER
- replay_lsn:
name: replay_lsn
description: replica only, location of wal applied
usage: COUNTER
- conf_reload_time:
name: conf_reload_time
description: seconds since last configuration reload
usage: GAUGE
- last_replay_time:
name: last_replay_time
description: time when last transaction been replayed
usage: GAUGE
- lag:
name: lag
description: replica only, replication lag in seconds
usage: GAUGE
- is_in_recovery:
name: is_in_recovery
description: 1 if in recovery mode
usage: GAUGE
- is_wal_replay_paused:
name: is_wal_replay_paused
description: 1 if wal play paused
usage: GAUGE
##
# SYNOPSIS
# pg_meta_*
#
# DESCRIPTION
# PostgreSQL meta info
#
# OPTIONS
# Tags [cluster]
# TTL 10
# Priority 0
# Timeout 100ms
# Fatal false
# Version 90600 ~ 130000
# Source 120-pg_meta.yml
#
# METRICS
# cluster_id (LABEL)
# cluster system identifier
# cluster_name (LABEL)
# cluster name
# listen_port (LABEL)
# listen port
# data_dir (LABEL)
# data directory path
# conf_path (LABEL)
# postgresql.conf path
# hba_path (LABEL)
# pg_hba.conf path
# wal_level (LABEL)
# wal level
# version (LABEL)
# server version in human-readable format
# ver_num (LABEL)
# server version number in machine-readable format
# extensions (LABEL)
# server installed preload libraries
# primary_conninfo (LABEL)
# connection string to upstream (do not set password here)
# info (GAUGE)
# constant 1
#
pg_meta:
name: pg_meta
desc: PostgreSQL meta info
query: |
SELECT (SELECT system_identifier FROM pg_control_system()) AS cluster_id,
current_setting('cluster_name') AS cluster_name,
current_setting('port') AS listen_port,
current_setting('data_directory') AS data_dir,
current_setting('config_file') AS conf_path,
current_setting('hba_file') AS hba_path,
current_setting('wal_level') AS wal_level,
current_setting('server_version') AS version,
current_setting('server_version_num') AS ver_num,
current_setting('shared_preload_libraries') AS extensions,
'N/A' AS primary_conninfo,
1 AS info
ttl: 10
min_version: 090600
max_version: 130000
tags:
- cluster
metrics:
- cluster_id:
usage: LABEL
description: cluster system identifier
- cluster_name:
usage: LABEL
description: cluster name
- listen_port:
usage: LABEL
description: listen port
- data_dir:
usage: LABEL
description: data directory path
- conf_path:
usage: LABEL
description: postgresql.conf path
- hba_path:
usage: LABEL
description: pg_hba.conf path
- wal_level:
usage: LABEL
description: wal level
- version:
usage: LABEL
description: server version in human-readable format
- ver_num:
usage: LABEL
description: server version number in machine-readable format
- extensions:
usage: LABEL
description: server installed preload libraries
- primary_conninfo:
usage: LABEL
description: connection string to upstream (do not set password here)
- info:
usage: GAUGE
description: constant 1
##
# SYNOPSIS
# pg_meta.pg_meta_13_*
#
# DESCRIPTION
# PostgreSQL meta info for pg 13, with extra primary conninfo
#
# OPTIONS
# Tags [cluster]
# TTL 10
# Priority 0
# Timeout 100ms
# Fatal false
# Version 130000 ~ higher
# Source 120-pg_meta.yml
#
# METRICS
# cluster_id (LABEL)
# cluster system identifier
# cluster_name (LABEL)
# cluster name
# listen_port (LABEL)
# listen port
# data_dir (LABEL)
# data directory path
# conf_path (LABEL)
# postgresql.conf path
# hba_path (LABEL)
# pg_hba.conf path
# wal_level (LABEL)
# wal level
# version (LABEL)
# server version in human-readable format
# ver_num (LABEL)
# server version number in machine-readable format
# extensions (LABEL)
# server installed preload libraries
# primary_conninfo (LABEL)
# connection string to upstream (do not set password here)
# info (GAUGE)
# constant 1
#
pg_meta_13:
name: pg_meta
desc: PostgreSQL meta info for pg 13, with extra primary conninfo
query: |
SELECT (SELECT system_identifier FROM pg_control_system()) AS cluster_id,
current_setting('cluster_name') AS cluster_name,
current_setting('port') AS listen_port,
current_setting('data_directory') AS data_dir,
current_setting('config_file') AS conf_path,
current_setting('hba_file') AS hba_path,
current_setting('wal_level') AS wal_level,
current_setting('server_version') AS version,
current_setting('server_version_num') AS ver_num,
current_setting('shared_preload_libraries') AS extensions,
current_setting('primary_conninfo') AS primary_conninfo,
1 AS info
ttl: 10
min_version: 130000
tags:
- cluster
metrics:
- cluster_id:
usage: LABEL
description: cluster system identifier
- cluster_name:
usage: LABEL
description: cluster name
- listen_port:
usage: LABEL
description: listen port
- data_dir:
usage: LABEL
description: data directory path
- conf_path:
usage: LABEL
description: postgresql.conf path
- hba_path:
usage: LABEL
description: pg_hba.conf path
- wal_level:
usage: LABEL
description: wal level
- version:
usage: LABEL
description: server version in human-readable format
- ver_num:
usage: LABEL
description: server version number in machine-readable format
- extensions:
usage: LABEL
description: server installed preload libraries
- primary_conninfo:
usage: LABEL
description: connection string to upstream (do not set password here)
- info:
usage: GAUGE
description: constant 1
##
# SYNOPSIS
# pg_setting_*
#
# DESCRIPTION
# Important postgres setting entries that must kept same on entire cluster
#
# OPTIONS
# Tags [cluster]
# TTL 10
# Priority 0
# Timeout 100ms
# Fatal false
# Version 90600 ~ higher
# Source 130-pg_setting.yml
#
# METRICS
# max_connections (GAUGE)
# number of concurrent connections to the database server
# max_prepared_transactions (GAUGE)
# maximum number of transactions that can be in the prepared state simultaneously
# max_worker_processes (GAUGE)
# maximum number of background processes that the system can support
# max_replication_slots (GAUGE)
# maximum number of replication slots
# max_wal_senders (GAUGE)
# maximum number of concurrent connections from standby servers
# max_locks_per_transaction (GAUGE)
# no more than this many distinct objects can be locked at any one time
# block_size (GAUGE)
# pg page block size, 8192 by default
# data_checksums (GAUGE)
# whether data checksum is enabled, 1 enabled 0 disabled
# wal_log_hints (GAUGE)
# whether wal_log_hints is enabled, 1 enabled 0 disabled
#
pg_setting:
name: pg_setting
desc: Important postgres setting entries that must kept same on entire cluster
query: |
SELECT current_setting('max_connections') AS max_connections,
current_setting('max_prepared_transactions') AS max_prepared_transactions,
current_setting('max_worker_processes') AS max_worker_processes,
current_setting('max_replication_slots') AS max_replication_slots,
current_setting('max_wal_senders') AS max_wal_senders,
current_setting('max_locks_per_transaction') AS max_locks_per_transaction,
current_setting('block_size') AS block_size,
CASE current_setting('data_checksums') WHEN 'on' THEN 1 ELSE 0 END AS data_checksums,
CASE current_setting('wal_log_hints') WHEN 'on' THEN 1 ELSE 0 END AS wal_log_hints;
ttl: 10
min_version: 090600
tags:
- cluster
metrics:
- max_connections:
usage: GAUGE
description: number of concurrent connections to the database server
- max_prepared_transactions:
usage: GAUGE
description: maximum number of transactions that can be in the prepared state simultaneously
- max_worker_processes:
usage: GAUGE
description: maximum number of background processes that the system can support
- max_replication_slots:
usage: GAUGE
description: maximum number of replication slots
- max_wal_senders:
usage: GAUGE
description: maximum number of concurrent connections from standby servers
- max_locks_per_transaction:
usage: GAUGE
description: no more than this many distinct objects can be locked at any one time
- block_size:
usage: GAUGE
description: pg page block size, 8192 by default
- data_checksums:
usage: GAUGE
description: whether data checksum is enabled, 1 enabled 0 disabled
- wal_log_hints:
usage: GAUGE
description: whether wal_log_hints is enabled, 1 enabled 0 disabled
##
# SYNOPSIS
# pg_repl.pg_repl_12_*
#
# DESCRIPTION
# PostgreSQL replication stat metrics 12+
#
# OPTIONS
# Tags [cluster]
# TTL 10
# Priority 0
# Timeout 100ms
# Fatal false
# Version 120000 ~ higher
# Source 210-pg_repl.yml
#
# METRICS
# appname (LABEL)
# Name of the application that is connected to this WAL sender
# usename (LABEL)
# Name of the user logged into this WAL sender process
# address (LABEL)
# IP address of the client connected to this WAL sender, localhost for unix socket
# pid (LABEL)
# Process ID of the WAL sender process
# client_port (GAUGE)
# TCP port number that the client is using for communication with this WAL sender, or -1 if a Unix socket is used
# state (GAUGE)
# Current WAL sender encoded state 0-4 for streaming|startup|catchup|backup|stopping
# sync_state (GAUGE)
# Encoded synchronous state of this standby server, 0-3 for async|potential|sync|quorum
# sync_priority (GAUGE)
# Priority of this standby server for being chosen as the synchronous standby
# backend_xmin (COUNTER)
# This standby's xmin horizon reported by hot_standby_feedback.
# lsn (COUNTER)
# Current log position on this server
# sent_diff (GAUGE)
# Last log position sent to this standby server diff with current lsn
# write_diff (GAUGE)
# Last log position written to disk by this standby server diff with current lsn
# flush_diff (GAUGE)
# Last log position flushed to disk by this standby server diff with current lsn
# replay_diff (GAUGE)
# Last log position replayed into the database on this standby server diff with current lsn
# sent_lsn (COUNTER)
# Last write-ahead log location sent on this connection
# write_lsn (COUNTER)
# Last write-ahead log location written to disk by this standby server
# flush_lsn (COUNTER)
# Last write-ahead log location flushed to disk by this standby server
# replay_lsn (COUNTER)
# Last write-ahead log location replayed into the database on this standby server
# write_lag (GAUGE)
# Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it
# flush_lag (GAUGE)
# Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it
# replay_lag (GAUGE)
# Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it
# time (COUNTER)
# Current timestamp in unix epoch
# launch_time (COUNTER)
# Time when this process was started, i.e., when the client connected to this WAL sender
# reply_time (GAUGE)
# Send time of last reply message received from standby server
#
pg_repl_12:
name: pg_repl
desc: PostgreSQL replication stat metrics 12+
query: |
SELECT application_name AS appname, usename, coalesce(client_addr::TEXT,'localhost') AS address, pid::TEXT, client_port,
CASE state WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state,
CASE sync_state WHEN 'async' THEN 0 WHEN 'potential' THEN 1 WHEN 'sync' THEN 2 WHEN 'quorum' THEN 3 ELSE -1 END AS sync_state,
sync_priority, backend_xmin::TEXT::BIGINT AS backend_xmin, current.lsn - '0/0' AS lsn,
current.lsn - sent_lsn AS sent_diff, current.lsn - write_lsn AS write_diff, current.lsn - flush_lsn AS flush_diff, current.lsn - replay_lsn AS replay_diff,
sent_lsn - '0/0' AS sent_lsn, write_lsn - '0/0' AS write_lsn, flush_lsn - '0/0' AS flush_lsn, replay_lsn - '0/0' AS replay_lsn,
coalesce(extract(EPOCH FROM write_lag), 0) AS write_lag, coalesce(extract(EPOCH FROM flush_lag), 0) AS flush_lag, coalesce(extract(EPOCH FROM replay_lag), 0) AS replay_lag,
extract(EPOCH FROM current_timestamp) AS "time", extract(EPOCH FROM backend_start) AS launch_time, extract(EPOCH FROM reply_time) AS reply_time
FROM pg_stat_replication, (SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END AS lsn) current;
ttl: 10
min_version: 120000
tags:
- cluster
metrics:
- appname:
usage: LABEL
description: Name of the application that is connected to this WAL sender
- usename:
usage: LABEL
description: Name of the user logged into this WAL sender process
- address:
usage: LABEL
description: IP address of the client connected to this WAL sender, localhost for unix socket
# IP address of the client connected to this WAL sender. If this field is null, it indicates that the client is connected via a Unix socket on the server machine.
- pid:
usage: LABEL
description: Process ID of the WAL sender process
- client_port:
usage: GAUGE
description: TCP port number that the client is using for communication with this WAL sender, or -1 if a Unix socket is used
- state:
usage: GAUGE
description: Current WAL sender encoded state 0-4 for streaming|startup|catchup|backup|stopping
# Current WAL sender state. Possible values are: streaming|startup|catchup|backup|stopping
- sync_state:
usage: GAUGE
description: Encoded synchronous state of this standby server, 0-3 for async|potential|sync|quorum
# Synchronous state of this standby server. Possible values are: async|potential|sync|quorum
- sync_priority:
usage: GAUGE
description: Priority of this standby server for being chosen as the synchronous standby
# Priority of this standby server for being chosen as the synchronous standby in a priority-based synchronous replication. This has no effect in a quorum-based synchronous replication.
- backend_xmin:
usage: COUNTER
description: This standby's xmin horizon reported by hot_standby_feedback.
- lsn:
usage: COUNTER
description: Current log position on this server
- sent_diff:
usage: GAUGE
description: Last log position sent to this standby server diff with current lsn
- write_diff:
usage: GAUGE
description: Last log position written to disk by this standby server diff with current lsn
- flush_diff:
usage: GAUGE
description: Last log position flushed to disk by this standby server diff with current lsn
- replay_diff:
usage: GAUGE
description: Last log position replayed into the database on this standby server diff with current lsn
- sent_lsn:
usage: COUNTER
description: Last write-ahead log location sent on this connection
- write_lsn:
usage: COUNTER
description: Last write-ahead log location written to disk by this standby server
- flush_lsn:
usage: COUNTER
description: Last write-ahead log location flushed to disk by this standby server
- replay_lsn:
usage: COUNTER
description: Last write-ahead log location replayed into the database on this standby server
- write_lag:
usage: GAUGE
description: Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it
# Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it (but not yet flushed it or applied it). This can be used to gauge the delay that synchronous_commit level remote_write incurred while committing if this server was configured as a synchronous standby.
- flush_lag:
usage: GAUGE
description: Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it
# Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it (but not yet applied it). This can be used to gauge the delay that synchronous_commit level on incurred while committing if this server was configured as a synchronous standby.
- replay_lag:
usage: GAUGE
description: Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it
# Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it. This can be used to gauge the delay that synchronous_commit level remote_apply incurred while committing if this server was configured as a synchronous standby.
- time:
usage: COUNTER
description: Current timestamp in unix epoch
- launch_time:
usage: COUNTER
description: Time when this process was started, i.e., when the client connected to this WAL sender
- reply_time:
usage: GAUGE
description: Send time of last reply message received from standby server
# Time when this process was started, i.e., when the client connected to this WAL sender
##
# SYNOPSIS
# pg_repl.pg_repl_10_11_*
#
# DESCRIPTION
# PostgreSQL replication stat metrics v10 v11
#
# OPTIONS
# Tags [cluster]
# TTL 10
# Priority 0
# Timeout 100ms
# Fatal false
# Version 100000 ~ 120000
# Source 210-pg_repl.yml
#
# METRICS
# appname (LABEL)
# Name of the application that is connected to this WAL sender
# usename (LABEL)
# Name of the user logged into this WAL sender process
# address (LABEL)
# IP address of the client connected to this WAL sender, localhost for unix socket
# pid (LABEL)
# Process ID of the WAL sender process
# client_port (GAUGE)
# TCP port number that the client is using for communication with this WAL sender, or -1 if a Unix socket is used
# state (GAUGE)
# Current WAL sender encoded state 0-4 for streaming|startup|catchup|backup|stopping
# sync_state (GAUGE)
# Encoded synchronous state of this standby server, 0-3 for async|potential|sync|quorum
# sync_priority (GAUGE)
# Priority of this standby server for being chosen as the synchronous standby
# backend_xmin (COUNTER)
# This standby's xmin horizon reported by hot_standby_feedback.
# lsn (COUNTER)
# Current log position on this server
# sent_diff (GAUGE)
# Last log position sent to this standby server diff with current lsn
# write_diff (GAUGE)
# Last log position written to disk by this standby server diff with current lsn
# flush_diff (GAUGE)
# Last log position flushed to disk by this standby server diff with current lsn
# replay_diff (GAUGE)
# Last log position replayed into the database on this standby server diff with current lsn
# sent_lsn (COUNTER)
# Last write-ahead log location sent on this connection
# write_lsn (COUNTER)