From 1ceab07bc97254dadb99956e6c5df9771bc53c77 Mon Sep 17 00:00:00 2001 From: Christophe Courtois Date: Wed, 30 Jan 2019 15:22:49 +0100 Subject: [PATCH] Release 2.4 --- CHANGELOG.md | 30 +-- README | 472 ++++++++++++++++++++++++++++-------------- README.pod | 392 ++++++++++++++++++++++++----------- RELEASING.md | 2 +- check_pgactivity | 4 +- check_pgactivity.spec | 7 +- 6 files changed, 609 insertions(+), 298 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1896fa14..b599b9df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,26 @@ Changelog ========= -2018-10-XX v2.4: +2019-01-30 v2.4: + - add a new uptime service + - add ability to filter by application_name in longest_query and oldest_idlexact service + - add minimal delta size to pgdump_backup service to avoid alert when backup grows small in size + - allow psql connections without providing connection arguments: + rely on the binary default behaviour and environment variables + - returns CRITICAL if connection fails for service `connection`, instead of UNKNOWN + - add documentation example for pgback in pgdump_service + - add documentation for archive_folder + - add information on necessary privileges for all services + - replication_slots service handle wal files and pg_replslots files separately + - take account of the new BRIN summarize state of autovacuum + - avoid warning for -dev versions in pga_version service + - ignore startup and backup replication states in service streaming_delta - fix handling or file reading errors in archive_folder service - fix wal magic number for version 10 - fix service stat_snapshot_age to output the correct age - fix archiver and replication_slots services to work properly on a standby node + - fix archiver to raise OK on a slave - fix is_replay_paused for PostgreSQL 10 - fix max_nb_wal calculation in wal_files service - fix uninitialized bug in hit_ratio when database do not yet have statistics @@ -14,17 +28,9 @@ Changelog - fix service 
sequences_exhausted to take account of sequence's minvalue - fix sequences_exhausted to take account of sequences only in the current db - fix exclude option in backends_status service - - ignore startup and backup replication states in service streaming_delta - - avoid warning for -dev versions in pga_version service - - add a new uptime service - - replication_slots service handle wal files and pg_replslots files separately - - add ability to filter by application_name in longest_query and oldest_idlexact service - - add minimal delta size to pgdump_backup service to avoid alert when backup grows small in size - - add documentation example for pgback in pgdump_service - - add documentation for archive_folder - - add privileges information for all services - - allow psql connections without providing connection arguments: rely on the binary default behaviour - - take account of the new BRIN summarize state of autovacuum + - fix archive_folder: timeline numbers are hexadecimal + - fix head levels in man page + - check for errors when saving status 2017-11-13 v2.3: - add complete support for PostgreSQL 10, including non-privileged monitoring diff --git a/README b/README index de4760c5..e2682567 100644 --- a/README +++ b/README @@ -1,12 +1,12 @@ -check_pgactivity +NAME check_pgactivity - PostgreSQL plugin for Nagios - SYNOPSIS +SYNOPSIS check_pgactivity {-w|--warning THRESHOLD} {-c|--critical THRESHOLD} [-s|--service SERVICE ] [-h|--host HOST] [-U|--username ROLE] [-p|--port PORT] [-d|--dbname DATABASE] [-S|--dbservice SERVICE_NAME] [-P|--psql PATH] [--debug] [--status-file FILE] [--path PATH] [-t|--timemout TIMEOUT] check_pgactivity [-l|--list] check_pgactivity [--help] - DESCRIPTION +DESCRIPTION check_pgactivity is designed to monitor PostgreSQL clusters from Nagios. It offers many options to measure and monitor useful performance metrics. @@ -17,40 +17,51 @@ check_pgactivity description list. 
-h, --host HOST - Database server host or socket directory (default: "localhost"). + Database server host or socket directory (default: $PGHOST or + "localhost"). + + See section "CONNECTIONS" for more information. -U, --username ROLE - Database user name (default: "postgres"). + Database user name (default: $PGUSER or "postgres"). + + See section "CONNECTIONS" for more information. -p, --port PORT - Database server port (default: "5432"). + Database server port (default: $PGPORT or "5432"). + + See section "CONNECTIONS" for more information. -d, --dbname DATABASE - Database name to connect to (default: "template1"). + Database name to connect to (default: $PGDATABASE or "template1"). WARNING! This is not necessarily one of the database that will be checked. See "--dbinclude" and "--dbexclude" . + See section "CONNECTIONS" for more information. + -S, --dbservice SERVICE_NAME The connection service name from pg_service.conf to use. + See section "CONNECTIONS" for more information. + --dbexclude REGEXP Some services automatically check all the databases of your cluster (note: that does not mean they always need to connect on all of them - to check them though). "--dbexclude" allows to exclude any database - whose name matches the given Perl regular expression. You can repeat - this option as many time as needed. + to check them though). "--dbexclude" excludes any database whose + name matches the given Perl regular expression. Repeat this option + as many times as needed. See "--dbinclude" as well. If a database match both dbexclude and dbinclude arguments, it is excluded. --dbinclude REGEXP Some services automatically check all the databases of your cluster - (note: that does not mean they always need to connect on all of them - to check them though). Some always exclude the 'postgres' database - and templates. "--dbinclude" allows to ONLY check databases whose - names match the given Perl regular expression. You can repeat this - option as many time as needed. 
+ (note: that does not imply that they always need to connect to all + of them though). Some always exclude the 'postgres' database and + templates. "--dbinclude" checks ONLY databases whose names match the + given Perl regular expression. Repeat this option as many time as + needed. See "--dbexclude" as well. If a database match both dbexclude and dbinclude arguments, it is excluded. @@ -83,9 +94,9 @@ check_pgactivity Path to the "psql" executable (default: "psql"). --status-file PATH - Path to the file where service status information will be kept - between successive calls. Default is to save check_pgactivity.data - in the same directory as the script. + Path to the file where service status information is kept between + successive calls. Default is to save check_pgactivity.data in the + same directory as the script. --dump-status-file Dump the content of the status file and exit. This is useful for @@ -97,10 +108,9 @@ check_pgactivity "check_pgactivity.out" in the same directory as the script. -t, --timeout TIMEOUT - Timeout to use (default: "30s"). It can be specified as raw (in - seconds) or as an interval. This timeout will be used as - "statement_timeout" for psql and URL timeout for "minor_version" - service. + Timeout (default: "30s"), as raw (in seconds) or as an interval. + This timeout will be used as "statement_timeout" for psql and URL + timeout for "minor_version" service. -l, --list List available services. @@ -120,7 +130,7 @@ check_pgactivity more formats (eg. a size and a percentage). Percentage - If threshold is a percentage, the value should end with a '%' (no + If THRESHOLD is a percentage, the value should end with a '%' (no space). For instance: 95%. Interval @@ -140,13 +150,13 @@ check_pgactivity CONNECTIONS check_pgactivity allows two different connection specifications: by - service, or by specifying values for host, user, port, and database. 
- Some services can run on multiple hosts, or needs to connect to multiple + service or by specifying values for host, user, port, and database. Some + services can run on multiple hosts, or needs to connect to multiple hosts. - You must specify one of the parameters below if the service needs to - connect to your PostgreSQL instance. In other words, check_pgactivity - will NOT look for the "libpq" environment variables. + You might specify one of the parameters below to connect to your + PostgreSQL instance. If you don't, no connection parameters are given to + psql: connection relies on binary defaults and environment. The format for connection parameters is: @@ -158,8 +168,9 @@ check_pgactivity Parameters "--host HOST", "--port PORT", "--user ROLE" or "--dbname DATABASE" - One of these parameters is enough to define a new host. If some - parameters are missing, default values are used. + One parameter is enough to define a new host. Usual environment + variables (PGHOST, PGPORT, PGDATABASE, PGUSER, PGSERVICE) or default + values are used for missing parameters. If multiple values are given, define as many host as maximum given values. @@ -183,8 +194,8 @@ check_pgactivity --dbservice s1 --host h1 --port 5433 - Means use "service=s1" and "host=h1 port=5433" in this order. If the - service supports only one host, the second is ignored. + means: use "service=s1" and "host=h1 port=5433" in this order. If + the service supports only one host, the second host is ignored. Mutual exclusion between both methods You can not overwrite services connections variables with parameters @@ -201,11 +212,14 @@ check_pgactivity one, for performance consideration. This service requires the argument "--path" on the command line to - specify the archive folder path to check. + specify the archive folder path to check. Obviously, it must have + access to this folder at the filesystem level: you may have to + execute it on the archiving server rather than on the PostgreSQL + instance. 
- Optional argument "--suffix" allows you define the suffix of your - archived WALs. Useful if they are compressed with an extension (eg. - .gz, .bz2, ...). Default is no suffix. + The optional argument "--suffix" defines the suffix of your archived + WALs; this is useful for compressed WALs (eg. .gz, .bz2, ...). + Default is no suffix. This service needs to read the header of one of the archives to define how many segments a WAL owns. Check_pgactivity automatically @@ -218,9 +232,8 @@ check_pgactivity unzip -qqp 7z x -so - If needed, you can provide your own command that writes the - uncompressed file to standard output by using the "--unarchiver" - argument. + If needed, provide your own command that writes the uncompressed + file to standard output with the "--unarchiver" argument. Optional argument "--ignore-wal-size" skips the WAL size check. This is useful if your archived WALs are compressed and check_pgactivity @@ -240,6 +253,9 @@ check_pgactivity Critical and Warning define the max age of the latest archived WAL as an interval (eg. 5m or 300s ). + Required privileges: unprivileged role; the system user needs read + access to archived WAL files. + Sample commands: check_pgactivity -s archive_folder --path /path/to/archives -w 15m -c 30m @@ -261,6 +277,8 @@ check_pgactivity archiver process did not archive the oldest waiting WAL to be archived since last call. + Required privileges: unprivileged role (10+); superuser (<10). + autovacuum (8.1+) Check the autovacuum activity on the cluster. @@ -270,6 +288,8 @@ check_pgactivity Thresholds, if any, are ignored. + Required privileges: unprivileged role. + backends (all) Check the total number of connections in the PostgreSQL cluster. @@ -280,6 +300,10 @@ check_pgactivity compared to the difference between the cluster parameters "max_connections" and "superuser_reserved_connections". 
+ Required privileges: an unprivileged user only sees its own queries; + a pg_monitor (10+) or superuser (<10) role is required to see all + queries. + backends_status (8.2+) Check the status of all backends. Depending on your PostgreSQL version, statuses are: "idle", "idle in transaction", "idle in @@ -289,7 +313,7 @@ check_pgactivity are not allowed to see the statuses of other connections. This service supports the argument "--exclude REGEX" to exclude - queries matching the given regular expression from the check. + queries matching the given regular expression. You can use multiple "--exclude REGEX" arguments. @@ -307,6 +331,10 @@ check_pgactivity Note that the number of backends reported in Nagios message includes excluded backends. + Required privileges: an unprivileged user only sees its own queries; + a pg_monitor (10+) or superuser (<10) role is required to see all + queries. + backup_label_age (8.1+) Check the age of the backup label file. @@ -316,18 +344,22 @@ check_pgactivity Critical and Warning thresholds only accept an interval (eg. 1h30m25s). + Required privileges: unprivileged role (9.3+); superuser (<9.3) + bgwriter (8.3+) Check the percentage of pages written by backends since last check. This service uses the status file (see "--status-file" parameter). Perfdata contains the ratio per second for each "pg_stat_bgwriter" - counters since last execution. Units Nps for checkpoints, max - written clean and fsyncs are the number of "events" per second. + counter since last execution. Units Nps for checkpoints, max written + clean and fsyncs are the number of "events" per second. Critical and Warning thresholds are optional. If set, they *only* accept a percentage. + Required privileges: unprivileged role. + btree_bloat Estimate bloat on B-tree indexes. @@ -343,26 +375,27 @@ check_pgactivity excluded. It also supports a "--exclude REGEX" parameter to exclude relations - matching the given regular expression. 
The regular expression - applies to "database.schema_name.relation_name". This allows you to - filter either on a relation name for all schemas and databases, - filter on a qualified named relation (schema + relation) for all - databases or filter on a qualified named relation in only one - database. + matching a regular expression. The regular expression applies to + "database.schema_name.relation_name". This enables you to filter + either on a relation name for all schemas and databases, on a + qualified named relation (schema + relation) for all databases or on + a qualified named relation in only one database. You can use multiple "--exclude REGEX" parameters. Perfdata will return the number of indexes of concern, by warning and critical threshold per database. - A list of the bloated indexes detail will be returned after the - perfdata. This list contains the fully qualified bloated index name, - the estimated bloat size, the index size and the bloat percentage. + A list of the bloated indexes will be returned after the perfdata. + This list contains the fully qualified bloated index name, the + estimated bloat size, the index size and the bloat percentage. - This service will work with PostgreSQL 10+ without superuser - privileges if you grant SELECT on table pg_statistic to the - pg_monitor role, in each database of the cluster : "GRANT SELECT ON - pg_statistic TO pg_monitor;" + Required privileges: superuser (<10) able to log in all databases, + or at least those in "--dbinclude"; superuser (<10); on PostgreSQL + 10+, a user with the role pg_monitor suffices, provided that you + grant SELECT on the system table pg_statistic to the pg_monitor + role, in each database of the cluster: "GRANT SELECT ON pg_statistic + TO pg_monitor;" commit_ratio (all) Check the commit and rollback rate per second since last call. @@ -379,6 +412,8 @@ check_pgactivity each database. 
Warning or critical will be raised if the reported value is greater than rollbacks, rollback_rate or rollback_ratio. + Required privileges: unprivileged role. + configuration (8.0+) Check the most important settings. @@ -389,19 +424,23 @@ check_pgactivity "--effective_cache_size", "--no_check_autovacuum", "--no_check_fsync", "--no_check_enable", "--no_check_track_counts". + Required privileges: unprivileged role. + connection (all) Perform a simple connection test. No perfdata is returned. - This service ignore critical and warning arguments. + This service ignores critical and warning arguments. + + Required privileges: unprivileged role. custom_query (all) Perform the given user query. - The query is specified with the "--query" parameter. The first - column will be used to perform the test for the status if warning - and critical are provided. + Specify the query with "--query". The first column will be used to + perform the test for the status if warning and critical are + provided. The warning and critical arguments are optional. They can be of format integer (default), size or time depending on the "--type" @@ -416,6 +455,8 @@ check_pgactivity SELECT pg_database_size('postgres'), pg_database_size('postgres')||'B' AS db_size + Required privileges: unprivileged role (depends on the query). + database_size (8.1+) Check the variation of database sizes, and return the size of every databases. @@ -432,6 +473,8 @@ check_pgactivity This service supports both "--dbexclude" and "--dbinclude" parameters. + Required privileges: unprivileged role. + hit_ratio (all) Check the cache hit ratio on the cluster. @@ -447,6 +490,8 @@ check_pgactivity This service supports both "--dbexclude" and "--dbinclude" parameters. + Required privileges: unprivileged role. + hot_standby_delta (9.0) Check the data delta between a cluster and its hot standbys. @@ -461,9 +506,11 @@ check_pgactivity first one applies to received data, the second one to replayed ones. 
These thresholds only accept a size (eg. 2.5G). - This service raise a Critical if it doesn't find exactly ONE valid + This service raises a Critical if it doesn't find exactly ONE valid master cluster (ie. critical when 0 or 2 and more masters). + Required privileges: unprivileged role. + is_hot_standby (9.0+) Checks if the cluster is in recovery and accepts read only queries. @@ -471,6 +518,8 @@ check_pgactivity No perfdata is returned. + Required privileges: unprivileged role. + is_master (all) Checks if the cluster accepts read and/or write queries. This state is reported as "in production" by pg_controldata. @@ -479,8 +528,10 @@ check_pgactivity No perfdata is returned. + Required privileges: unprivileged role. + invalid_indexes - Check if there is any invalid indexes in a database. + Check if there are invalid indexes in a database. A critical alert is raised if an invalid index is detected. @@ -489,20 +540,21 @@ check_pgactivity excluded. This service supports a "--exclude REGEX" parameter to exclude - indexes matching the given regular expression. The regular - expression applies to "database.schema_name.index_name". This allows - you to filter either on a relation name for all schemas and - databases, filter on a qualified named index (schema + index) for - all databases or filter on a qualified named index in only one - database. + indexes matching a regular expression. The regular expression + applies to "database.schema_name.index_name". This enables you to + filter either on a relation name for all schemas and databases, on a + qualified named index (schema + index) for all databases or on a + qualified named index in only one database. You can use multiple "--exclude REGEX" parameters. Perfdata will return the number of invalid indexes per database. - A list of invalid indexes detail will be returned after the - perfdata. This list contains the fully qualified index name. If - excluded index is set, the number of exclude indexes is returned. 
+ A list of invalid indexes will be returned after the perfdata. This + list contains the fully qualified index name. If excluded index is + set, the number of exclude indexes is returned. + + Required privileges: unprivileged role able to log in all databases. is_replay_paused (9.1+) Checks if the replication is paused. The service will return UNKNOWN @@ -521,6 +573,8 @@ check_pgactivity Perfdata returned: * paused status (0 no, 1 yes, NaN if master) * lag time (in second) * data delta with master (0 no, 1 yes) + Required privileges: unprivileged role. + last_analyze (8.2+) Check on each databases that the oldest "analyze" (from autovacuum or not) is not older than the given threshold. @@ -539,6 +593,8 @@ check_pgactivity parameters. The 'postgres' database and templates are always excluded. + Required privileges: unprivileged role able to log in all databases. + last_vacuum (8.2+) Check that the oldest vacuum (from autovacuum or otherwise) in each database in the cluster is not older than the given threshold. @@ -557,6 +613,8 @@ check_pgactivity parameters. The 'postgres' database and templates are always excluded. + Required privileges: unprivileged role able to log in all databases. + locks (all) Check the number of locks on the hosts. @@ -577,6 +635,8 @@ check_pgactivity max_locks_per_transaction * (max_connections + max_prepared_transactions) or max_pred_locks_per_transaction * (max_connections + max_prepared_transactions) + Required privileges: unprivileged role. + longest_query (all) Check the longest running query in the cluster. @@ -591,65 +651,74 @@ check_pgactivity It also supports argument "--exclude REGEX" to exclude queries matching the given regular expression from the check. + Above 9.0, it also supports "--exclude REGEX" to filter out + application_name. + You can use multiple "--exclude REGEX" parameters. 
+ Required privileges: an unprivileged role only checks its own + queries; a pg_monitor (10+) or superuser (<10) role is required to + check all queries. + max_freeze_age (all) Checks oldest database by transaction age. Critical and Warning thresholds are optional. They accept either a raw number or percentage for PostgreSQL 8.2 and more. If percentage is given, the thresholds are computed based on the - "autovacuum_freeze_max_age" parameter. 100% means some table(s) + "autovacuum_freeze_max_age" parameter. 100% means that some table(s) reached the maximum age and will trigger an autovacuum freeze. Percentage thresholds should therefore be greater than 100%. Even with no threshold, this service will raise a critical alert if - one database has a negative age. + a database has a negative age. - Perfdata return the age of each database. + Perfdata returns the age of each database. This service supports both "--dbexclude" and "--dbinclude" parameters. + Required privileges: unprivileged role. + minor_version (all) Check if the cluster is running the most recent minor version of PostgreSQL. - Latest version of PostgreSQL can be fetched from PostgreSQL official - website if check_pgactivity can access it, or is given as a - parameter. + Latest versions of PostgreSQL can be fetched from PostgreSQL + official website if check_pgactivity has access to it, or must be + given as a parameter. Without "--critical" or "--warning" parameters, this service - attempts to fetch the latest version online. You can optionally set - the path to your prefered program using the parameter "--path" (eg. - "--path '/usr/bin/wget'"). Supported programs are: GET, wget, curl, - fetch, lynx, links, links2. + attempts to fetch the latest version numbers online. A critical + alert is raised if the minor version is not the most recent. - For the online version, a critical alert is raised if the minor - version is not the most recent. 
+ You can optionally set the path to your preferred retrieval tool + using the "--path" parameter (eg. "--path '/usr/bin/wget'"). + Supported programs are: GET, wget, curl, fetch, lynx, links, links2. - If you do not want to (or cannot) query the PostgreSQL website, you - must provide the expected version using either "--warning" OR - "--critical". The given format must be one or more MINOR versions - seperated by anything but a '.'. + If you do not want to (or cannot) query the PostgreSQL website, + provide the expected versions using either "--warning" OR + "--critical", depending on which return value you want to raise. - For instance, the following parameters are all equivalent: + The given string must contain one or more MINOR versions separated + by anything but a '.'. For instance, the following parameters are + all equivalent: --critical "10.1 9.6.6 9.5.10 9.4.15 9.3.20 9.2.24 9.1.24 9.0.23 8.4.22" --critical "10.1, 9.6.6, 9.5.10, 9.4.15, 9.3.20, 9.2.24, 9.1.24, 9.0.23, 8.4.22" --critical "10.1,9.6.6,9.5.10,9.4.15,9.3.20,9.2.24,9.1.24,9.0.23,8.4.22" --critical "10.1/9.6.6/9.5.10/9.4.15/9.3.20/9.2.24/9.1.24/9.0.23/8.4.22" - Any value other than 3 numbers separated by dots (before version + Any other value than 3 numbers separated by dots (before version 10.x) or 2 numbers separated by dots (version 10 and above) will be ignored. If the running PostgreSQL major version is not found, the service raises an unknown status. - Using the offline version raises either a critical or a warning - depending on which one has been set. - Perfdata returns the numerical version of PostgreSQL. + Required privileges: unprivileged role; access to + http://www.postgresql.org required to download version numbers. + oldest_2pc (8.1+) Check the oldest *two-phase commit transaction* (aka. prepared transaction) in the cluster. @@ -659,6 +728,8 @@ check_pgactivity Critical and Warning thresholds only accept an interval. + Required privileges: unprivileged role. 
+ oldest_idlexact (8.3+) Check the oldest *idle* transaction. @@ -670,15 +741,23 @@ check_pgactivity This service supports both "--dbexclude" and "--dbinclude" parameters. + Above 9.2, it supports "--exclude" to filter out connections. Eg., + to filter out pg_dump and pg_dumpall, set this to + 'pg_dump,pg_dumpall'. + + Required privileges: an unprivileged role checks only its own + queries; a pg_monitor (10+) or superuser (<10) role is required to + check all queries. + pg_dump_backup Check the age and size of backups. This service uses the status file (see "--status-file" parameter). The "--path" argument contains the location to the backup folder. - The supported format is a glob pattern to match every folder or file - you need to check. If appropriate, the probe should be run as a user - with sufficient privileges to check for the existence of files. + The supported format is a glob pattern matching every folder or file + that you need to check. If appropriate, the probe should be run as a + user with sufficient privileges to check for the existence of files. The "--pattern" is required, and must contain a regular expression matching the backup file name, extracting the database name from the @@ -694,55 +773,82 @@ check_pgactivity Optionally, a "--global-pattern" option can be supplied to check for an additional global file. + Tip : For compatibility with pg_back, you should use "--path" + '/path/*{dump,sql}' "--pattern" '(\w+)_[0-9-_]+.dump' + "--global-pattern" 'pg_global_[0-9-_]+.sql' + The "--critical" and "--warning" thresholds are optional. They accept a list of 'metric=value' separated by a comma. Available - metric are "oldest" and "newest", respectively the age of the oldest - and newest backups, and "size", which must be the maximum variation - of size since the last check, expressed as a size or a percentage. 
+ metrics are "oldest" and "newest", respectively the age of the + oldest and newest backups, and "size", which must be the maximum + variation of size since the last check, expressed as a size or a + percentage. "mindeltasize", expressed in B, is the minimum variation + of size needed to raise an alert. - This service supports the arguments "--dbinclude" and "--dbexclude", + This service supports the "--dbinclude" and "--dbexclude" arguments, to respectively test for the presence of include or exclude files. - The argument "--exclude" allows to exclude files younger than the - given interval. This is useful to ignore files from a backup in + The argument "--exclude" enables you to exclude files younger than + an interval. This is useful to ignore files from a backup in progress. Eg., if your backup process takes 2h, set this to '125m'. Perfdata returns the age of the oldest and newest backups, as well as the size of the newest backups. + Required privileges: unprivileged role; the system user needs read + access on the directory containing the dumps (but not on the dumps + themselves). + pga_version - Checks if this script is running the given version of + Check if this script is running the given version of check_pgactivity. You must provide the expected version using either "--warning" OR "--critical". No perfdata is returned. + Required privileges: none. + pgdata_permission (8.2+) - Check that the data directory of the instance has 700 as permission, - and belongs to the system user running postgresql currently. + Check that the instance data directory rights are 700, and belongs + to the system user currently running postgresql. - Checking permission works on all Unix systems. + The check on rights works on all Unix systems. - Checking user works only in Linux systems (it uses /proc to not add - dependencies). Before 9.3, you need to give the expected owner using - the "--uid" argument. Without this argument, the owner will not be - checked. 
+ Checking the user only works on Linux systems (it uses /proc to + avoid dependencies). Before 9.3, you need to provide the expected + owner using the "--uid" argument, or the owner will not be checked. - It has to be executed locally on the monitored server. + Required privileges: <11: superuser; v11: user with pg_monitor or + pg_read_all_settings. The system user must also be able to read the + folder containing PGDATA: the service has to be executed locally on + the monitored server. replication_slots (9.4+) - Check the number of WAL files retained by each replication slots. + Check the number of WAL files and pg_replslot files retained by each + replication slot. + + Perfdata returns the number of WAL and pg_replslot files that each + replication slot has to keep. This service needs superuser + privileges since v10 to obtain pg_replslot files. Otherwise, + replslot_files will be at 0. + + Critical and Warning thresholds are optional. They accept either a + raw number (for backward compatibility, only wal threshold will be + used) or a list 'wal=value' and 'replslot=value'. Respectively + number of kept wal files or number of files in pg_replslot for each + slot. - Perfdata returns the number of WAL that each replication slot has to - keep. + Required privileges: <10: unprivileged role; v10: unprivileged role, + or superuser to monitor logical replication; v11: unprivileged user + with GRANT EXECUTE on function pg_ls_dir(text). - Critical and Warning thresholds are optional. If provided, the - number of WAL kept by each replication slot will be compared to the - threshold. These thresholds only accept a raw number. + Here is an example: + + -w 'wal=50,replslot=20' -c 'wal=100,replslot=40' settings (9.0+) - Check if the settings changed compared to the known ones from last - call of this service. + Check if the current settings have changed since they were stored in + the service file. The "known" settings are recorded during the very first call of the service. 
To update the known settings after a configuration change, @@ -752,29 +858,37 @@ check_pgactivity Critical and Warning thresholds are ignored. - A CRITICAL is raised if at least one parameter changed. + A Critical is raised if at least one parameter changed. + + Required privileges: unprivileged role. sequences_exhausted (7.4+) - Check all sequences assigned to a column (the smallserial,serial and - bigserial types), and raise an alarm if the column or sequences gets - too close to its maximum value. + Check all sequences assigned to a column (the smallserial, serial + and bigserial types), and raise an alarm if the column or sequences + gets too close to the maximum value. - Perfdata returns the sequence(s) that may have trigger the alert. + Perfdata returns the sequences that trigger the alert. - The 'postgres' database and templates are always excluded. + This service supports both "--dbexclude" and "--dbinclude" + parameters. The 'postgres' database and templates are always + excluded. Critical and Warning thresholds accept a percentage of the sequence filled. + Required privileges: unprivileged role able to log in all databases + stat_snapshot_age (9.5+) Check the age of the statistics snapshot (statistics collector's - statistics). This probe help to detect a frozen stats collector + statistics). This probe helps to detect a frozen stats collector process. Perfdata returns the statistics snapshot age. Critical and Warning thresholds accept a raw number of seconds. + Required privileges: unprivileged role. + streaming_delta (9.1+) Check the data delta between a cluster and its standbys in streaming replication. @@ -783,16 +897,16 @@ check_pgactivity MUST be connected. This argument can be used as many times as desired to check multiple slave connections, or you can specify multiple slaves connections at one time, using comma separated - values. Both methods can be used in a single call. The given values - must be of the form "APPLICATION_NAME IP". 
Either of the two - following examples will check for the presence of two slaves: + values. Both methods can be used in a single call. The provided + values must be of the form "APPLICATION_NAME IP". Both following + examples will check for the presence of two slaves: --slave 'slave1 192.168.1.11' --slave 'slave2 192.168.1.12' --slave 'slave1 192.168.1.11','slave2 192.168.1.12' This service supports a "--exclude REGEX" parameter to exclude every - result matching the given regular expression on application_name or - address ip fields. + result matching a regular expression on application_name or IP + address fields. You can use multiple "--exclude REGEX" parameters. @@ -806,6 +920,8 @@ check_pgactivity supplied, the first one applies to flushed data, the second one to replayed data. These thresholds only accept a size (eg. 2.5G). + Required privileges: unprivileged role. + table_unlogged (9.5+) Check if tables are changed to unlogged. In 9.5, you can switch between logged and unlogged. @@ -820,20 +936,22 @@ check_pgactivity excluded. This service supports a "--exclude REGEX" parameter to exclude - relations matching the given regular expression. The regular - expression applies to "database.schema_name.relation_name". This - allows you to filter either on a relation name for all schemas and - databases, filter on a qualified named relation (schema + relation) - for all databases or filter on a qualified named relation in only - one database. + relations matching a regular expression. The regular expression + applies to "database.schema_name.relation_name". This enables you to + filter either on a relation name for all schemas and databases, on a + qualified named relation (schema + relation) for all databases or on + a qualified named relation in only one database. You can use multiple "--exclude REGEX" parameters. Perfdata will return the number of unlogged tables per database. - A list of the unlogged tables detail will be returned after the - perfdata. 
This list contains the fully qualified table name. If - "--exclude REGEX" is set, the number of excluded tables is returned. + A list of the unlogged tables will be returned after the perfdata. + This list contains the fully qualified table name. If "--exclude + REGEX" is set, the number of excluded tables is returned. + + Required privileges: unprivileged role able to log in all databases, + or at least those in "--dbinclude". table_bloat Estimate bloat on tables. @@ -852,27 +970,28 @@ check_pgactivity This service supports a "--exclude REGEX" parameter to exclude relations matching the given regular expression. The regular expression applies to "database.schema_name.relation_name". This - allows you to filter either on a relation name for all schemas and - databases, filter on a qualified named relation (schema + relation) - for all databases or filter on a qualified named relation in only - one database. + enables you to filter either on a relation name for all schemas and + databases, on a qualified named relation (schema + relation) for all + databases or on a qualified named relation in only one database. You can use multiple "--exclude REGEX" parameters. Warning: With a non-superuser role, this service can only check the - tables the given role is granted to read! + tables that the given role is granted to read! Perfdata will return the number of tables matching the warning and critical thresholds, per database. - A list of the bloated tables detail will be returned after the - perfdata. This list contains the fully qualified bloated table name, - the estimated bloat size, the table size and the bloat percentage. + A list of the bloated tables will be returned after the perfdata. + This list contains the fully qualified bloated table name, the + estimated bloat size, the table size and the bloat percentage. 
- This service will work with PostgreSQL 10+ without superuser - privileges if you grant SELECT on table pg_statistic to the - pg_monitor role, in each database of the cluster : "GRANT SELECT ON - pg_statistic TO pg_monitor;" + Required privileges: superuser (<10) able to log in all databases, + or at least those in "--dbinclude"; superuser (<10); on PostgreSQL + 10+, a user with the role pg_monitor suffices, provided that you + grant SELECT on the system table pg_statistic to the pg_monitor + role, in each database of the cluster: "GRANT SELECT ON pg_statistic + TO pg_monitor;" temp_files (8.1+) Check the number and size of temp files. @@ -893,19 +1012,50 @@ check_pgactivity number of file (raw value), a size (unit is mandatory to define a size) or both values separated by a comma. - Threshols applied on current temp files being created AND the + Thresholds are applied on current temp files being created AND the number/size of temp files created since last execution. - This service works with PostgreSQL 10+ without superuser privileges - but it will not monitor live temp files. + Required privileges: <10: superuser v10: an unprivileged role is + possible but it will not monitor databases that it cannot access, + nor live temp files v11: an unprivileged role is possible but must + be granted EXECUTE on functions pg_ls_dir(text), pg_read_file(text), + pg_stat_file(text); the same restrictions than on v10 will still + apply + + uptime (8.1+) + Returns time since postmaster start ("uptime", from 8.1), since + configuration reload (from 8.4), and since shared memory + initialization (from 10). + + Please note that the uptime is unaffected when the postmaster resets + all its children (for example after a kill -9 on a process or a + failure). + + From 10+, the 'time since shared memory init' aims at detecting this + situation: in fact we use the age of the oldest non-client child + process (usually checkpointer, writer or startup). 
This needs + pg_monitor access to read pg_stat_activity. + + Critical and Warning thresholds are optional. If both are set, + Critical is raised when the postmaster uptime or the time since + shared memory initialization is less than the critical threshold. + Warning is raised when the time since configuration reload is less + than the warning threshold. If only a warning or critical threshold + is given, it will be used for both cases. Obviously these alerts + will disappear by themselves once enough time has passed. + + Perfdata contains the three values (when available). + + Required privileges: pg_monitor on PG10+; otherwise unprivileged + role. wal_files (8.1+) Check the number of WAL files. Perfdata returns the total number of WAL files, current number of written WAL, the current number of recycled WAL, the rate of WAL - written to disk since last execution on master clusters and the - current timeline. + written to disk since the last execution on the master cluster and + the current timeline. Critical and Warning thresholds accept either a raw number of files or a percentage. In case of percentage, the limit is computed based @@ -928,6 +1078,10 @@ check_pgactivity 100% = max_wal_size (as a number of WAL) + wal_keep_segments (if set) + Required privileges: <10:superuser (<10) v10:unprivileged user with + pg_monitor v11:unprivileged user with pg_monitor, or with grant + EXECUTE on function pg_ls_waldir + EXAMPLES Execute service "last_vacuum" on host "host=localhost port=5432": check_pgactivity -h localhost -p 5432 -s last_vacuum -w 30m -c 1h30m @@ -948,14 +1102,14 @@ check_pgactivity databases matching the regexp "importantone": check_pgactivity -p 5433 -h slave --service hit_ratio --dbinclude importantone -w 90% -c 80% - VERSION - check_pgactivity version 2.3, released on Mon Nov 13 2017. +VERSION + check_pgactivity version 2.4, released on Wed Jan 30 2019 - LICENSING +LICENSING This program is open source, licensed under the PostgreSQL license.
For license terms, see the LICENSE provided with the sources. - AUTHORS +AUTHORS Author: Open PostgreSQL Monitoring Development Group Copyright: (C) 2012-2018 Open PostgreSQL Monitoring Development Group diff --git a/README.pod b/README.pod index db52227e..739ca246 100644 --- a/README.pod +++ b/README.pod @@ -1,14 +1,14 @@ -=head1 check_pgactivity +=head1 NAME check_pgactivity - PostgreSQL plugin for Nagios -=head2 SYNOPSIS +=head1 SYNOPSIS check_pgactivity {-w|--warning THRESHOLD} {-c|--critical THRESHOLD} [-s|--service SERVICE ] [-h|--host HOST] [-U|--username ROLE] [-p|--port PORT] [-d|--dbname DATABASE] [-S|--dbservice SERVICE_NAME] [-P|--psql PATH] [--debug] [--status-file FILE] [--path PATH] [-t|--timemout TIMEOUT] check_pgactivity [-l|--list] check_pgactivity [--help] -=head2 DESCRIPTION +=head1 DESCRIPTION check_pgactivity is designed to monitor PostgreSQL clusters from Nagios. It offers many options to measure and monitor useful performance metrics. @@ -23,34 +23,44 @@ list. =item B<-h>, B<--host> HOST -Database server host or socket directory (default: "localhost"). +Database server host or socket directory (default: $PGHOST or "localhost") + +See section C for more informations. =item B<-U>, B<--username> ROLE -Database user name (default: "postgres"). +Database user name (default: $PGUSER or "postgres"). + +See section C for more informations. =item B<-p>, B<--port> PORT -Database server port (default: "5432"). +Database server port (default: $PGPORT or "5432"). + +See section C for more informations. =item B<-d>, B<--dbname> DATABASE -Database name to connect to (default: "template1"). +Database name to connect to (default: $PGDATABASE or "template1"). B! This is not necessarily one of the database that will be checked. See C<--dbinclude> and C<--dbexclude> . +See section C for more informations. + =item B<-S>, B<--dbservice> SERVICE_NAME The connection service name from pg_service.conf to use. +See section C for more informations. 
+ =item B<--dbexclude> REGEXP Some services automatically check all the databases of your cluster (note: that does not mean they always need to connect on all -of them to check them though). C<--dbexclude> allows to exclude any -database whose name matches the given Perl regular expression. You -can repeat this option as many time as needed. +of them to check them though). C<--dbexclude> excludes any +database whose name matches the given Perl regular expression. +Repeat this option as many time as needed. See C<--dbinclude> as well. If a database match both dbexclude and dbinclude arguments, it is excluded. @@ -58,11 +68,11 @@ dbinclude arguments, it is excluded. =item B<--dbinclude> REGEXP Some services automatically check all the databases of your -cluster (note: that does not mean they always need to connect on all -of them to check them though). Some always exclude the 'postgres' -database and templates. C<--dbinclude> allows to B check -databases whose names match the given Perl regular expression. You -can repeat this option as many time as needed. +cluster (note: that does not imply that they always need to connect to all +of them though). Some always exclude the 'postgres' +database and templates. C<--dbinclude> checks B +databases whose names match the given Perl regular expression. +Repeat this option as many time as needed. See C<--dbexclude> as well. If a database match both dbexclude and dbinclude arguments, it is excluded. @@ -99,7 +109,7 @@ Path to the C executable (default: "psql"). =item B<--status-file> PATH -Path to the file where service status information will be kept between +Path to the file where service status information is kept between successive calls. Default is to save check_pgactivity.data in the same directory as the script. @@ -115,7 +125,7 @@ C in the same directory as the script. =item B<-t>, B<--timeout> TIMEOUT -Timeout to use (default: "30s"). 
It can be specified as raw (in seconds) or as +Timeout (default: "30s"), as raw (in seconds) or as an interval. This timeout will be used as C for psql and URL timeout for C service. @@ -147,7 +157,7 @@ formats (eg. a size and a percentage). =item B -If threshold is a percentage, the value should end with a '%' (no space). +If THRESHOLD is a percentage, the value should end with a '%' (no space). For instance: 95%. =item B @@ -171,13 +181,13 @@ The factor between units is 1024 bytes. Eg. C<1g = 1G = 1024*1024*1024.> =head2 CONNECTIONS -check_pgactivity allows two different connection specifications: by service, or +check_pgactivity allows two different connection specifications: by service or by specifying values for host, user, port, and database. Some services can run on multiple hosts, or needs to connect to multiple hosts. -You must specify one of the parameters below if the service needs to connect -to your PostgreSQL instance. In other words, check_pgactivity will NOT look for -the C environment variables. +You might specify one of the parameters below to connect to your PostgreSQL instance. +If you don't, no connection parameters are given to psql: connection relies on binary +defaults and environment. The format for connection parameters is: @@ -192,8 +202,8 @@ listing multiple services separated by a comma. Eg. =item B C<--host HOST>, C<--port PORT>, C<--user ROLE> or C<--dbname DATABASE> -One of these parameters is enough to define a new host. If some -parameters are missing, default values are used. +One parameter is enough to define a new host. Usual environment variables (PGHOST, PGPORT, PGDATABASE, PGUSER, PGSERVICE) or default values +are used for missing parameters. If multiple values are given, define as many host as maximum given values. @@ -216,8 +226,8 @@ For instance: --dbservice s1 --host h1 --port 5433 -Means use "service=s1" and "host=h1 port=5433" in this order. If the service -supports only one host, the second is ignored. 
+means: use "service=s1" and "host=h1 port=5433" in this order. If the service +supports only one host, the second host is ignored. =item B @@ -240,10 +250,12 @@ files from ONE cluster. The version of PostgreSQL that created the archives is only checked on the last one, for performance consideration. This service requires the argument C<--path> on the command line to specify the -archive folder path to check. +archive folder path to check. Obviously, it must have access to this +folder at the filesystem level: you may have to execute it on the archiving +server rather than on the PostgreSQL instance. -Optional argument C<--suffix> allows you define the suffix of your archived -WALs. Useful if they are compressed with an extension (eg. .gz, .bz2, ...). +The optional argument C<--suffix> defines the suffix of your archived +WALs; this is useful for compressed WALs (eg. .gz, .bz2, ...). Default is no suffix. This service needs to read the header of one of the archives to define how many @@ -256,8 +268,8 @@ extensions .gz, .bz2, .xz, .zip or .7z using the following commands: unzip -qqp 7z x -so -If needed, you can provide your own command that writes the uncompressed file -to standard output by using the C<--unarchiver> argument. +If needed, provide your own command that writes the uncompressed file +to standard output with the C<--unarchiver> argument. Optional argument C<--ignore-wal-size> skips the WAL size check. This is useful if your archived WALs are compressed and check_pgactivity is unable to guess the @@ -276,6 +288,9 @@ one. Critical and Warning define the max age of the latest archived WAL as an interval (eg. 5m or 300s ). +Required privileges: unprivileged role; the system user needs read access +to archived WAL files. + Sample commands: check_pgactivity -s archive_folder --path /path/to/archives -w 15m -c 30m @@ -296,6 +311,8 @@ waiting to be archived. They only accept a raw number of files. 
Whatever the given threshold, a critical alert is raised if the archiver process did not archive the oldest waiting WAL to be archived since last call. +Required privileges: unprivileged role (10+); superuser (<10). + =item B (8.1+) Check the autovacuum activity on the cluster. @@ -305,6 +322,8 @@ by type (VACUUM, VACUUM ANALYZE, ANALYZE, VACUUM FREEZE). Thresholds, if any, are ignored. +Required privileges: unprivileged role. + =item B (all) Check the total number of connections in the PostgreSQL cluster. @@ -316,6 +335,9 @@ Critical and Warning thresholds accept either a raw number or a percentage (eg. between the cluster parameters C and C. +Required privileges: an unprivileged user only sees its own queries; +a pg_monitor (10+) or superuser (<10) role is required to see all queries. + =item B (8.2+) Check the status of all backends. Depending on your PostgreSQL version, @@ -326,7 +348,7 @@ B appears when you are not allowed to see the statuses of other connections. This service supports the argument C<--exclude REGEX> to exclude queries -matching the given regular expression from the check. +matching the given regular expression. You can use multiple C<--exclude REGEX> arguments. @@ -343,6 +365,9 @@ each of them, for 8.2+. Note that the number of backends reported in Nagios message B excluded backends. +Required privileges: an unprivileged user only sees its own queries; +a pg_monitor (10+) or superuser (<10) role is required to see all queries. + =item B (8.1+) Check the age of the backup label file. @@ -351,19 +376,23 @@ Perfdata returns the age of the backup_label file, -1 if not present. Critical and Warning thresholds only accept an interval (eg. 1h30m25s). +Required privileges: unprivileged role (9.3+); superuser (<9.3) + =item B (8.3+) Check the percentage of pages written by backends since last check. This service uses the status file (see C<--status-file> parameter). 
-Perfdata contains the ratio per second for each C counters +Perfdata contains the ratio per second for each C counter since last execution. Units Nps for checkpoints, max written clean and fsyncs are the number of "events" per second. Critical and Warning thresholds are optional. If set, they I accept a percentage. +Required privileges: unprivileged role. + =item B Estimate bloat on B-tree indexes. @@ -378,10 +407,10 @@ This service supports both C<--dbexclude> and C<--dbinclude> parameters. The 'postgres' database and templates are always excluded. It also supports a C<--exclude REGEX> parameter to exclude relations matching -the given regular expression. The regular expression applies to -"database.schema_name.relation_name". This allows you to filter either on a -relation name for all schemas and databases, filter on a qualified named relation -(schema + relation) for all databases or filter on a qualified named relation in +a regular expression. The regular expression applies to +"database.schema_name.relation_name". This enables you to filter either on a +relation name for all schemas and databases, on a qualified named relation +(schema + relation) for all databases or on a qualified named relation in only one database. You can use multiple C<--exclude REGEX> parameters. @@ -389,13 +418,16 @@ You can use multiple C<--exclude REGEX> parameters. Perfdata will return the number of indexes of concern, by warning and critical threshold per database. -A list of the bloated indexes detail will be returned after the +A list of the bloated indexes will be returned after the perfdata. This list contains the fully qualified bloated index name, the estimated bloat size, the index size and the bloat percentage. 
-This service will work with PostgreSQL 10+ without superuser privileges -if you grant SELECT on table pg_statistic to the pg_monitor role, in -each database of the cluster : C +Required privileges: superuser (<10) able to log in all databases, or at least +those in C<--dbinclude>; superuser (<10); +on PostgreSQL 10+, a user with the role pg_monitor suffices, +provided that you grant SELECT on the system table pg_statistic +to the pg_monitor role, in each database of the cluster: +C =item B (all) @@ -413,6 +445,8 @@ rollback rate and the rollback ratio of each database. Warning or critical will be raised if the reported value is greater than B, B or B. +Required privileges: unprivileged role. + =item B (8.0+) Check the most important settings. @@ -424,19 +458,23 @@ C<--work_mem>, C<--maintenance_work_mem>, C<--shared_buffers>,C<--wal_buffers>, C<--checkpoint_segments>, C<--effective_cache_size>, C<--no_check_autovacuum>, C<--no_check_fsync>, C<--no_check_enable>, C<--no_check_track_counts>. +Required privileges: unprivileged role. + =item B (all) Perform a simple connection test. No perfdata is returned. -This service ignore critical and warning arguments. +This service ignores critical and warning arguments. + +Required privileges: unprivileged role. =item B (all) Perform the given user query. -The query is specified with the C<--query> parameter. The first column will be +Specify the query with C<--query>. The first column will be used to perform the test for the status if warning and critical are provided. The warning and critical arguments are optional. They can be of format integer @@ -451,6 +489,8 @@ and its unit appended to it. You can add as many fields as needed. Eg.: SELECT pg_database_size('postgres'), pg_database_size('postgres')||'B' AS db_size +Required privileges: unprivileged role (depends on the query). + =item B (8.1+) B of database sizes, and B of every @@ -461,12 +501,14 @@ This service uses the status file (see C<--status-file> parameter). 
Perfdata contains the size of each database. Critical and Warning thresholds accept either a raw number, a percentage, or a -size (eg. 2.5G). They are applied on the size difference for each database +size (eg. 2.5G). They are applied on the size difference for each database since the last execution. The aim is to detect unexpected database size variation. This service supports both C<--dbexclude> and C<--dbinclude> parameters. +Required privileges: unprivileged role. + =item B (all) Check the cache hit ratio on the cluster. @@ -481,6 +523,8 @@ Critical and Warning thresholds are optional. They only accept a percentage. This service supports both C<--dbexclude> and C<--dbinclude> parameters. +Required privileges: unprivileged role. + =item B (9.0) Check the data delta between a cluster and its hot standbys. @@ -496,9 +540,11 @@ replayed data. If two values are given, the first one applies to received data, the second one to replayed ones. These thresholds only accept a size (eg. 2.5G). -This service raise a Critical if it doesn't find exactly ONE valid master +This service raises a Critical if it doesn't find exactly ONE valid master cluster (ie. critical when 0 or 2 and more masters). +Required privileges: unprivileged role. + =item B (9.0+) Checks if the cluster is in recovery and accepts read only queries. @@ -507,6 +553,8 @@ This service ignores critical and warning arguments. No perfdata is returned. +Required privileges: unprivileged role. + =item B (all) Checks if the cluster accepts read and/or write queries. This state is reported @@ -516,9 +564,11 @@ This service ignores critical and warning arguments. No perfdata is returned. +Required privileges: unprivileged role. + =item B -Check if there is any invalid indexes in a database. +Check if there are invalid indexes in a database. A critical alert is raised if an invalid index is detected. @@ -526,20 +576,22 @@ This service supports both C<--dbexclude> and C<--dbinclude> parameters.
The 'postgres' database and templates are always excluded. This service supports a C<--exclude REGEX> parameter to exclude indexes -matching the given regular expression. The regular expression applies to -"database.schema_name.index_name". This allows you to filter either on a -relation name for all schemas and databases, filter on a qualified named -index (schema + index) for all databases or filter on a qualified named +matching a regular expression. The regular expression applies to +"database.schema_name.index_name". This enables you to filter either on a +relation name for all schemas and databases, on a qualified named +index (schema + index) for all databases or on a qualified named index in only one database. You can use multiple C<--exclude REGEX> parameters. Perfdata will return the number of invalid indexes per database. -A list of invalid indexes detail will be returned after the +A list of invalid indexes will be returned after the perfdata. This list contains the fully qualified index name. If excluded index is set, the number of exclude indexes is returned. +Required privileges: unprivileged role able to log in all databases. + =item B (9.1+) Checks if the replication is paused. The service will return UNKNOWN if @@ -560,6 +612,8 @@ Perfdata returned: * lag time (in second) * data delta with master (0 no, 1 yes) +Required privileges: unprivileged role. + =item B (8.2+) Check on each databases that the oldest C (from autovacuum or not) is not @@ -578,6 +632,8 @@ and apply to the oldest execution of analyse. This service supports both C<--dbexclude> and C<--dbinclude> parameters. The 'postgres' database and templates are always excluded. +Required privileges: unprivileged role able to log in all databases. + =item B (8.2+) Check that the oldest vacuum (from autovacuum or otherwise) in each database @@ -596,6 +652,7 @@ and apply to the oldest vacuum. This service supports both C<--dbexclude> and C<--dbinclude> parameters. 
The 'postgres' database and templates are always excluded. +Required privileges: unprivileged role able to log in all databases. =item B (all) @@ -618,6 +675,8 @@ for 9.1+, regarding lockmode : max_locks_per_transaction * (max_connections + max_prepared_transactions) or max_pred_locks_per_transaction * (max_connections + max_prepared_transactions) +Required privileges: unprivileged role. + =item B (all) Check the longest running query in the cluster. @@ -632,8 +691,13 @@ This service supports both C<--dbexclude> and C<--dbinclude> parameters. It also supports argument C<--exclude REGEX> to exclude queries matching the given regular expression from the check. +Above 9.0, it also supports C<--exclude REGEX> to filter out application_name. + You can use multiple C<--exclude REGEX> parameters. +Required privileges: an unprivileged role only checks its own queries; +a pg_monitor (10+) or superuser (<10) role is required to check all queries. + =item B (all) Checks oldest database by transaction age. @@ -641,54 +705,55 @@ Checks oldest database by transaction age. Critical and Warning thresholds are optional. They accept either a raw number or percentage for PostgreSQL 8.2 and more. If percentage is given, the thresholds are computed based on the "autovacuum_freeze_max_age" parameter. -100% means some table(s) reached the maximum age and will trigger an autovacuum +100% means that some table(s) reached the maximum age and will trigger an autovacuum freeze. Percentage thresholds should therefore be greater than 100%. -Even with no threshold, this service will raise a critical alert if one database +Even with no threshold, this service will raise a critical alert if a database has a negative age. -Perfdata return the age of each database. +Perfdata returns the age of each database. This service supports both C<--dbexclude> and C<--dbinclude> parameters. +Required privileges: unprivileged role. 
+ =item B (all) Check if the cluster is running the most recent minor version of PostgreSQL. -Latest version of PostgreSQL can be fetched from PostgreSQL official -website if check_pgactivity can access it, or is given as a parameter. +Latest versions of PostgreSQL can be fetched from PostgreSQL official +website if check_pgactivity has access to it, or must be given as a parameter. Without C<--critical> or C<--warning> parameters, this service attempts -to fetch the latest version online. You can optionally set the path to -your prefered program using the parameter C<--path> (eg. -C<--path '/usr/bin/wget'>). Supported programs are: GET, wget, curl, -fetch, lynx, links, links2. +to fetch the latest version numbers online. A critical alert is raised if the +minor version is not the most recent. -For the online version, a critical alert is raised if the minor version is not -the most recent. +You can optionally set the path to your prefered retrieval tool using +the C<--path> parameter (eg. C<--path '/usr/bin/wget'>). Supported programs are: +GET, wget, curl, fetch, lynx, links, links2. -If you do not want to (or cannot) query the PostgreSQL website, you -must provide the expected version using either C<--warning> OR -C<--critical>. The given format must be one or more MINOR versions -seperated by anything but a '.'. +If you do not want to (or cannot) query the PostgreSQL website, +provide the expected versions using either C<--warning> OR +C<--critical>, depending on which return value you want to raise. -For instance, the following parameters are all equivalent: +The given string must contain one or more MINOR versions separated by anything +but a '.'. 
For instance, the following parameters are all equivalent: --critical "10.1 9.6.6 9.5.10 9.4.15 9.3.20 9.2.24 9.1.24 9.0.23 8.4.22" --critical "10.1, 9.6.6, 9.5.10, 9.4.15, 9.3.20, 9.2.24, 9.1.24, 9.0.23, 8.4.22" --critical "10.1,9.6.6,9.5.10,9.4.15,9.3.20,9.2.24,9.1.24,9.0.23,8.4.22" --critical "10.1/9.6.6/9.5.10/9.4.15/9.3.20/9.2.24/9.1.24/9.0.23/8.4.22" -Any value other than 3 numbers separated by dots (before version 10.x) +Any other value than 3 numbers separated by dots (before version 10.x) or 2 numbers separated by dots (version 10 and above) will be ignored. If the running PostgreSQL major version is not found, the service raises an unknown status. -Using the offline version raises either a critical or a warning depending -on which one has been set. - Perfdata returns the numerical version of PostgreSQL. +Required privileges: unprivileged role; access to http://www.postgresql.org +required to download version numbers. + =item B (8.1+) Check the oldest I (aka. prepared transaction) in @@ -699,6 +764,8 @@ transactions per databases. Critical and Warning thresholds only accept an interval. +Required privileges: unprivileged role. + =item B (8.3+) Check the oldest I transaction. @@ -710,6 +777,12 @@ Critical and Warning thresholds only accept an interval. This service supports both C<--dbexclude> and C<--dbinclude> parameters. +Above 9.2, it supports C<--exclude> to filter out connections. Eg., to +filter out pg_dump and pg_dumpall, set this to 'pg_dump,pg_dumpall'. + +Required privileges: an unprivileged role checks only its own queries; +a pg_monitor (10+) or superuser (<10) role is required to check all queries. + =item B Check the age and size of backups. @@ -717,7 +790,7 @@ Check the age and size of backups. This service uses the status file (see C<--status-file> parameter). The C<--path> argument contains the location to the backup folder. The supported -format is a glob pattern to match every folder or file you need to check. 
If +format is a glob pattern matching every folder or file that you need to check. If appropriate, the probe should be run as a user with sufficient privileges to check for the existence of files. @@ -735,57 +808,83 @@ the form: Optionally, a C<--global-pattern> option can be supplied to check for an additional global file. +Tip : For compatibility with pg_back, you should use + C<--path> '/path/*{dump,sql}' + C<--pattern> '(\w+)_[0-9-_]+.dump' + C<--global-pattern> 'pg_global_[0-9-_]+.sql' + The C<--critical> and C<--warning> thresholds are optional. They accept a list -of 'metric=value' separated by a comma. Available metric are C and +of 'metric=value' separated by a comma. Available metrics are C and C, respectively the age of the oldest and newest backups, and C, which must be the maximum variation of size since the last check, expressed -as a size or a percentage. +as a size or a percentage. C, expressed in B, is the minimum variation +of size needed to raise an alert. -This service supports the arguments C<--dbinclude> and C<--dbexclude>, to +This service supports the C<--dbinclude> and C<--dbexclude> arguments, to respectively test for the presence of include or exclude files. -The argument C<--exclude> allows to exclude files younger than the given +The argument C<--exclude> enables you to exclude files younger than an interval. This is useful to ignore files from a backup in progress. Eg., if your backup process takes 2h, set this to '125m'. Perfdata returns the age of the oldest and newest backups, as well as the size of the newest backups. +Required privileges: unprivileged role; the system user needs read access +on the directory containing the dumps (but not on the dumps themselves). + =item B -Checks if this script is running the given version of check_pgactivity. +Check if this script is running the given version of check_pgactivity. You must provide the expected version using either C<--warning> OR C<--critical>. No perfdata is returned. 
+Required privileges: none. + =item B (8.2+) -Check that the data directory of the instance has 700 as permission, and belongs -to the system user running postgresql currently. +Check that the rights of the instance data directory are 700, and that it belongs +to the system user currently running postgresql. -Checking permission works on all Unix systems. +The check on rights works on all Unix systems. -Checking user works only in Linux systems (it uses /proc to not add -dependencies). Before 9.3, you need to give the expected owner using the -C<--uid> argument. Without this argument, the owner will not be checked. +Checking the user only works on Linux systems (it uses /proc to avoid +dependencies). Before 9.3, you need to provide the expected owner using the +C<--uid> argument, or the owner will not be checked. -B +Required privileges: + <11: superuser + v11: user with pg_monitor or pg_read_all_settings +The system user must also be able to read the folder containing +PGDATA: B =item B (9.4+) -Check the number of WAL files retained by each replication slots. +Check the number of WAL files and pg_replslot files retained by each replication slot. + +Perfdata returns the number of WAL and pg_replslot files that each replication +slot has to keep. This service needs superuser privileges since v10 to obtain +pg_replslot files. Otherwise replslot_files will be at 0. + +Critical and Warning thresholds are optional. They accept either a raw number (for +backward compatibility, only wal threshold will be used) or a list 'wal=value' +and 'replslot=value', respectively the number of kept wal files and the number of files +in pg_replslot for each slot. + +Required privileges: + <10: unprivileged role + v10: unprivileged role, or superuser to monitor logical replication + v11: unprivileged user with GRANT EXECUTE on function pg_ls_dir(text) -Perfdata returns the number of WAL that each replication slot has to keep. +Here is an example: -Critical and Warning thresholds are optional.
If provided, the number of WAL -kept by each replication slot will be compared to the threshold. -These thresholds only accept a raw number. + -w 'wal=50,replslot=20' -c 'wal=100,replslot=40' =item B (9.0+) -Check if the settings changed compared to the known ones from last call of this -service. +Check if the current settings have changed since they were stored in the service file. The "known" settings are recorded during the very first call of the service. To update the known settings after a configuration change, call this service @@ -795,28 +894,35 @@ No perfdata. Critical and Warning thresholds are ignored. -A CRITICAL is raised if at least one parameter changed. +A Critical is raised if at least one parameter changed. + +Required privileges: unprivileged role. =item B (7.4+) -Check all sequences assigned to a column (the smallserial,serial and bigserial types), -and raise an alarm if the column or sequences gets too close to its maximum value. +Check all sequences assigned to a column (the smallserial, serial and bigserial types), +and raise an alarm if the column or sequences gets too close to the maximum value. -Perfdata returns the sequence(s) that may have trigger the alert. +Perfdata returns the sequences that trigger the alert. +This service supports both C<--dbexclude> and C<--dbinclude> parameters. The 'postgres' database and templates are always excluded. Critical and Warning thresholds accept a percentage of the sequence filled. +Required privileges: unprivileged role able to log in all databases + =item B (9.5+) Check the age of the statistics snapshot (statistics collector's statistics). -This probe help to detect a frozen stats collector process. +This probe helps to detect a frozen stats collector process. Perfdata returns the statistics snapshot age. Critical and Warning thresholds accept a raw number of seconds. +Required privileges: unprivileged role. 
+ =item B (9.1+) Check the data delta between a cluster and its standbys in streaming replication. @@ -825,14 +931,14 @@ Optional argument C<--slave> allows you to specify some slaves that MUST be connected. This argument can be used as many times as desired to check multiple slave connections, or you can specify multiple slaves connections at one time, using comma separated values. Both methods can be used in a single call. The -given values must be of the form "APPLICATION_NAME IP". -Either of the two following examples will check for the presence of two slaves: +provided values must be of the form "APPLICATION_NAME IP". +Both following examples will check for the presence of two slaves: --slave 'slave1 192.168.1.11' --slave 'slave2 192.168.1.12' --slave 'slave1 192.168.1.11','slave2 192.168.1.12' -This service supports a C<--exclude REGEX> parameter to exclude every result -matching the given regular expression on application_name or address ip fields. +This service supports a C<--exclude REGEX> parameter to exclude every result +matching a regular expression on application_name or IP address fields. You can use multiple C<--exclude REGEX> parameters. @@ -845,6 +951,8 @@ and replayed data. If two values are supplied, the first one applies to flushed data, the second one to replayed data. These thresholds only accept a size (eg. 2.5G). +Required privileges: unprivileged role. + =item B (9.5+) Check if tables are changed to unlogged. In 9.5, you can switch between logged and unlogged. @@ -858,20 +966,22 @@ This service supports both C<--dbexclude> and C<--dbinclude> parameters. The 'postgres' database and templates are always excluded. This service supports a C<--exclude REGEX> parameter to exclude relations -matching the given regular expression. The regular expression applies to -"database.schema_name.relation_name". 
This allows you to filter either on a -relation name for all schemas and databases, filter on a qualified named relation -(schema + relation) for all databases or filter on a qualified named relation in +matching a regular expression. The regular expression applies to +"database.schema_name.relation_name". This enables you to filter either on a +relation name for all schemas and databases, on a qualified named relation +(schema + relation) for all databases or on a qualified named relation in only one database. You can use multiple C<--exclude REGEX> parameters. Perfdata will return the number of unlogged tables per database. -A list of the unlogged tables detail will be returned after the +A list of the unlogged tables will be returned after the perfdata. This list contains the fully qualified table name. If C<--exclude REGEX> is set, the number of excluded tables is returned. +Required privileges: unprivileged role able to log in all databases, +or at least those in C<--dbinclude>. =item B @@ -889,26 +999,29 @@ The 'postgres' database and templates are always excluded. This service supports a C<--exclude REGEX> parameter to exclude relations matching the given regular expression. The regular expression applies to -"database.schema_name.relation_name". This allows you to filter either on a -relation name for all schemas and databases, filter on a qualified named relation -(schema + relation) for all databases or filter on a qualified named relation in +"database.schema_name.relation_name". This enables you to filter either on a +relation name for all schemas and databases, on a qualified named relation +(schema + relation) for all databases or on a qualified named relation in only one database. You can use multiple C<--exclude REGEX> parameters. B: With a non-superuser role, this service can only check the tables -the given role is granted to read! +that the given role is granted to read! 
Perfdata will return the number of tables matching the warning and critical thresholds, per database. -A list of the bloated tables detail will be returned after the +A list of the bloated tables will be returned after the perfdata. This list contains the fully qualified bloated table name, the estimated bloat size, the table size and the bloat percentage. -This service will work with PostgreSQL 10+ without superuser privileges -if you grant SELECT on table pg_statistic to the pg_monitor role, in -each database of the cluster : C +Required privileges: superuser (<10) able to log in all databases, or at least +those in C<--dbinclude>; +on PostgreSQL 10+, a user with the role pg_monitor suffices, +provided that you grant SELECT on the system table pg_statistic +to the pg_monitor role, in each database of the cluster: +C =item B (8.1+) @@ -928,19 +1041,48 @@ Critical and Warning thresholds are optional. They accept either a number of file (raw value), a size (unit is B to define a size) or both values separated by a comma. -Threshols applied on current temp files being created AND the number/size +Thresholds are applied on current temp files being created AND the number/size of temp files created since last execution. -This service works with PostgreSQL 10+ without superuser privileges but it will -not monitor live temp files. +Required privileges: + <10: superuser + v10: an unprivileged role is possible but it will not monitor databases +that it cannot access, nor live temp files + v11: an unprivileged role is possible but must be granted EXECUTE +on functions pg_ls_dir(text), pg_read_file(text), pg_stat_file(text); +the same restrictions as on v10 will still apply + +=item B (8.1+) + +Returns time since postmaster start ("uptime", from 8.1), +since configuration reload (from 8.4), +and since shared memory initialization (from 10).
+ +Please note that the uptime is unaffected when the postmaster resets +all its children (for example after a kill -9 on a process or a failure). + +From 10+, the 'time since shared memory init' aims at detecting this situation: +in fact we use the age of the oldest non-client child process (usually checkpointer, +writer or startup). This needs pg_monitor access to read pg_stat_activity. + +Critical and Warning thresholds are optional. If both are set, Critical is +raised when the postmaster uptime or the time since shared memory initialization +is less than the critical threshold. +Warning is raised when the time since configuration reload is less than the warning threshold. +If only a warning or critical threshold is given, it will be used for both cases. +Obviously these alerts will disappear by themselves once enough time has passed. + +Perfdata contains the three values (when available). + +Required privileges: pg_monitor on PG10+; otherwise unprivileged role. =item B (8.1+) Check the number of WAL files. Perfdata returns the total number of WAL files, current number of written WAL, -the current number of recycled WAL, the rate of WAL written to disk since -last execution on master clusters and the current timeline. +the current number of recycled WAL, the rate of WAL written to disk since the +last execution on the master cluster and the current timeline. Critical and Warning thresholds accept either a raw number of files or a percentage.
In case of percentage, the limit is computed based on: @@ -962,6 +1104,12 @@ For 9.5 and above, the limit is: 100% = max_wal_size (as a number of WAL) + wal_keep_segments (if set) +Required privileges: + <10: superuser + v10: unprivileged user with pg_monitor + v11: unprivileged user with pg_monitor, or with GRANT EXECUTE on function +pg_ls_waldir + =back =head2 EXAMPLES @@ -990,16 +1138,16 @@ For 9.5 and above, the limit is: =back -=head2 VERSION +=head1 VERSION -check_pgactivity version 2.3, released on Mon Nov 13 2017. +check_pgactivity version 2.4, released on Wed Jan 30 2019 -=head2 LICENSING +=head1 LICENSING This program is open source, licensed under the PostgreSQL license. For license terms, see the LICENSE provided with the sources. -=head2 AUTHORS +=head1 AUTHORS Author: Open PostgreSQL Monitoring Development Group Copyright: (C) 2012-2018 Open PostgreSQL Monitoring Development Group diff --git a/RELEASING.md b/RELEASING.md index 285a5a6c..4f0b01e5 100644 --- a/RELEASING.md +++ b/RELEASING.md @@ -24,7 +24,7 @@ podselect check_pgactivity > README.pod ## Tagging and building tar file ``` -TAG=REL2_3 +TAG=REL2_4 git -a $TAG git push --tags git archive --prefix=check_pgactivity-$TAG/ -o /tmp/check_pgactivity-$TAG.tgz $TAG diff --git a/check_pgactivity b/check_pgactivity index cc3a699b..a7aa56de 100755 --- a/check_pgactivity +++ b/check_pgactivity @@ -57,7 +57,7 @@ delete $ENV{'LANGUAGE'}; $| = 1; -$VERSION = '2.4dev'; +$VERSION = '2.4'; $PROGRAM = 'check_pgactivity'; my $PG_VERSION_MIN = 70400; @@ -7939,7 +7939,7 @@ __END__ =head1 VERSION -check_pgactivity version 2.3, released on Mon Nov 13 2017.
+check_pgactivity version 2.4, released on Wed Jan 30 2019 =head1 LICENSING diff --git a/check_pgactivity.spec b/check_pgactivity.spec index 65aac511..ceaa9909 100644 --- a/check_pgactivity.spec +++ b/check_pgactivity.spec @@ -1,7 +1,7 @@ -%global _tag REL2_3 +%global _tag REL2_4 Name: nagios-plugins-pgactivity -Version: 2.3 +Version: 2.4 Release: 1 Summary: PostgreSQL monitoring plugin for Nagios License: PostgreSQL @@ -32,6 +32,9 @@ install -D -p -m 0755 check_pgactivity %{buildroot}/%{_libdir}/nagios/plugins/ch %doc README LICENSE %changelog +* Wed Jan 30 2019 Christophe Courtois 2.4-1 +- new major release 2.4 + * Mon Nov 13 2017 Thomas Reiss 2.3-1 - new major release 2.3