From 4fa2c3c2850e1152da0c1916c445efc8c3de1f29 Mon Sep 17 00:00:00 2001
From: tony-landreth
Date: Wed, 12 Feb 2025 09:56:43 -0500
Subject: [PATCH 01/22] Adds OTel metrics for Postgres

Issue: PGO-2036
---
 .../collector/generated/gte_pg16_metrics.json |   1 +
 .../collector/generated/gte_pg17_metrics.json |   1 +
 .../collector/generated/lt_pg16_metrics.json  |   1 +
 .../collector/generated/lt_pg17_metrics.json  |   1 +
 .../generated/pgbackrest_metrics.json         |   1 +
 .../generated/pgbouncer_metrics_queries.json  |   2 +-
 .../generated/postgres_5m_metrics.json        |   1 +
 .../generated/postgres_5s_metrics.json        |   1 +
 internal/collector/gte_pg16_metrics.yaml      | 126 +++
 internal/collector/gte_pg17_metrics.yaml      |  71 ++
 internal/collector/lt_pg16_metrics.yaml       | 134 +++
 internal/collector/lt_pg17_metrics.yaml       |  71 ++
 internal/collector/naming.go                  |  13 +-
 internal/collector/patroni.go                 |   2 +-
 internal/collector/pgbackrest_metrics.yaml    | 169 ++++
 internal/collector/pgbouncer.go               |   2 +-
 .../collector/pgbouncer_metrics_queries.yaml  |  16 +-
 internal/collector/postgres.go                |   3 +-
 internal/collector/postgres_5m_metrics.yaml   | 143 +++
 internal/collector/postgres_5s_metrics.yaml   | 842 ++++++++++++++++++
 internal/collector/postgres_metrics.go        | 110 +++
 .../controller/postgrescluster/controller.go  |   3 +-
 .../controller/postgrescluster/instance.go    |  24 +-
 .../postgrescluster/metrics_setup.sql         |  72 ++
 .../controller/postgrescluster/pgmonitor.go   |  30 +-
 25 files changed, 1812 insertions(+), 28 deletions(-)
 create mode 100644 internal/collector/generated/gte_pg16_metrics.json
 create mode 100644 internal/collector/generated/gte_pg17_metrics.json
 create mode 100644 internal/collector/generated/lt_pg16_metrics.json
 create mode 100644 internal/collector/generated/lt_pg17_metrics.json
 create mode 100644 internal/collector/generated/pgbackrest_metrics.json
 create mode 100644 internal/collector/generated/postgres_5m_metrics.json
 create mode 100644 internal/collector/generated/postgres_5s_metrics.json
 create mode 100644 internal/collector/gte_pg16_metrics.yaml
 create mode 100644 internal/collector/gte_pg17_metrics.yaml
 create mode 100644 internal/collector/lt_pg16_metrics.yaml
 create mode 100644 internal/collector/lt_pg17_metrics.yaml
 create mode 100644 internal/collector/pgbackrest_metrics.yaml
 create mode 100644 internal/collector/postgres_5m_metrics.yaml
 create mode 100644 internal/collector/postgres_5s_metrics.yaml
 create mode 100644 internal/collector/postgres_metrics.go
 create mode 100644 internal/controller/postgrescluster/metrics_setup.sql

diff --git a/internal/collector/generated/gte_pg16_metrics.json b/internal/collector/generated/gte_pg16_metrics.json
new file mode 100644
index 0000000000..3b27be7bc0
--- /dev/null
+++ b/internal/collector/generated/gte_pg16_metrics.json
@@ -0,0 +1 @@
+[{"metrics":[{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of times this table has been manually analyzed","metric_name":"ccp_stat_user_tables_analyze_count","static_attributes":{"server":"localhost:5432"},"value_column":"analyze_count"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of times this table has been analyzed by the autovacuum daemon","metric_name":"ccp_stat_user_tables_autoanalyze_count","static_attributes":{"server":"localhost:5432"},"value_column":"autoanalyze_count"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of times this table has been vacuumed by the autovacuum 
daemon","metric_name":"ccp_stat_user_tables_autovacuum_count","static_attributes":{"server":"localhost:5432"},"value_column":"autovacuum_count"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of index scans initiated on this table","metric_name":"ccp_stat_user_tables_idx_scan","static_attributes":{"server":"localhost:5432"},"value_column":"idx_scan"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of live rows fetched by index scans","metric_name":"ccp_stat_user_tables_idx_tup_fetch","static_attributes":{"server":"localhost:5432"},"value_column":"idx_tup_fetch"},{"attribute_columns":["dbname","relname","schemaname"],"description":"Estimated number of dead rows","metric_name":"ccp_stat_user_tables_n_dead_tup","static_attributes":{"server":"localhost:5432"},"value_column":"n_dead_tup"},{"attribute_columns":["dbname","relname","schemaname"],"description":"Estimated number of live rows","metric_name":"ccp_stat_user_tables_n_live_tup","static_attributes":{"server":"localhost:5432"},"value_column":"n_live_tup"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of rows deleted","metric_name":"ccp_stat_user_tables_n_tup_del","static_attributes":{"server":"localhost:5432"},"value_column":"n_tup_del"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of rows HOT updated (i.e., with no separate index update required)","metric_name":"ccp_stat_user_tables_n_tup_hot_upd","static_attributes":{"server":"localhost:5432"},"value_column":"n_tup_hot_upd"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of rows inserted","metric_name":"ccp_stat_user_tables_n_tup_ins","static_attributes":{"server":"localhost:5432"},"value_column":"n_tup_ins"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of rows updated","metric_name":"ccp_stat_user_tables_n_tup_upd","static_attributes":{"server":"localhost:5432"},"value_column":"n_tup_upd"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of sequential scans initiated on this table","metric_name":"ccp_stat_user_tables_seq_scan","static_attributes":{"server":"localhost:5432"},"value_column":"seq_scan"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of live rows fetched by sequential scans","metric_name":"ccp_stat_user_tables_seq_tup_read","static_attributes":{"server":"localhost:5432"},"value_column":"seq_tup_read"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of times this table has been manually vacuumed (not counting VACUUM FULL)","metric_name":"ccp_stat_user_tables_vacuum_count","static_attributes":{"server":"localhost:5432"},"value_column":"vacuum_count"}],"sql":"SELECT\n current_database() as dbname\n , p.schemaname\n , p.relname\n , p.seq_scan\n , p.seq_tup_read\n , COALESCE(p.idx_scan, 0) AS idx_scan\n , COALESCE(p.idx_tup_fetch, 0) as idx_tup_fetch\n , p.n_tup_ins\n , p.n_tup_upd\n , p.n_tup_del\n , p.n_tup_hot_upd\n , p.n_tup_newpage_upd\n , p.n_live_tup\n , p.n_dead_tup\n , p.vacuum_count\n , p.autovacuum_count\n , p.analyze_count\n , p.autoanalyze_count\n FROM pg_catalog.pg_stat_user_tables p;\n"}] diff --git a/internal/collector/generated/gte_pg17_metrics.json b/internal/collector/generated/gte_pg17_metrics.json new file mode 100644 index 
0000000000..de39cf6cca --- /dev/null +++ b/internal/collector/generated/gte_pg17_metrics.json @@ -0,0 +1 @@ +[{"metrics":[{"data_type":"sum","description":"Number of buffers written during checkpoints and restartpoints","metric_name":"ccp_stat_bgwriter_buffers_checkpoint","static_attributes":{"server":"localhost:5432"}}],"sql":"SELECT c.buffers_written FROM pg_catalog.pg_stat_checkpointer c;\n"},{"metrics":[{"data_type":"sum","description":"Number of write operations, each of the size specified in op_bytes.","metric_name":"ccp_stat_bgwriter_buffers_backend","static_attributes":{"server":"localhost:5432"},"value_column":"writes"},{"data_type":"sum","description":"Number of fsync calls. These are only tracked in context normal.","metric_name":"ccp_stat_bgwriter_buffers_backend_fsync","static_attributes":{"server":"localhost:5432"},"value_column":"fsyncs"}],"sql":"SELECT\n s.writes\n , s.fsyncs\nFROM pg_catalog.pg_stat_io s WHERE backend_type = 'background writer';\n"},{"metrics":[{"description":"Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in milliseconds","metric_name":"ccp_stat_bgwriter_checkpoint_sync_time","static_attributes":{"server":"localhost:5432"},"value_column":"sync_time"},{"description":"Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in milliseconds","metric_name":"ccp_stat_bgwriter_checkpoint_write_time","static_attributes":{"server":"localhost:5432"},"value_column":"write_time","value_type":"double"},{"description":"Number of requested checkpoints that have been performed","metric_name":"ccp_stat_bgwriter_checkpoints_req","static_attributes":{"server":"localhost:5432"},"value_column":"num_requested"},{"description":"Number of scheduled checkpoints that have been performed","metric_name":"ccp_stat_bgwriter_checkpoints_timed","static_attributes":{"server":"localhost:5432"},"value_column":"num_timed"},{"description":"Number of buffers written during checkpoints and restartpoints","metric_name":"ccp_stat_checkpointer_buffers_written","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_written"}],"sql":"SELECT\n c.num_timed\n , c.num_requested\n , c.write_time\n , c.sync_time\n , c.buffers_written\nFROM pg_catalog.pg_stat_checkpointer c;\n"}] diff --git a/internal/collector/generated/lt_pg16_metrics.json b/internal/collector/generated/lt_pg16_metrics.json new file mode 100644 index 0000000000..98bb0cc213 --- /dev/null +++ b/internal/collector/generated/lt_pg16_metrics.json @@ -0,0 +1 @@ +[{"metrics":[{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of times this table has been manually analyzed","metric_name":"ccp_stat_user_tables_analyze_count","static_attributes":{"server":"localhost:5432"},"value_column":"analyze_count"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of times this table has been analyzed by the autovacuum daemon","metric_name":"ccp_stat_user_tables_autoanalyze_count","static_attributes":{"server":"localhost:5432"},"value_column":"autoanalyze_count"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of times this table has been vacuumed by the autovacuum 
daemon","metric_name":"ccp_stat_user_tables_autovacuum_count","static_attributes":{"server":"localhost:5432"},"value_column":"autovacuum_count"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of index scans initiated on this table","metric_name":"ccp_stat_user_tables_idx_scan","static_attributes":{"server":"localhost:5432"},"value_column":"idx_scan"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of live rows fetched by index scans","metric_name":"ccp_stat_user_tables_idx_tup_fetch","static_attributes":{"server":"localhost:5432"},"value_column":"idx_tup_fetch"},{"attribute_columns":["dbname","relname","schemaname"],"description":"Estimated number of dead rows","metric_name":"ccp_stat_user_tables_n_dead_tup","static_attributes":{"server":"localhost:5432"},"value_column":"n_dead_tup"},{"attribute_columns":["dbname","relname","schemaname"],"description":"Estimated number of live rows","metric_name":"ccp_stat_user_tables_n_live_tup","static_attributes":{"server":"localhost:5432"},"value_column":"n_live_tup"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of rows deleted","metric_name":"ccp_stat_user_tables_n_tup_del","static_attributes":{"server":"localhost:5432"},"value_column":"n_tup_del"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of rows HOT updated (i.e., with no separate index update required)","metric_name":"ccp_stat_user_tables_n_tup_hot_upd","static_attributes":{"server":"localhost:5432"},"value_column":"n_tup_hot_upd"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of rows inserted","metric_name":"ccp_stat_user_tables_n_tup_ins","static_attributes":{"server":"localhost:5432"},"value_column":"n_tup_ins"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of rows updated","metric_name":"ccp_stat_user_tables_n_tup_upd","static_attributes":{"server":"localhost:5432"},"value_column":"n_tup_upd"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of sequential scans initiated on this table","metric_name":"ccp_stat_user_tables_seq_scan","static_attributes":{"server":"localhost:5432"},"value_column":"seq_scan"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of live rows fetched by sequential scans","metric_name":"ccp_stat_user_tables_seq_tup_read","static_attributes":{"server":"localhost:5432"},"value_column":"seq_tup_read"},{"attribute_columns":["dbname","relname","schemaname"],"data_type":"sum","description":"Number of times this table has been manually vacuumed (not counting VACUUM FULL)","metric_name":"ccp_stat_user_tables_vacuum_count","static_attributes":{"server":"localhost:5432"},"value_column":"vacuum_count"}],"sql":"SELECT\n current_database() as dbname\n , p.schemaname\n , p.relname\n , p.seq_scan\n , p.seq_tup_read\n , COALESCE(p.idx_scan, 0) AS idx_scan\n , COALESCE(p.idx_tup_fetch, 0) as idx_tup_fetch\n , p.n_tup_ins\n , p.n_tup_upd\n , p.n_tup_del\n , p.n_tup_hot_upd\n , 0::bigint AS n_tup_newpage_upd\n , p.n_live_tup\n , p.n_dead_tup\n , p.vacuum_count\n , p.autovacuum_count\n , p.analyze_count\n , p.autoanalyze_count\nFROM pg_catalog.pg_stat_user_tables p;\n"}] diff --git a/internal/collector/generated/lt_pg17_metrics.json b/internal/collector/generated/lt_pg17_metrics.json new file mode 100644 index 
0000000000..d6266ffacb --- /dev/null +++ b/internal/collector/generated/lt_pg17_metrics.json @@ -0,0 +1 @@ +[{"metrics":[{"data_type":"sum","description":"Number of buffers written during checkpoints and restartpoints","metric_name":"ccp_stat_bgwriter_buffers_checkpoint","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_written"}],"sql":"SELECT c.buffers_checkpoint AS buffers_written FROM pg_catalog.pg_stat_bgwriter c;\n"},{"metrics":[{"data_type":"sum","description":"Number of write operations, each of the size specified in op_bytes.","metric_name":"ccp_stat_bgwriter_buffers_backend","static_attributes":{"server":"localhost:5432"},"value_column":"writes"},{"data_type":"sum","description":"Number of fsync calls. These are only tracked in context normal.","metric_name":"ccp_stat_bgwriter_buffers_backend_fsync","static_attributes":{"server":"localhost:5432"},"value_column":"fsyncs"}],"sql":"SELECT\n s.buffers_backend AS writes\n , s.buffers_backend_fsync AS fsyncs\nFROM pg_catalog.pg_stat_bgwriter s;\n"},{"metrics":[{"description":"Number of scheduled checkpoints that have been performed","metric_name":"ccp_stat_bgwriter_checkpoints_timed","static_attributes":{"server":"localhost:5432"},"value_column":"num_timed"},{"description":"Number of requested checkpoints that have been performed","metric_name":"ccp_stat_bgwriter_checkpoints_req","static_attributes":{"server":"localhost:5432"},"value_column":"num_requested"},{"description":"Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in milliseconds","metric_name":"ccp_stat_bgwriter_checkpoint_write_time","static_attributes":{"server":"localhost:5432"},"value_column":"write_time","value_type":"double"},{"description":"Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in milliseconds","metric_name":"ccp_stat_bgwriter_checkpoint_sync_time","static_attributes":{"server":"localhost:5432"},"value_column":"sync_time"},{"description":"Number of buffers written during checkpoints and restartpoints","metric_name":"ccp_stat_checkpointer_buffers_written","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_written"}],"sql":"SELECT\n c.checkpoints_timed AS num_timed\n , c.checkpoints_req AS num_requested\n , c.checkpoint_write_time AS write_time\n , c.checkpoint_sync_time AS sync_time\n , c.buffers_checkpoint AS buffers_written\nFROM pg_catalog.pg_stat_bgwriter c;\n"}] diff --git a/internal/collector/generated/pgbackrest_metrics.json b/internal/collector/generated/pgbackrest_metrics.json new file mode 100644 index 0000000000..713f0a8ac1 --- /dev/null +++ b/internal/collector/generated/pgbackrest_metrics.json @@ -0,0 +1 @@ +[{"metrics":[{"attribute_columns":["repo"],"description":"Seconds since the last completed full or differential backup. 
Differential is always based off last full.","metric_name":"ccp_backrest_last_diff_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_diff_backup"},{"attribute_columns":["repo"],"description":"Seconds since the last completed full backup","metric_name":"ccp_backrest_last_full_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_full_backup"},{"attribute_columns":["repo"],"description":"Seconds since the last completed full, differential or incremental backup.\nIncremental is always based off last full or differential.\n","metric_name":"ccp_backrest_last_incr_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_incr_backup"},{"attribute_columns":["backup_type","repo"],"description":"pgBackRest version number when this backup was performed","metric_name":"ccp_backrest_last_info_backrest_repo_version","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_info_backrest_repo_version"},{"attribute_columns":["backup_type","repo"],"description":"An error has been encountered in the backup. Check logs for more information.","metric_name":"ccp_backrest_last_info_backup_error","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_info_backup_error"},{"attribute_columns":["backup_type","repo"],"description":"Total runtime in seconds of this backup","metric_name":"ccp_backrest_last_info_backup_runtime_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"backup_runtime_seconds"},{"attribute_columns":["backup_type","repo"],"description":"Actual size of only this individual backup in the pgbackrest repository","metric_name":"ccp_backrest_last_info_repo_backup_size_bytes","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"repo_backup_size_bytes"},{"attribute_columns":["backup_type","repo"],"description":"Total size of this backup in the pgbackrest repository, including all required previous backups and WAL","metric_name":"ccp_backrest_last_info_repo_total_size_bytes","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"repo_total_size_bytes"},{"attribute_columns":["repo"],"description":"Seconds since the oldest completed full backup","metric_name":"ccp_backrest_oldest_full_backup_time_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"oldest_full_backup"}],"sql":"DROP TABLE IF EXISTS pgbackrest_info;\nCREATE TEMPORARY TABLE pgbackrest_info (data json);\n\nCOPY pgbackrest_info (data)\nFROM PROGRAM 'export LC_ALL=C \u0026\u0026 printf \"\\f\" \u0026\u0026 pgbackrest info --log-level-console=info --log-level-stderr=warn --output=json --stanza=db \u0026\u0026 printf \"\\f\"'\nWITH (FORMAT csv, HEADER false, QUOTE E'\\f');\n\nWITH\nall_backups (data) AS (\n SELECT jsonb_array_elements(to_jsonb(data)) FROM pgbackrest_info\n),\nstanza_backups (stanza, backup) AS (\n SELECT data-\u003e\u003e'name', jsonb_array_elements(data-\u003e'backup') FROM all_backups\n),\nordered_backups (stanza, backup, seq_oldest, seq_newest) AS (\n SELECT stanza, backup,\n ROW_NUMBER() OVER (\n PARTITION BY stanza, backup-\u003e'database'-\u003e\u003e'repo-key', backup-\u003e\u003e'type'\n ORDER BY backup-\u003e'timestamp'-\u003e\u003e'start' ASC, backup-\u003e'timestamp'-\u003e\u003e'stop' ASC\n ),\n ROW_NUMBER() OVER (\n PARTITION BY stanza, 
backup-\u003e'database'-\u003e\u003e'repo-key', backup-\u003e\u003e'type'\n ORDER BY backup-\u003e'timestamp'-\u003e\u003e'start' DESC, backup-\u003e'timestamp'-\u003e\u003e'stop' DESC\n )\n FROM stanza_backups\n),\n\nccp_backrest_last_info AS (\n SELECT\n stanza,\n split_part(backup-\u003e'backrest'-\u003e\u003e'version', '.', 1) || lpad(split_part(backup-\u003e'backrest'-\u003e\u003e'version', '.', 2), 2, '0') || lpad(coalesce(nullif(split_part(backup-\u003e'backrest'-\u003e\u003e'version', '.', 3), ''), '00'), 2, '0') AS backrest_repo_version,\n backup-\u003e'database'-\u003e\u003e'repo-key' AS repo,\n backup-\u003e\u003e'type' AS backup_type,\n backup-\u003e'info'-\u003e'repository'-\u003e\u003e'delta' AS repo_backup_size_bytes,\n backup-\u003e'info'-\u003e'repository'-\u003e\u003e'size' AS repo_total_size_bytes,\n (backup-\u003e'timestamp'-\u003e\u003e'stop')::bigint - (backup-\u003e'timestamp'-\u003e\u003e'start')::bigint AS backup_runtime_seconds,\n CASE WHEN backup-\u003e\u003e'error' = 'true' THEN 1 ELSE 0 END AS backup_error\n FROM ordered_backups\n WHERE seq_newest = 1\n),\n\nccp_backrest_oldest_full_backup AS (\n SELECT\n stanza,\n backup-\u003e'database'-\u003e\u003e'repo-key' AS repo,\n min((backup-\u003e'timestamp'-\u003e\u003e'stop')::bigint) AS time_seconds\n FROM ordered_backups\n WHERE seq_oldest = 1 AND backup-\u003e\u003e'type' IN ('full')\n GROUP BY 1,2\n),\n\nccp_backrest_last_full_backup AS (\n SELECT\n stanza,\n backup-\u003e'database'-\u003e\u003e'repo-key' AS repo,\n EXTRACT(EPOCH FROM CURRENT_TIMESTAMP)::bigint - max((backup-\u003e'timestamp'-\u003e\u003e'stop')::bigint) AS time_since_completion_seconds\n FROM ordered_backups\n WHERE seq_newest = 1 AND backup-\u003e\u003e'type' IN ('full')\n GROUP BY 1,2\n),\n\nccp_backrest_last_diff_backup AS (\n SELECT\n stanza,\n backup-\u003e'database'-\u003e\u003e'repo-key' AS repo,\n EXTRACT(EPOCH FROM CURRENT_TIMESTAMP)::bigint - max((backup-\u003e'timestamp'-\u003e\u003e'stop')::bigint) AS time_since_completion_seconds\n FROM ordered_backups\n WHERE seq_newest = 1 AND backup-\u003e\u003e'type' IN ('full','diff')\n GROUP BY 1,2\n),\n\nccp_backrest_last_incr_backup AS (\n SELECT\n stanza,\n backup-\u003e'database'-\u003e\u003e'repo-key' AS repo,\n EXTRACT(EPOCH FROM CURRENT_TIMESTAMP)::bigint - max((backup-\u003e'timestamp'-\u003e\u003e'stop')::bigint) AS time_since_completion_seconds\n FROM ordered_backups\n WHERE seq_newest = 1 AND backup-\u003e\u003e'type' IN ('full','diff','incr')\n GROUP BY 1,2\n)\n\nSELECT\n ccp_backrest_last_diff_backup.time_since_completion_seconds as last_diff_backup,\n ccp_backrest_last_full_backup.time_since_completion_seconds as last_full_backup,\n ccp_backrest_last_incr_backup.time_since_completion_seconds as last_incr_backup,\n ccp_backrest_last_info.backrest_repo_version as last_info_backrest_repo_version,\n ccp_backrest_last_info.backup_error as last_info_backup_error,\n ccp_backrest_last_info.backup_type as backup_type,\n ccp_backrest_last_info.backup_runtime_seconds as backup_runtime_seconds,\n ccp_backrest_last_info.repo_backup_size_bytes as repo_backup_size_bytes,\n ccp_backrest_last_info.repo_total_size_bytes as repo_total_size_bytes,\n ccp_backrest_oldest_full_backup.time_seconds as oldest_full_backup,\n ccp_backrest_last_incr_backup.repo as repo\n \nFROM\n ccp_backrest_last_diff_backup\n , ccp_backrest_last_full_backup\n , ccp_backrest_last_incr_backup\n , ccp_backrest_last_info\n , ccp_backrest_oldest_full_backup;\n"}] diff --git 
a/internal/collector/generated/pgbouncer_metrics_queries.json b/internal/collector/generated/pgbouncer_metrics_queries.json index 5b0ed8abc5..0248051d94 100644 --- a/internal/collector/generated/pgbouncer_metrics_queries.json +++ b/internal/collector/generated/pgbouncer_metrics_queries.json @@ -1 +1 @@ -[{"metrics":[{"attribute_columns":["database","user","state","application_name","link"],"description":"Current waiting time in seconds","metric_name":"ccp_pgbouncer_clients_wait_seconds","value_column":"wait"}],"sql":"SHOW CLIENTS"},{"metrics":[{"attribute_columns":["name","host","port","database","force_user","pool_mode"],"description":"Maximum number of server connections","metric_name":"ccp_pgbouncer_databases_pool_size","value_column":"pool_size"},{"attribute_columns":["name","host","port","database","force_user","pool_mode"],"description":"Minimum number of server connections","metric_name":"ccp_pgbouncer_databases_min_pool_size","value_column":"min_pool_size"},{"attribute_columns":["name","host","port","database","force_user","pool_mode"],"description":"Maximum number of additional connections for this database","metric_name":"ccp_pgbouncer_databases_reserve_pool","value_column":"reserve_pool"},{"attribute_columns":["name","host","port","database","force_user","pool_mode"],"description":"Maximum number of allowed connections for this database, as set by max_db_connections, either globally or per database","metric_name":"ccp_pgbouncer_databases_max_connections","value_column":"max_connections"},{"attribute_columns":["name","host","port","database","force_user","pool_mode"],"description":"Current number of connections for this database","metric_name":"ccp_pgbouncer_databases_current_connections","value_column":"current_connections"},{"attribute_columns":["name","host","port","database","force_user","pool_mode"],"description":"1 if this database is currently paused, else 0","metric_name":"ccp_pgbouncer_databases_paused","value_column":"paused"},{"attribute_columns":["name","host","port","database","force_user","pool_mode"],"description":"1 if this database is currently disabled, else 0","metric_name":"ccp_pgbouncer_databases_disabled","value_column":"disabled"}],"sql":"SHOW DATABASES"},{"metrics":[{"attribute_columns":["list"],"description":"Count of items registered with pgBouncer","metric_name":"ccp_pgbouncer_lists_item_count","value_column":"items"}],"sql":"SHOW LISTS"},{"metrics":[{"attribute_columns":["database","user"],"description":"Client connections that are either linked to server connections or are idle with no queries waiting to be processed","metric_name":"ccp_pgbouncer_pools_client_active","value_column":"cl_active"},{"attribute_columns":["database","user"],"description":"Client connections that have sent queries but have not yet got a server connection","metric_name":"ccp_pgbouncer_pools_client_waiting","value_column":"cl_waiting"},{"attribute_columns":["database","user"],"description":"Server connections that are linked to a client","metric_name":"ccp_pgbouncer_pools_server_active","value_column":"sv_active"},{"attribute_columns":["database","user"],"description":"Server connections that are unused and immediately usable for client queries","metric_name":"ccp_pgbouncer_pools_server_idle","value_column":"sv_idle"},{"attribute_columns":["database","user"],"description":"Server connections that have been idle for more than server_check_delay, so they need server_check_query to run on them before they can be used 
again","metric_name":"ccp_pgbouncer_pools_server_used","value_column":"sv_used"}],"sql":"SHOW POOLS"},{"metrics":[{"attribute_columns":["database","user","state","application_name","link"],"description":"1 if the connection will be closed as soon as possible, because a configuration file reload or DNS update changed the connection information or RECONNECT was issued","metric_name":"ccp_pgbouncer_servers_close_needed","value_column":"close_needed"}],"sql":"SHOW SERVERS"}] +[{"metrics":[{"attribute_columns":["database","user","state","application_name","link"],"description":"Current waiting time in seconds","metric_name":"ccp_pgbouncer_clients_wait_seconds","value_column":"wait"}],"sql":"SHOW CLIENTS"},{"metrics":[{"attribute_columns":["name","port","database","force_user","pool_mode"],"description":"Maximum number of server connections","metric_name":"ccp_pgbouncer_databases_pool_size","value_column":"pool_size"},{"attribute_columns":["name","port","database","force_user","pool_mode"],"description":"Minimum number of server connections","metric_name":"ccp_pgbouncer_databases_min_pool_size","value_column":"min_pool_size"},{"attribute_columns":["name","port","database","force_user","pool_mode"],"description":"Maximum number of additional connections for this database","metric_name":"ccp_pgbouncer_databases_reserve_pool","value_column":"reserve_pool"},{"attribute_columns":["name","port","database","force_user","pool_mode"],"description":"Maximum number of allowed connections for this database, as set by max_db_connections, either globally or per database","metric_name":"ccp_pgbouncer_databases_max_connections","value_column":"max_connections"},{"attribute_columns":["name","port","database","force_user","pool_mode"],"description":"Current number of connections for this database","metric_name":"ccp_pgbouncer_databases_current_connections","value_column":"current_connections"},{"attribute_columns":["name","port","database","force_user","pool_mode"],"description":"1 if this database is currently paused, else 0","metric_name":"ccp_pgbouncer_databases_paused","value_column":"paused"},{"attribute_columns":["name","port","database","force_user","pool_mode"],"description":"1 if this database is currently disabled, else 0","metric_name":"ccp_pgbouncer_databases_disabled","value_column":"disabled"}],"sql":"SHOW DATABASES"},{"metrics":[{"attribute_columns":["list"],"description":"Count of items registered with pgBouncer","metric_name":"ccp_pgbouncer_lists_item_count","value_column":"items"}],"sql":"SHOW LISTS"},{"metrics":[{"attribute_columns":["database","user"],"description":"Client connections that are either linked to server connections or are idle with no queries waiting to be processed","metric_name":"ccp_pgbouncer_pools_client_active","value_column":"cl_active"},{"attribute_columns":["database","user"],"description":"Client connections that have sent queries but have not yet got a server connection","metric_name":"ccp_pgbouncer_pools_client_waiting","value_column":"cl_waiting"},{"attribute_columns":["database","user"],"description":"Server connections that are linked to a client","metric_name":"ccp_pgbouncer_pools_server_active","value_column":"sv_active"},{"attribute_columns":["database","user"],"description":"Server connections that are unused and immediately usable for client queries","metric_name":"ccp_pgbouncer_pools_server_idle","value_column":"sv_idle"},{"attribute_columns":["database","user"],"description":"Server connections that have been idle for more than server_check_delay, so they 
need server_check_query to run on them before they can be used again","metric_name":"ccp_pgbouncer_pools_server_used","value_column":"sv_used"}],"sql":"SHOW POOLS"},{"metrics":[{"attribute_columns":["database","user","state","application_name","link"],"description":"1 if the connection will be closed as soon as possible, because a configuration file reload or DNS update changed the connection information or RECONNECT was issued","metric_name":"ccp_pgbouncer_servers_close_needed","value_column":"close_needed"}],"sql":"SHOW SERVERS"}] diff --git a/internal/collector/generated/postgres_5m_metrics.json b/internal/collector/generated/postgres_5m_metrics.json new file mode 100644 index 0000000000..a9a3500a02 --- /dev/null +++ b/internal/collector/generated/postgres_5m_metrics.json @@ -0,0 +1 @@ +[{"metrics":[{"attribute_columns":["dbname"],"description":"Database size in bytes","metric_name":"ccp_database_size_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"bytes"}],"sql":"SELECT datname as dbname , pg_database_size(datname) as bytes FROM pg_catalog.pg_database WHERE datistemplate = false;\n"},{"metrics":[{"description":"Count of sequences that have reached greater than or equal to 75% of their max available numbers.\nFunction monitor.sequence_status() can provide more details if run directly on system.\n","metric_name":"ccp_sequence_exhaustion_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT count(*) AS count FROM (\n SELECT CEIL((s.max_value-min_value::NUMERIC+1)/s.increment_by::NUMERIC) AS slots\n , CEIL((COALESCE(s.last_value,s.min_value)-s.min_value::NUMERIC+1)/s.increment_by::NUMERIC) AS used\n FROM pg_catalog.pg_sequences s\n) x WHERE (ROUND(used/slots*100)::int) \u003e 75;\n"},{"metrics":[{"attribute_columns":["dbname"],"description":"Number of times disk blocks were found already in the buffer cache, so that a read was not necessary","metric_name":"ccp_stat_database_blks_hit","static_attributes":{"server":"localhost:5432"},"value_column":"blks_hit"},{"attribute_columns":["dbname"],"description":"Number of disk blocks read in this database","metric_name":"ccp_stat_database_blks_read","static_attributes":{"server":"localhost:5432"},"value_column":"blks_read"},{"attribute_columns":["dbname"],"description":"Number of queries canceled due to conflicts with recovery in this database","metric_name":"ccp_stat_database_conflicts","static_attributes":{"server":"localhost:5432"},"value_column":"conflicts"},{"attribute_columns":["dbname"],"description":"Number of deadlocks detected in this database","metric_name":"ccp_stat_database_deadlocks","static_attributes":{"server":"localhost:5432"},"value_column":"deadlocks"},{"attribute_columns":["dbname"],"description":"Total amount of data written to temporary files by queries in this database","metric_name":"ccp_stat_database_temp_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"temp_bytes"},{"attribute_columns":["dbname"],"description":"Number of rows deleted by queries in this database","metric_name":"ccp_stat_database_temp_files","static_attributes":{"server":"localhost:5432"},"value_column":"temp_files"},{"attribute_columns":["dbname"],"description":"Number of rows deleted by queries in this database","metric_name":"ccp_stat_database_tup_deleted","static_attributes":{"server":"localhost:5432"},"value_column":"tup_deleted"},{"attribute_columns":["dbname"],"description":"Number of rows fetched by queries in this 
database","metric_name":"ccp_stat_database_tup_fetched","static_attributes":{"server":"localhost:5432"},"value_column":"tup_fetched"},{"attribute_columns":["dbname"],"description":"Number of rows inserted by queries in this database","metric_name":"ccp_stat_database_tup_inserted","static_attributes":{"server":"localhost:5432"},"value_column":"tup_inserted"},{"attribute_columns":["dbname"],"description":"Number of rows returned by queries in this database","metric_name":"ccp_stat_database_tup_returned","static_attributes":{"server":"localhost:5432"},"value_column":"tup_returned"},{"attribute_columns":["dbname"],"description":"Number of rows updated by queries in this database","metric_name":"ccp_stat_database_tup_updated","static_attributes":{"server":"localhost:5432"},"value_column":"tup_updated"},{"attribute_columns":["dbname"],"description":"Number of transactions in this database that have been committed","metric_name":"ccp_stat_database_xact_commit","static_attributes":{"server":"localhost:5432"},"value_column":"xact_commit"},{"attribute_columns":["dbname"],"description":"Number of transactions in this database that have been rolled back","metric_name":"ccp_stat_database_xact_rollback","static_attributes":{"server":"localhost:5432"},"value_column":"xact_rollback"}],"sql":"SELECT s.datname AS dbname , s.xact_commit , s.xact_rollback , s.blks_read , s.blks_hit , s.tup_returned , s.tup_fetched , s.tup_inserted , s.tup_updated , s.tup_deleted , s.conflicts , s.temp_files , s.temp_bytes , s.deadlocks FROM pg_catalog.pg_stat_database s JOIN pg_catalog.pg_database d ON d.datname = s.datname WHERE d.datistemplate = false;\n"}] diff --git a/internal/collector/generated/postgres_5s_metrics.json b/internal/collector/generated/postgres_5s_metrics.json new file mode 100644 index 0000000000..02b79ce2ed --- /dev/null +++ b/internal/collector/generated/postgres_5s_metrics.json @@ -0,0 +1 @@ +[{"metrics":[{"description":"Seconds since the last successful archive operation","metric_name":"ccp_archive_command_status_seconds_since_last_archive","static_attributes":{"server":"localhost:5432"},"value_column":"seconds_since_last_archive","value_type":"double"}],"sql":"SELECT EXTRACT(epoch from (CURRENT_TIMESTAMP - last_archived_time)) AS seconds_since_last_archive FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Number of WAL files that have been successfully archived","metric_name":"ccp_archive_command_status_archived_count","static_attributes":{"server":"localhost:5432"},"value_column":"archived_count"}],"sql":"SELECT archived_count FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Number of failed attempts for archiving WAL files","metric_name":"ccp_archive_command_status_failed_count","static_attributes":{"server":"localhost:5432"},"value_column":"failed_count"}],"sql":"SELECT failed_count FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Seconds since the last recorded failure of the archive_command","metric_name":"ccp_archive_command_status_seconds_since_last_fail","static_attributes":{"server":"localhost:5432"},"value_column":"seconds_since_last_fail"}],"sql":"SELECT CASE\n WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) IS NULL THEN 0\n WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) \u003c 0 THEN 0\n ELSE EXTRACT(epoch from (last_failed_time - last_archived_time))\n END AS seconds_since_last_fail\nFROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Total non-idle 
connections","metric_name":"ccp_connection_stats_active","static_attributes":{"server":"localhost:5432"},"value_column":"active"},{"description":"Total idle connections","metric_name":"ccp_connection_stats_idle","static_attributes":{"server":"localhost:5432"},"value_column":"idle"},{"description":"Total idle in transaction connections","metric_name":"ccp_connection_stats_idle_in_txn","static_attributes":{"server":"localhost:5432"},"value_column":"idle_in_txn"},{"description":"Value of max_connections for the monitored database","metric_name":"ccp_connection_stats_max_blocked_query_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_blocked_query_time","value_type":"double"},{"description":"Value of max_connections for the monitored database","metric_name":"ccp_connection_stats_max_connections","static_attributes":{"server":"localhost:5432"},"value_column":"max_connections"},{"description":"Length of time in seconds of the longest idle in transaction session","metric_name":"ccp_connection_stats_max_idle_in_txn_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_idle_in_txn_time","value_type":"double"},{"description":"Length of time in seconds of the longest running query","metric_name":"ccp_connection_stats_max_query_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_query_time","value_type":"double"},{"description":"Total idle and non-idle connections","metric_name":"ccp_connection_stats_total","static_attributes":{"server":"localhost:5432"},"value_column":"total"}],"sql":"SELECT ((total - idle) - idle_in_txn) as active\n , total\n , idle\n , idle_in_txn\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - state_change))),0) FROM pg_catalog.pg_stat_activity WHERE state = 'idle in transaction') AS max_idle_in_txn_time\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND state \u003c\u003e 'idle' ) AS max_query_time\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND wait_event_type = 'Lock' ) AS max_blocked_query_time\n , max_connections\n FROM (\n SELECT COUNT(*) as total\n , COALESCE(SUM(CASE WHEN state = 'idle' THEN 1 ELSE 0 END),0) AS idle\n , COALESCE(SUM(CASE WHEN state = 'idle in transaction' THEN 1 ELSE 0 END),0) AS idle_in_txn FROM pg_catalog.pg_stat_activity) x\n JOIN (SELECT setting::float AS max_connections FROM pg_settings WHERE name = 'max_connections') xx ON (true);\n"},{"metrics":[{"attribute_columns":["dbname"],"description":"Total number of checksum failures on this database","metric_name":"ccp_data_checksum_failure_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"},{"attribute_columns":["dbname"],"description":"Time interval in seconds since the last checksum failure was encountered","metric_name":"ccp_data_checksum_failure_time_since_last_failure_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"time_since_last_failure_seconds","value_type":"double"}],"sql":"SELECT datname AS dbname , checksum_failures AS count , coalesce(extract(epoch from (clock_timestamp() - checksum_last_failure)), 0) AS time_since_last_failure_seconds FROM pg_catalog.pg_stat_database WHERE pg_stat_database.datname IS NOT NULL;\n"},{"metrics":[{"attribute_columns":["dbname","mode"],"description":"Return value of 1 means database is in recovery. 
Otherwise 2 it is a primary.","metric_name":"ccp_locks_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT pg_database.datname as dbname , tmp.mode , COALESCE(count,0) as count FROM (\n VALUES ('accesssharelock'),\n ('rowsharelock'),\n ('rowexclusivelock'),\n ('shareupdateexclusivelock'),\n ('sharelock'),\n ('sharerowexclusivelock'),\n ('exclusivelock'),\n ('accessexclusivelock')\n) AS tmp(mode) CROSS JOIN pg_catalog.pg_database LEFT JOIN\n (SELECT database, lower(mode) AS mode,count(*) AS count\n FROM pg_catalog.pg_locks WHERE database IS NOT NULL\n GROUP BY database, lower(mode)\n) AS tmp2 ON tmp.mode=tmp2.mode and pg_database.oid = tmp2.database;\n"},{"metrics":[{"description":"CPU limit value in milli cores","metric_name":"ccp_nodemx_cpu_limit","static_attributes":{"server":"localhost:5432"},"value_column":"limit"},{"description":"CPU request value in milli cores","metric_name":"ccp_nodemx_cpu_request","static_attributes":{"server":"localhost:5432"},"value_column":"request"}],"sql":"SELECT monitor.kdapi_scalar_bigint('cpu_request') AS request , monitor.kdapi_scalar_bigint('cpu_limit') AS limit\n"},{"metrics":[{"description":"CPU usage in nanoseconds","metric_name":"ccp_nodemx_cpuacct_usage","static_attributes":{"server":"localhost:5432"},"value_column":"usage","value_type":"double"},{"description":"CPU usage snapshot timestamp","metric_name":"ccp_nodemx_cpuacct_usage_ts","static_attributes":{"server":"localhost:5432"},"value_column":"usage_ts","value_type":"double"}],"sql":"SELECT CASE WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('cpuacct.usage')\n ELSE (SELECT val FROM monitor.cgroup_setof_kv('cpu.stat') where key = 'usage_usec') * 1000\n END AS usage,\n extract(epoch from clock_timestamp()) AS usage_ts;\n"},{"metrics":[{"description":"The total available run-time within a period (in microseconds)","metric_name":"ccp_nodemx_cpucfs_period_us","static_attributes":{"server":"localhost:5432"},"value_column":"period_us"},{"description":"The length of a period (in microseconds)","metric_name":"ccp_nodemx_cpucfs_quota_us","static_attributes":{"server":"localhost:5432"},"value_column":"quota_us","value_type":"double"}],"sql":"SELECT\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n monitor.cgroup_scalar_bigint('cpu.cfs_period_us')\n ELSE\n (monitor.cgroup_array_bigint('cpu.max'))[2]\n END AS period_us,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n GREATEST(monitor.cgroup_scalar_bigint('cpu.cfs_quota_us'), 0)\n ELSE\n GREATEST((monitor.cgroup_array_bigint('cpu.max'))[1], 0)\n END AS quota_us;\n"},{"metrics":[{"description":"Number of periods that any thread was runnable","metric_name":"ccp_nodemx_cpustat_nr_periods","static_attributes":{"server":"localhost:5432"},"value_column":"nr_periods","value_type":"double"},{"description":"Number of runnable periods in which the application used its entire quota and was throttled","metric_name":"ccp_nodemx_cpustat_nr_throttled","static_attributes":{"server":"localhost:5432"},"value_column":"nr_throttled"},{"description":"CPU stat snapshot timestamp","metric_name":"ccp_nodemx_cpustat_snap_ts","static_attributes":{"server":"localhost:5432"},"value_column":"snap_ts","value_type":"double"},{"description":"Sum total amount of time individual threads within the monitor.cgroup were throttled","metric_name":"ccp_nodemx_cpustat_throttled_time","static_attributes":{"server":"localhost:5432"},"value_column":"throttled_time","value_type":"double"}],"sql":"WITH d(key, val) AS (select 
key, val from monitor.cgroup_setof_kv('cpu.stat')) SELECT\n (SELECT val FROM d WHERE key='nr_periods') AS nr_periods,\n (SELECT val FROM d WHERE key='nr_throttled') AS nr_throttled,\n (SELECT val FROM d WHERE key='throttled_usec') AS throttled_time,\n extract(epoch from clock_timestamp()) as snap_ts;\n"},{"metrics":[{"attribute_columns":["fs_type","mount_point"],"description":"Available size in bytes","metric_name":"ccp_nodemx_data_disk_available_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"available_bytes","value_type":"double"},{"attribute_columns":["fs_type","mount_point"],"description":"Available file nodes","metric_name":"ccp_nodemx_data_disk_free_file_nodes","static_attributes":{"server":"localhost:5432"},"value_column":"free_file_nodes"},{"attribute_columns":["fs_type","mount_point"],"description":"Size in bytes","metric_name":"ccp_nodemx_data_disk_total_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"total_bytes"},{"attribute_columns":["fs_type","mount_point"],"description":"Total file nodes","metric_name":"ccp_nodemx_data_disk_total_file_nodes","static_attributes":{"server":"localhost:5432"},"value_column":"total_file_nodes"}],"sql":"SELECT mount_point,fs_type,total_bytes,available_bytes,total_file_nodes,free_file_nodes\n FROM monitor.proc_mountinfo() m\n JOIN monitor.fsinfo(m.mount_point) f USING (major_number, minor_number)\n WHERE m.mount_point IN ('/pgdata', '/pgwal') OR\n m.mount_point like '/tablespaces/%'\n"},{"metrics":[{"attribute_columns":["mount_point"],"description":"Total sectors read","metric_name":"ccp_nodemx_disk_activity_sectors_read","static_attributes":{"server":"localhost:5432"},"value_column":"sectors_read"},{"attribute_columns":["mount_point"],"description":"Total sectors written","metric_name":"ccp_nodemx_disk_activity_sectors_written","static_attributes":{"server":"localhost:5432"},"value_column":"sectors_written"}],"sql":"SELECT mount_point,sectors_read,sectors_written\n FROM monitor.proc_mountinfo() m\n JOIN monitor.proc_diskstats() d USING (major_number, minor_number)\n WHERE m.mount_point IN ('/pgdata', '/pgwal') OR\n m.mount_point like '/tablespaces/%';\n"},{"metrics":[{"description":"Total bytes of anonymous and swap cache memory on active LRU list","metric_name":"ccp_nodemx_mem_active_anon","static_attributes":{"server":"localhost:5432"},"value_column":"active_anon","value_type":"double"},{"description":"Total bytes of file-backed memory on active LRU list","metric_name":"ccp_nodemx_mem_active_file","static_attributes":{"server":"localhost:5432"},"value_column":"active_file","value_type":"double"},{"description":"Total bytes of page cache memory","metric_name":"ccp_nodemx_mem_cache","static_attributes":{"server":"localhost:5432"},"value_column":"cache","value_type":"double"},{"description":"Total bytes that are waiting to get written back to the disk","metric_name":"ccp_nodemx_mem_dirty","static_attributes":{"server":"localhost:5432"},"value_column":"dirty"},{"description":"Total bytes of anonymous and swap cache memory on inactive LRU list","metric_name":"ccp_nodemx_mem_inactive_anon","static_attributes":{"server":"localhost:5432"},"value_column":"inactive_anon","value_type":"double"},{"description":"Total bytes of file-backed memory on inactive LRU list","metric_name":"ccp_nodemx_mem_inactive_file","static_attributes":{"server":"localhost:5432"},"value_column":"inactive_file","value_type":"double"},{"description":"Unknown metric from 
ccp_nodemx_mem","metric_name":"ccp_nodemx_mem_kmem_usage_in_byte","static_attributes":{"server":"localhost:5432"},"value_column":"kmem_usage_in_byte"},{"description":"Memory limit value in bytes","metric_name":"ccp_nodemx_mem_limit","static_attributes":{"server":"localhost:5432"},"value_column":"limit"},{"description":"Total bytes of mapped file (includes tmpfs/shmem)","metric_name":"ccp_nodemx_mem_mapped_file","static_attributes":{"server":"localhost:5432"},"value_column":"mapped_file"},{"description":"Memory request value in bytes","metric_name":"ccp_nodemx_mem_request","static_attributes":{"server":"localhost:5432"},"value_column":"request"},{"description":"Total bytes of anonymous and swap cache memory","metric_name":"ccp_nodemx_mem_rss","static_attributes":{"server":"localhost:5432"},"value_column":"rss","value_type":"double"},{"description":"Total bytes of shared memory","metric_name":"ccp_nodemx_mem_shmem","static_attributes":{"server":"localhost:5432"},"value_column":"shmem","value_type":"double"},{"description":"Total usage in bytes","metric_name":"ccp_nodemx_mem_usage_in_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"usage_in_bytes"}],"sql":"WITH d(key, val) as (SELECT key, val FROM monitor.cgroup_setof_kv('memory.stat')) SELECT\n monitor.kdapi_scalar_bigint('mem_request') AS request,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n (CASE WHEN monitor.cgroup_scalar_bigint('memory.limit_in_bytes') = 9223372036854771712 THEN 0 ELSE monitor.cgroup_scalar_bigint('memory.limit_in_bytes') END)\n ELSE\n (CASE WHEN monitor.cgroup_scalar_bigint('memory.max') = 9223372036854775807 THEN 0 ELSE monitor.cgroup_scalar_bigint('memory.max') END)\n END AS limit,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='cache')\n ELSE 0\n END as cache,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='rss')\n ELSE 0\n END as RSS,\n (SELECT val FROM d WHERE key='shmem') as shmem,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='mapped_file')\n ELSE 0\n END as mapped_file,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='dirty')\n ELSE (SELECT val FROM d WHERE key='file_dirty')\n END as dirty,\n (SELECT val FROM d WHERE key='active_anon') as active_anon,\n (SELECT val FROM d WHERE key='inactive_anon') as inactive_anon,\n (SELECT val FROM d WHERE key='active_file') as active_file,\n (SELECT val FROM d WHERE key='inactive_file') as inactive_file,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('memory.usage_in_bytes')\n ELSE monitor.cgroup_scalar_bigint('memory.current')\n END as usage_in_bytes,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('memory.kmem.usage_in_bytes')\n ELSE 0\n END as kmem_usage_in_byte;\n"},{"metrics":[{"attribute_columns":["interface"],"description":"Number of bytes received","metric_name":"ccp_nodemx_network_rx_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"rx_bytes"},{"attribute_columns":["interface"],"description":"Number of packets received","metric_name":"ccp_nodemx_network_rx_packets","static_attributes":{"server":"localhost:5432"},"value_column":"rx_packets"},{"attribute_columns":["interface"],"description":"Number of bytes transmitted","metric_name":"ccp_nodemx_network_tx_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"tx_bytes"},{"attribute_columns":["interface"],"description":"Number of packets 
transmitted","metric_name":"ccp_nodemx_network_tx_packets","static_attributes":{"server":"localhost:5432"},"value_column":"tx_packets"}],"sql":"SELECT interface\n ,tx_bytes\n ,tx_packets\n ,rx_bytes\n ,rx_packets from monitor.proc_network_stats()\n"},{"metrics":[{"description":"Total number of database processes","metric_name":"ccp_nodemx_process_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT monitor.cgroup_process_count() as count;\n"},{"metrics":[{"description":"Epoch time when stats were reset","metric_name":"ccp_pg_stat_statements_reset_time","static_attributes":{"server":"localhost:5432"},"value_column":"time"}],"sql":"SELECT monitor.pg_stat_statements_reset_info(-1) as time;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Average query runtime in milliseconds","metric_name":"ccp_pg_stat_statements_top_mean_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"top_mean_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time\n , s.mean_exec_time AS mean_exec_time\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , max(monitor.mean_exec_time) AS top_mean_exec_time_ms\nFROM monitor GROUP BY 1,2,3,4 ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","role"],"description":"Total number of queries run per user/database","metric_name":"ccp_pg_stat_statements_total_calls_count","static_attributes":{"server":"localhost:5432"},"value_column":"calls_count","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total runtime of all queries per user/database","metric_name":"ccp_pg_stat_statements_total_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"exec_time_ms","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total runtime of all queries per user/database","metric_name":"ccp_pg_stat_statements_total_mean_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"mean_exec_time_ms","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total rows returned from all queries per user/database","metric_name":"ccp_pg_stat_statements_total_row_count","static_attributes":{"server":"localhost:5432"},"value_column":"row_count","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.calls\n , s.total_exec_time\n , s.mean_exec_time\n , s.rows\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , sum(calls) AS calls_count\n , sum(total_exec_time) AS exec_time_ms\n , avg(mean_exec_time) AS mean_exec_time_ms\n , sum(rows) AS row_count\nFROM monitor GROUP BY 1,2;\n"},{"metrics":[{"description":"The current version of PostgreSQL that this exporter is running on as a 6 digit integer (######).","metric_name":"ccp_postgresql_version_current","static_attributes":{"server":"localhost:5432"},"value_column":"current"}],"sql":"SELECT current_setting('server_version_num')::int AS current;\n"},{"metrics":[{"description":"Time interval 
in seconds since PostgreSQL database was last restarted.","metric_name":"ccp_postmaster_uptime_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"seconds","value_type":"double"}],"sql":"SELECT extract(epoch from (clock_timestamp() - pg_postmaster_start_time() )) AS seconds;\n"},{"metrics":[{"description":"Time interval in seconds since PostgreSQL database was last restarted.","metric_name":"ccp_replication_lag_size_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"bytes","value_type":"double"}],"sql":"SELECT pg_wal_lsn_diff(sent_lsn, replay_lsn) AS bytes\n FROM pg_catalog.pg_stat_replication;\n"},{"metrics":[{"attribute_columns":["role"],"description":"Length of time since the last WAL file was received and replayed on replica.\nAlways increases, possibly causing false positives if the primary stops writing.\nMonitors for replicas that stop receiving WAL all together.\n","metric_name":"ccp_replication_lag_received_time","static_attributes":{"server":"localhost:5432"},"value_column":"received_time","value_type":"double"},{"attribute_columns":["role"],"description":"Length of time since the last transaction was replayed on replica.\nReturns zero if last WAL received equals last WAL replayed. Avoids\nfalse positives when primary stops writing. Monitors for replicas that\ncannot keep up with primary WAL generation.\n","metric_name":"ccp_replication_lag_replay_time","static_attributes":{"server":"localhost:5432"},"value_column":"replay_time","value_type":"double"}],"sql":"SELECT\n CASE\n WHEN (pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn()) OR (pg_is_in_recovery() = false) THEN 0\n ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER\n END\nAS replay_time , CASE\n WHEN pg_is_in_recovery() = false THEN 0\n ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER\n END\nAS received_time , CASE\n WHEN pg_is_in_recovery() = true THEN 'replica'\n ELSE 'primary'\n END\nAS role;\n"},{"metrics":[{"description":"Number of settings from pg_settings catalog in a pending_restart state","metric_name":"ccp_settings_pending_restart_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT count(*) AS count FROM pg_catalog.pg_settings WHERE pending_restart = true;\n"},{"metrics":[{"description":"Number of buffers allocated","metric_name":"ccp_stat_bgwriter_buffers_alloc","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_alloc"},{"data_type":"sum","description":"Number of buffers written by the background writer","metric_name":"ccp_stat_bgwriter_buffers_clean","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_clean"},{"description":"Number of times the background writer stopped a cleaning scan because it had written too many buffers","metric_name":"ccp_stat_bgwriter_maxwritten_clean","static_attributes":{"server":"localhost:5432"},"value_column":"maxwritten_clean"}],"sql":"SELECT\n buffers_clean\n , maxwritten_clean\n , buffers_alloc\nFROM pg_catalog.pg_stat_bgwriter;\n"},{"metrics":[{"description":"Oldest current transaction ID in cluster","metric_name":"ccp_transaction_wraparound_oldest_current_xid","static_attributes":{"server":"localhost:5432"},"value_column":"oldest_current_xid"},{"description":"Percentage towards emergency autovacuum process 
starting","metric_name":"ccp_transaction_wraparound_percent_towards_emergency_autovac","static_attributes":{"server":"localhost:5432"},"value_column":"percent_towards_emergency_autovac"},{"description":"Percentage towards transaction ID wraparound","metric_name":"ccp_transaction_wraparound_percent_towards_wraparound","static_attributes":{"server":"localhost:5432"},"value_column":"percent_towards_wraparound"}],"sql":"WITH max_age AS (\n SELECT 2000000000 as max_old_xid\n , setting AS autovacuum_freeze_max_age\n FROM pg_catalog.pg_settings\n WHERE name = 'autovacuum_freeze_max_age')\n, per_database_stats AS (\n SELECT datname\n , m.max_old_xid::int\n , m.autovacuum_freeze_max_age::int\n , age(d.datfrozenxid) AS oldest_current_xid\n FROM pg_catalog.pg_database d\n JOIN max_age m ON (true)\n WHERE d.datallowconn)\nSELECT max(oldest_current_xid) AS oldest_current_xid , max(ROUND(100*(oldest_current_xid/max_old_xid::float))) AS percent_towards_wraparound , max(ROUND(100*(oldest_current_xid/autovacuum_freeze_max_age::float))) AS percent_towards_emergency_autovac FROM per_database_stats;\n"},{"metrics":[{"description":"Current size in bytes of the WAL directory","metric_name":"ccp_wal_activity_total_size_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"total_size_bytes"}],"sql":"SELECT last_5_min_size_bytes,\n (SELECT COALESCE(sum(size),0) FROM pg_catalog.pg_ls_waldir()) AS total_size_bytes\n FROM (SELECT COALESCE(sum(size),0) AS last_5_min_size_bytes FROM pg_catalog.pg_ls_waldir() WHERE modification \u003e CURRENT_TIMESTAMP - '5 minutes'::interval) x;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Epoch time when stats were reset","metric_name":"ccp_pg_stat_statements_top_max_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"max_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time_ms\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , max_exec_time_ms\n , records\nFROM monitor ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Total time spent in the statement in milliseconds","metric_name":"ccp_pg_stat_statements_top_total_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"total_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time_ms\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , total_exec_time_ms\n , records\nFROM monitor ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Total amount of WAL generated by the statement in 
bytes","metric_name":"ccp_pg_stat_statements_top_wal_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"bytes","value_type":"double"},{"attribute_columns":["dbname","query","queryid","role"],"description":"Total number of WAL full page images generated by the statement","metric_name":"ccp_pg_stat_statements_top_wal_fpi","static_attributes":{"server":"localhost:5432"},"value_column":"fpi","value_type":"double"},{"attribute_columns":["dbname","query","queryid","role"],"description":"Total number of WAL records generated by the statement","metric_name":"ccp_pg_stat_statements_top_wal_records","static_attributes":{"server":"localhost:5432"},"value_column":"records","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time\n , s.mean_exec_time AS mean_exec_time\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , query\n , queryid\n , records\n , fpi\n , bytes\nFROM monitor ORDER BY bytes DESC LIMIT 20;\n"}] diff --git a/internal/collector/gte_pg16_metrics.yaml b/internal/collector/gte_pg16_metrics.yaml new file mode 100644 index 0000000000..e5aeb7194e --- /dev/null +++ b/internal/collector/gte_pg16_metrics.yaml @@ -0,0 +1,126 @@ +# This list of queries configures an OTel SQL Query Receiver to read pgMonitor +# metrics from Postgres. +# +# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/-/receiver/sqlqueryreceiver#metrics-queries +# https://github.com/CrunchyData/pgmonitor/blob/development/sql_exporter/common/crunchy_global_collector.yml + +# NOTE: Some of the columns below can return NULL values, for which sqlqueryreceiver will warn. +# Those columns are idx_scan and idx_tup_fetch. 
+ - sql: > + SELECT + current_database() as dbname + , p.schemaname + , p.relname + , p.seq_scan + , p.seq_tup_read + , COALESCE(p.idx_scan, 0) AS idx_scan + , COALESCE(p.idx_tup_fetch, 0) as idx_tup_fetch + , p.n_tup_ins + , p.n_tup_upd + , p.n_tup_del + , p.n_tup_hot_upd + , p.n_tup_newpage_upd + , p.n_live_tup + , p.n_dead_tup + , p.vacuum_count + , p.autovacuum_count + , p.analyze_count + , p.autoanalyze_count + FROM pg_catalog.pg_stat_user_tables p; + metrics: + - metric_name: ccp_stat_user_tables_analyze_count + data_type: sum + value_column: analyze_count + description: Number of times this table has been manually analyzed + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_autoanalyze_count + data_type: sum + value_column: autoanalyze_count + description: Number of times this table has been analyzed by the autovacuum daemon + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_autovacuum_count + data_type: sum + value_column: autovacuum_count + description: Number of times this table has been vacuumed by the autovacuum daemon + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_idx_scan + data_type: sum + value_column: idx_scan + description: Number of index scans initiated on this table + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_idx_tup_fetch + data_type: sum + value_column: idx_tup_fetch + description: Number of live rows fetched by index scans + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_dead_tup + value_column: n_dead_tup + description: Estimated number of dead rows + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_live_tup + value_column: n_live_tup + description: Estimated number of live rows + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_tup_del + data_type: sum + value_column: n_tup_del + description: Number of rows deleted + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_tup_hot_upd + data_type: sum + value_column: n_tup_hot_upd + description: Number of rows HOT updated (i.e., with no separate index update required) + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_tup_ins + data_type: sum + value_column: n_tup_ins + description: Number of rows inserted + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_tup_upd + data_type: sum + value_column: n_tup_upd + description: Number of rows updated + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_seq_scan + data_type: sum + value_column: seq_scan + description: Number of sequential scans initiated on this table + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: 
"localhost:5432" + - metric_name: ccp_stat_user_tables_seq_tup_read + data_type: sum + value_column: seq_tup_read + description: Number of live rows fetched by sequential scans + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_vacuum_count + data_type: sum + value_column: vacuum_count + description: Number of times this table has been manually vacuumed (not counting VACUUM FULL) + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" diff --git a/internal/collector/gte_pg17_metrics.yaml b/internal/collector/gte_pg17_metrics.yaml new file mode 100644 index 0000000000..c985ec8e4f --- /dev/null +++ b/internal/collector/gte_pg17_metrics.yaml @@ -0,0 +1,71 @@ +# This list of queries configures an OTel SQL Query Receiver to read pgMonitor +# metrics from Postgres. +# +# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/-/receiver/sqlqueryreceiver#metrics-queries +# https://github.com/CrunchyData/pgmonitor/blob/development/sql_exporter/common/crunchy_global_collector.yml + + - sql: > + SELECT c.buffers_written + FROM pg_catalog.pg_stat_checkpointer c; + metrics: + - metric_name: ccp_stat_bgwriter_buffers_checkpoint + data_type: sum + description: Number of buffers written during checkpoints and restartpoints + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT + s.writes + , s.fsyncs + FROM pg_catalog.pg_stat_io s + WHERE backend_type = 'background writer'; + metrics: + - metric_name: ccp_stat_bgwriter_buffers_backend + value_column: writes + data_type: sum + description: Number of write operations, each of the size specified in op_bytes. + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_buffers_backend_fsync + value_column: fsyncs + data_type: sum + description: Number of fsync calls. These are only tracked in context normal. 
+ static_attributes: + server: "localhost:5432" + + - sql: > + SELECT + c.num_timed + , c.num_requested + , c.write_time + , c.sync_time + , c.buffers_written + FROM pg_catalog.pg_stat_checkpointer c; + metrics: + - metric_name: ccp_stat_bgwriter_checkpoint_sync_time + value_column: sync_time + description: Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in milliseconds + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_checkpoint_write_time + value_column: write_time + value_type: double + description: Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in milliseconds + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_checkpoints_req + value_column: num_requested + description: Number of requested checkpoints that have been performed + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_checkpoints_timed + value_column: num_timed + description: Number of scheduled checkpoints that have been performed + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_checkpointer_buffers_written + description: Number of buffers written during checkpoints and restartpoints + value_column: buffers_written + static_attributes: + server: "localhost:5432" diff --git a/internal/collector/lt_pg16_metrics.yaml b/internal/collector/lt_pg16_metrics.yaml new file mode 100644 index 0000000000..e6dd086497 --- /dev/null +++ b/internal/collector/lt_pg16_metrics.yaml @@ -0,0 +1,134 @@ +# This list of queries configures an OTel SQL Query Receiver to read pgMonitor +# metrics from Postgres. +# +# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/-/receiver/sqlqueryreceiver#metrics-queries +# https://github.com/CrunchyData/pgmonitor/blob/development/sql_exporter/common/crunchy_global_collector.yml + +# NOTE: Some of the columns below can return NULL values, for which sqlqueryreceiver will warn. +# Those columns are idx_scan and idx_tup_fetch. We now use COALESCE to return 0 as a default. 
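+# For example (illustrative only), the tables that would otherwise return NULL here are typically
+# ones with no indexes at all, which can be listed with:
+#   SELECT relname, idx_scan, idx_tup_fetch FROM pg_catalog.pg_stat_user_tables WHERE idx_scan IS NULL;
+# hence the COALESCE(..., 0) wrappers in the query below.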
+ - sql: > + SELECT + current_database() as dbname + , p.schemaname + , p.relname + , p.seq_scan + , p.seq_tup_read + , COALESCE(p.idx_scan, 0) AS idx_scan + , COALESCE(p.idx_tup_fetch, 0) as idx_tup_fetch + , p.n_tup_ins + , p.n_tup_upd + , p.n_tup_del + , p.n_tup_hot_upd + , 0::bigint AS n_tup_newpage_upd + , p.n_live_tup + , p.n_dead_tup + , p.vacuum_count + , p.autovacuum_count + , p.analyze_count + , p.autoanalyze_count + FROM pg_catalog.pg_stat_user_tables p; + metrics: + - metric_name: ccp_stat_user_tables_analyze_count + data_type: sum + value_column: analyze_count + description: Number of times this table has been manually analyzed + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_autoanalyze_count + data_type: sum + value_column: autoanalyze_count + description: Number of times this table has been analyzed by the autovacuum daemon + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_autovacuum_count + data_type: sum + value_column: autovacuum_count + description: Number of times this table has been vacuumed by the autovacuum daemon + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_idx_scan + data_type: sum + value_column: idx_scan + description: Number of index scans initiated on this table + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_idx_tup_fetch + data_type: sum + value_column: idx_tup_fetch + description: Number of live rows fetched by index scans + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_dead_tup + value_column: n_dead_tup + description: Estimated number of dead rows + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + # FIXME: This metric returns 0, when the query returns 1 for relname="pgbackrest_info",schemaname="pg_temp_33". + # The issue doesn't occur with gte_pg16. + - metric_name: ccp_stat_user_tables_n_live_tup + value_column: n_live_tup + description: Estimated number of live rows + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_tup_del + data_type: sum + value_column: n_tup_del + description: Number of rows deleted + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_tup_hot_upd + data_type: sum + value_column: n_tup_hot_upd + description: Number of rows HOT updated (i.e., with no separate index update required) + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + # FIXME: This metric returns 0, when the query returns 1 for relname="pgbackrest_info",schemaname="pg_temp_33". + # The issue doesn't occur with gte_pg16. 
+ - metric_name: ccp_stat_user_tables_n_tup_ins + data_type: sum + value_column: n_tup_ins + description: Number of rows inserted + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_n_tup_upd + data_type: sum + value_column: n_tup_upd + description: Number of rows updated + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + # FIXME: This metric returns 0, when the query returns 1 for relname="pgbackrest_info",schemaname="pg_temp_33". + # The issue doesn't occur with gte_pg16. + - metric_name: ccp_stat_user_tables_seq_scan + data_type: sum + value_column: seq_scan + description: Number of sequential scans initiated on this table + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + # FIXME: This metric returns 0, when the query returns 1 for relname="pgbackrest_info",schemaname="pg_temp_33". + # The issue doesn't occur with gte_pg16. + - metric_name: ccp_stat_user_tables_seq_tup_read + data_type: sum + value_column: seq_tup_read + description: Number of live rows fetched by sequential scans + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_user_tables_vacuum_count + data_type: sum + value_column: vacuum_count + description: Number of times this table has been manually vacuumed (not counting VACUUM FULL) + attribute_columns: ["dbname", "relname", "schemaname"] + static_attributes: + server: "localhost:5432" diff --git a/internal/collector/lt_pg17_metrics.yaml b/internal/collector/lt_pg17_metrics.yaml new file mode 100644 index 0000000000..330ff7d798 --- /dev/null +++ b/internal/collector/lt_pg17_metrics.yaml @@ -0,0 +1,71 @@ +# This list of queries configures an OTel SQL Query Receiver to read pgMonitor +# metrics from Postgres. +# +# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/-/receiver/sqlqueryreceiver#metrics-queries +# https://github.com/CrunchyData/pgmonitor/blob/development/sql_exporter/common/crunchy_global_collector.yml + + - sql: > + SELECT c.buffers_checkpoint AS buffers_written + FROM pg_catalog.pg_stat_bgwriter c; + metrics: + - metric_name: ccp_stat_bgwriter_buffers_checkpoint + value_column: buffers_written + data_type: sum + description: Number of buffers written during checkpoints and restartpoints + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT + s.buffers_backend AS writes + , s.buffers_backend_fsync AS fsyncs + FROM pg_catalog.pg_stat_bgwriter s; + metrics: + - metric_name: ccp_stat_bgwriter_buffers_backend + value_column: writes + data_type: sum + description: Number of write operations, each of the size specified in op_bytes. + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_buffers_backend_fsync + value_column: fsyncs + data_type: sum + description: Number of fsync calls. These are only tracked in context normal. 
+ static_attributes: + server: "localhost:5432" + + - sql: > + SELECT + c.checkpoints_timed AS num_timed + , c.checkpoints_req AS num_requested + , c.checkpoint_write_time AS write_time + , c.checkpoint_sync_time AS sync_time + , c.buffers_checkpoint AS buffers_written + FROM pg_catalog.pg_stat_bgwriter c; + metrics: + - metric_name: ccp_stat_bgwriter_checkpoints_timed + value_column: num_timed + description: Number of scheduled checkpoints that have been performed + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_checkpoints_req + value_column: num_requested + description: Number of requested checkpoints that have been performed + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_checkpoint_write_time + value_column: write_time + value_type: double + description: Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in milliseconds + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_checkpoint_sync_time + value_column: sync_time + description: Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in milliseconds + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_checkpointer_buffers_written + description: Number of buffers written during checkpoints and restartpoints + value_column: buffers_written + static_attributes: + server: "localhost:5432" diff --git a/internal/collector/naming.go b/internal/collector/naming.go index 3dad4205fa..4a414a9bad 100644 --- a/internal/collector/naming.go +++ b/internal/collector/naming.go @@ -9,5 +9,16 @@ const DebugExporter = "debug" const OneSecondBatchProcessor = "batch/1s" const SubSecondBatchProcessor = "batch/200ms" const Prometheus = "prometheus" -const Metrics = "metrics" +const PGBouncerMetrics = "metrics/pgbouncer" +const PostgresMetrics = "metrics/postgres" +const PatroniMetrics = "metrics/patroni" + const SqlQuery = "sqlquery" + +// For slow queries, we'll use pgMonitor's default 5 minute interval. +// https://github.com/CrunchyData/pgmonitor-extension/blob/main/sql/matviews/matviews.sql +const FiveMinuteSqlQuery = "sqlquery/300s" + +// We'll use pgMonitor's Prometheus collection interval for most queries. +// https://github.com/CrunchyData/pgmonitor/blob/development/prometheus/linux/crunchy-prometheus.yml +const FiveSecondSqlQuery = "sqlquery/5s" diff --git a/internal/collector/patroni.go b/internal/collector/patroni.go index 3199d9c0ea..0924167987 100644 --- a/internal/collector/patroni.go +++ b/internal/collector/patroni.go @@ -160,7 +160,7 @@ func EnablePatroniMetrics(ctx context.Context, } // Add Metrics Pipeline - outConfig.Pipelines[Metrics] = Pipeline{ + outConfig.Pipelines[PatroniMetrics] = Pipeline{ Receivers: []ComponentID{Prometheus}, Exporters: []ComponentID{Prometheus}, } diff --git a/internal/collector/pgbackrest_metrics.yaml b/internal/collector/pgbackrest_metrics.yaml new file mode 100644 index 0000000000..cb5dbba5f8 --- /dev/null +++ b/internal/collector/pgbackrest_metrics.yaml @@ -0,0 +1,169 @@ + # FIXME: The repo key is obtained inelegantly. + # The query below runs pgbackrest info and parses the output. + # The --stanza argument matches DefaultStanzaName, defined in internal/pgbackrest/config.go. 
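+ # To inspect the raw JSON that this query parses, roughly the same command can be run by hand inside
+ # the database container (illustrative; flags other than --stanza are omitted here):
+ #   pgbackrest info --output=json --stanza=db
+ # The COPY ... FROM PROGRAM below wraps that output in form-feed-quoted CSV so the whole document is
+ # loaded into the temporary table as a single json value.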
+ - sql: | + DROP TABLE IF EXISTS pgbackrest_info; + CREATE TEMPORARY TABLE pgbackrest_info (data json); + + COPY pgbackrest_info (data) + FROM PROGRAM 'export LC_ALL=C && printf "\f" && pgbackrest info --log-level-console=info --log-level-stderr=warn --output=json --stanza=db && printf "\f"' + WITH (FORMAT csv, HEADER false, QUOTE E'\f'); + + WITH + all_backups (data) AS ( + SELECT jsonb_array_elements(to_jsonb(data)) FROM pgbackrest_info + ), + stanza_backups (stanza, backup) AS ( + SELECT data->>'name', jsonb_array_elements(data->'backup') FROM all_backups + ), + ordered_backups (stanza, backup, seq_oldest, seq_newest) AS ( + SELECT stanza, backup, + ROW_NUMBER() OVER ( + PARTITION BY stanza, backup->'database'->>'repo-key', backup->>'type' + ORDER BY backup->'timestamp'->>'start' ASC, backup->'timestamp'->>'stop' ASC + ), + ROW_NUMBER() OVER ( + PARTITION BY stanza, backup->'database'->>'repo-key', backup->>'type' + ORDER BY backup->'timestamp'->>'start' DESC, backup->'timestamp'->>'stop' DESC + ) + FROM stanza_backups + ), + + ccp_backrest_last_info AS ( + SELECT + stanza, + split_part(backup->'backrest'->>'version', '.', 1) || lpad(split_part(backup->'backrest'->>'version', '.', 2), 2, '0') || lpad(coalesce(nullif(split_part(backup->'backrest'->>'version', '.', 3), ''), '00'), 2, '0') AS backrest_repo_version, + backup->'database'->>'repo-key' AS repo, + backup->>'type' AS backup_type, + backup->'info'->'repository'->>'delta' AS repo_backup_size_bytes, + backup->'info'->'repository'->>'size' AS repo_total_size_bytes, + (backup->'timestamp'->>'stop')::bigint - (backup->'timestamp'->>'start')::bigint AS backup_runtime_seconds, + CASE WHEN backup->>'error' = 'true' THEN 1 ELSE 0 END AS backup_error + FROM ordered_backups + WHERE seq_newest = 1 + ), + + ccp_backrest_oldest_full_backup AS ( + SELECT + stanza, + backup->'database'->>'repo-key' AS repo, + min((backup->'timestamp'->>'stop')::bigint) AS time_seconds + FROM ordered_backups + WHERE seq_oldest = 1 AND backup->>'type' IN ('full') + GROUP BY 1,2 + ), + + ccp_backrest_last_full_backup AS ( + SELECT + stanza, + backup->'database'->>'repo-key' AS repo, + EXTRACT(EPOCH FROM CURRENT_TIMESTAMP)::bigint - max((backup->'timestamp'->>'stop')::bigint) AS time_since_completion_seconds + FROM ordered_backups + WHERE seq_newest = 1 AND backup->>'type' IN ('full') + GROUP BY 1,2 + ), + + ccp_backrest_last_diff_backup AS ( + SELECT + stanza, + backup->'database'->>'repo-key' AS repo, + EXTRACT(EPOCH FROM CURRENT_TIMESTAMP)::bigint - max((backup->'timestamp'->>'stop')::bigint) AS time_since_completion_seconds + FROM ordered_backups + WHERE seq_newest = 1 AND backup->>'type' IN ('full','diff') + GROUP BY 1,2 + ), + + ccp_backrest_last_incr_backup AS ( + SELECT + stanza, + backup->'database'->>'repo-key' AS repo, + EXTRACT(EPOCH FROM CURRENT_TIMESTAMP)::bigint - max((backup->'timestamp'->>'stop')::bigint) AS time_since_completion_seconds + FROM ordered_backups + WHERE seq_newest = 1 AND backup->>'type' IN ('full','diff','incr') + GROUP BY 1,2 + ) + + SELECT + ccp_backrest_last_diff_backup.time_since_completion_seconds as last_diff_backup, + ccp_backrest_last_full_backup.time_since_completion_seconds as last_full_backup, + ccp_backrest_last_incr_backup.time_since_completion_seconds as last_incr_backup, + ccp_backrest_last_info.backrest_repo_version as last_info_backrest_repo_version, + ccp_backrest_last_info.backup_error as last_info_backup_error, + ccp_backrest_last_info.backup_type as backup_type, + ccp_backrest_last_info.backup_runtime_seconds as 
backup_runtime_seconds, + ccp_backrest_last_info.repo_backup_size_bytes as repo_backup_size_bytes, + ccp_backrest_last_info.repo_total_size_bytes as repo_total_size_bytes, + ccp_backrest_oldest_full_backup.time_seconds as oldest_full_backup, + ccp_backrest_last_incr_backup.repo as repo + + FROM + ccp_backrest_last_diff_backup + , ccp_backrest_last_full_backup + , ccp_backrest_last_incr_backup + , ccp_backrest_last_info + , ccp_backrest_oldest_full_backup; + metrics: + - metric_name: ccp_backrest_last_diff_backup_time_since_completion_seconds + description: Seconds since the last completed full or differential backup. Differential is always based off last full. + value_column: last_diff_backup + attribute_columns: ["repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_full_backup_time_since_completion_seconds + description: Seconds since the last completed full backup + value_column: last_full_backup + attribute_columns: ["repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_incr_backup_time_since_completion_seconds + description: | + Seconds since the last completed full, differential or incremental backup. + Incremental is always based off last full or differential. + value_column: last_incr_backup + attribute_columns: ["repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_info_backrest_repo_version + description: pgBackRest version number when this backup was performed + value_column: last_info_backrest_repo_version + attribute_columns: ["backup_type", "repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_info_backup_error + description: An error has been encountered in the backup. Check logs for more information. 
+ value_column: last_info_backup_error + attribute_columns: ["backup_type", "repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_info_backup_runtime_seconds + description: Total runtime in seconds of this backup + value_column: backup_runtime_seconds + attribute_columns: ["backup_type", "repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_info_repo_backup_size_bytes + description: Actual size of only this individual backup in the pgbackrest repository + value_column: repo_backup_size_bytes + attribute_columns: ["backup_type", "repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_info_repo_total_size_bytes + description: Total size of this backup in the pgbackrest repository, including all required previous backups and WAL + value_column: repo_total_size_bytes + attribute_columns: ["backup_type", "repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_oldest_full_backup_time_seconds + description: Seconds since the oldest completed full backup + value_column: oldest_full_backup + attribute_columns: ["repo"] + static_attributes: + server: "localhost:5432" diff --git a/internal/collector/pgbouncer.go b/internal/collector/pgbouncer.go index 610843212b..424683e3af 100644 --- a/internal/collector/pgbouncer.go +++ b/internal/collector/pgbouncer.go @@ -184,7 +184,7 @@ func EnablePgBouncerMetrics(ctx context.Context, config *Config, sqlQueryUsernam } // Add Metrics Pipeline - config.Pipelines[Metrics] = Pipeline{ + config.Pipelines[PGBouncerMetrics] = Pipeline{ Receivers: []ComponentID{SqlQuery}, Exporters: []ComponentID{Prometheus}, } diff --git a/internal/collector/pgbouncer_metrics_queries.yaml b/internal/collector/pgbouncer_metrics_queries.yaml index d1ab237d63..228fef1cc0 100644 --- a/internal/collector/pgbouncer_metrics_queries.yaml +++ b/internal/collector/pgbouncer_metrics_queries.yaml @@ -11,43 +11,45 @@ attribute_columns: ["database", "user", "state", "application_name", "link"] description: "Current waiting time in seconds" + # NOTE: Avoid collecting "host" column because it can be null; the collector will warn against null. + # The host column should always point either to pgBouncer's virtual database (the null case) or to the primary. 
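+ # For example (illustrative; connection options depend on the deployment), the built-in "pgbouncer"
+ # virtual database reports a NULL host, which is what would trigger that warning:
+ #   psql -p 5432 -d pgbouncer -c 'SHOW DATABASES'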
- sql: "SHOW DATABASES" metrics: - metric_name: ccp_pgbouncer_databases_pool_size value_column: pool_size - attribute_columns: ["name", "host", "port", "database", "force_user", "pool_mode"] + attribute_columns: ["name", "port", "database", "force_user", "pool_mode"] description: "Maximum number of server connections" - metric_name: ccp_pgbouncer_databases_min_pool_size value_column: min_pool_size - attribute_columns: ["name", "host", "port", "database", "force_user", "pool_mode"] + attribute_columns: ["name", "port", "database", "force_user", "pool_mode"] description: "Minimum number of server connections" - metric_name: ccp_pgbouncer_databases_reserve_pool value_column: reserve_pool - attribute_columns: ["name", "host", "port", "database", "force_user", "pool_mode"] + attribute_columns: ["name", "port", "database", "force_user", "pool_mode"] description: "Maximum number of additional connections for this database" - metric_name: ccp_pgbouncer_databases_max_connections value_column: max_connections - attribute_columns: ["name", "host", "port", "database", "force_user", "pool_mode"] + attribute_columns: ["name", "port", "database", "force_user", "pool_mode"] description: >- Maximum number of allowed connections for this database, as set by max_db_connections, either globally or per database - metric_name: ccp_pgbouncer_databases_current_connections value_column: current_connections - attribute_columns: ["name", "host", "port", "database", "force_user", "pool_mode"] + attribute_columns: ["name", "port", "database", "force_user", "pool_mode"] description: "Current number of connections for this database" - metric_name: ccp_pgbouncer_databases_paused value_column: paused - attribute_columns: ["name", "host", "port", "database", "force_user", "pool_mode"] + attribute_columns: ["name", "port", "database", "force_user", "pool_mode"] description: "1 if this database is currently paused, else 0" - metric_name: ccp_pgbouncer_databases_disabled value_column: disabled - attribute_columns: ["name", "host", "port", "database", "force_user", "pool_mode"] + attribute_columns: ["name", "port", "database", "force_user", "pool_mode"] description: "1 if this database is currently disabled, else 0" - sql: "SHOW LISTS" diff --git a/internal/collector/postgres.go b/internal/collector/postgres.go index 544f0e9feb..416c27ecda 100644 --- a/internal/collector/postgres.go +++ b/internal/collector/postgres.go @@ -23,8 +23,9 @@ func NewConfigForPostgresPod(ctx context.Context, ) *Config { config := NewConfig(inCluster.Spec.Instrumentation) - EnablePatroniLogging(ctx, inCluster, config) + EnablePostgresMetrics(ctx, inCluster, config) EnablePatroniMetrics(ctx, inCluster, config) + EnablePatroniLogging(ctx, inCluster, config) EnablePostgresLogging(ctx, inCluster, config, outParameters) return config diff --git a/internal/collector/postgres_5m_metrics.yaml b/internal/collector/postgres_5m_metrics.yaml new file mode 100644 index 0000000000..9f5c3212dc --- /dev/null +++ b/internal/collector/postgres_5m_metrics.yaml @@ -0,0 +1,143 @@ +# This list of queries configures an OTel SQL Query Receiver to read pgMonitor +# metrics from Postgres. 
+# +# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/-/receiver/sqlqueryreceiver#metrics-queries +# https://github.com/CrunchyData/pgmonitor/blob/development/sql_exporter/common/crunchy_global_collector.yml + - sql: > + SELECT datname as dbname + , pg_database_size(datname) as bytes + FROM pg_catalog.pg_database + WHERE datistemplate = false; + metrics: + - metric_name: ccp_database_size_bytes + value_column: bytes + description: Database size in bytes + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + + # Returns count of sequences that have used up 75% of what's available. + # https://github.com/CrunchyData/pgmonitor-extension/blob/main/sql/functions/functions.sql#L67 + # NOTE: Postgres 13 requires an alias, x below, where PG 17 doesn't. + - sql: > + SELECT count(*) AS count + FROM ( + SELECT CEIL((s.max_value-min_value::NUMERIC+1)/s.increment_by::NUMERIC) AS slots + , CEIL((COALESCE(s.last_value,s.min_value)-s.min_value::NUMERIC+1)/s.increment_by::NUMERIC) AS used + FROM pg_catalog.pg_sequences s + ) x + WHERE (ROUND(used/slots*100)::int) > 75; + metrics: + - metric_name: ccp_sequence_exhaustion_count + value_column: count + description: | + Count of sequences that have reached greater than or equal to 75% of their max available numbers. + Function monitor.sequence_status() can provide more details if run directly on system. + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT s.datname AS dbname + , s.xact_commit + , s.xact_rollback + , s.blks_read + , s.blks_hit + , s.tup_returned + , s.tup_fetched + , s.tup_inserted + , s.tup_updated + , s.tup_deleted + , s.conflicts + , s.temp_files + , s.temp_bytes + , s.deadlocks + FROM pg_catalog.pg_stat_database s + JOIN pg_catalog.pg_database d ON d.datname = s.datname + WHERE d.datistemplate = false; + metrics: + - metric_name: ccp_stat_database_blks_hit + value_column: blks_hit + description: Number of times disk blocks were found already in the buffer cache, so that a read was not necessary + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_database_blks_read + value_column: blks_read + description: Number of disk blocks read in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_database_conflicts + value_column: conflicts + description: Number of queries canceled due to conflicts with recovery in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_database_deadlocks + value_column: deadlocks + description: Number of deadlocks detected in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_database_temp_bytes + value_column: temp_bytes + description: Total amount of data written to temporary files by queries in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_database_temp_files + value_column: temp_files + description: Number of temporary files created by queries in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + + - metric_name: ccp_stat_database_tup_deleted + value_column: tup_deleted + description: Number of rows deleted by queries in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + + - metric_name: ccp_stat_database_tup_fetched + value_column: tup_fetched +
description: Number of rows fetched by queries in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + + - metric_name: ccp_stat_database_tup_inserted + value_column: tup_inserted + description: Number of rows inserted by queries in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + + - metric_name: ccp_stat_database_tup_returned + value_column: tup_returned + description: Number of rows returned by queries in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + + - metric_name: ccp_stat_database_tup_updated + value_column: tup_updated + description: Number of rows updated by queries in this database + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + + - metric_name: ccp_stat_database_xact_commit + value_column: xact_commit + description: Number of transactions in this database that have been committed + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + + - metric_name: ccp_stat_database_xact_rollback + value_column: xact_rollback + description: Number of transactions in this database that have been rolled back + attribute_columns: ["dbname"] + static_attributes: + server: "localhost:5432" + diff --git a/internal/collector/postgres_5s_metrics.yaml b/internal/collector/postgres_5s_metrics.yaml new file mode 100644 index 0000000000..29a0477343 --- /dev/null +++ b/internal/collector/postgres_5s_metrics.yaml @@ -0,0 +1,842 @@ +# This list of queries configures an OTel SQL Query Receiver to read pgMonitor +# metrics from Postgres. +# +# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/-/receiver/sqlqueryreceiver#metrics-queries +# https://github.com/CrunchyData/pgmonitor/blob/development/sql_exporter/common/crunchy_global_collector.yml +# + - sql: > + SELECT + EXTRACT(epoch from (CURRENT_TIMESTAMP - last_archived_time)) AS seconds_since_last_archive + FROM pg_catalog.pg_stat_archiver + metrics: + - metric_name: ccp_archive_command_status_seconds_since_last_archive + value_column: seconds_since_last_archive + value_type: double + description: Seconds since the last successful archive operation + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT archived_count + FROM pg_catalog.pg_stat_archiver + metrics: + - metric_name: ccp_archive_command_status_archived_count + value_column: archived_count + description: Number of WAL files that have been successfully archived + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT failed_count + FROM pg_catalog.pg_stat_archiver + metrics: + - metric_name: ccp_archive_command_status_failed_count + value_column: failed_count + description: Number of failed attempts for archiving WAL files + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT CASE + WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) IS NULL THEN 0 + WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) < 0 THEN 0 + ELSE EXTRACT(epoch from (last_failed_time - last_archived_time)) + END AS seconds_since_last_fail + FROM pg_catalog.pg_stat_archiver + + metrics: + - metric_name: ccp_archive_command_status_seconds_since_last_fail + value_column: seconds_since_last_fail + description: Seconds since the last recorded failure of the archive_command + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT ((total - idle) - idle_in_txn) as active + , total + , idle + , idle_in_txn + , (SELECT 
COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - state_change))),0) FROM pg_catalog.pg_stat_activity WHERE state = 'idle in transaction') AS max_idle_in_txn_time + , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND state <> 'idle' ) AS max_query_time + , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND wait_event_type = 'Lock' ) AS max_blocked_query_time + , max_connections + FROM ( + SELECT COUNT(*) as total + , COALESCE(SUM(CASE WHEN state = 'idle' THEN 1 ELSE 0 END),0) AS idle + , COALESCE(SUM(CASE WHEN state = 'idle in transaction' THEN 1 ELSE 0 END),0) AS idle_in_txn FROM pg_catalog.pg_stat_activity) x + JOIN (SELECT setting::float AS max_connections FROM pg_settings WHERE name = 'max_connections') xx ON (true); + + metrics: + - metric_name: ccp_connection_stats_active + value_column: active + description: Total non-idle connections + static_attributes: + server: "localhost:5432" + - metric_name: ccp_connection_stats_idle + value_column: idle + description: Total idle connections + static_attributes: + server: "localhost:5432" + - metric_name: ccp_connection_stats_idle_in_txn + value_column: idle_in_txn + description: Total idle in transaction connections + static_attributes: + server: "localhost:5432" + - metric_name: ccp_connection_stats_max_blocked_query_time + value_column: max_blocked_query_time + value_type: double + description: Length of time in seconds of the longest running query that is waiting on a lock + static_attributes: + server: "localhost:5432" + - metric_name: ccp_connection_stats_max_connections + value_column: max_connections + description: Value of max_connections for the monitored database + static_attributes: + server: "localhost:5432" + - metric_name: ccp_connection_stats_max_idle_in_txn_time + value_column: max_idle_in_txn_time + value_type: double + description: Length of time in seconds of the longest idle in transaction session + static_attributes: + server: "localhost:5432" + - metric_name: ccp_connection_stats_max_query_time + value_column: max_query_time + value_type: double + description: Length of time in seconds of the longest running query + static_attributes: + server: "localhost:5432" + - metric_name: ccp_connection_stats_total + value_column: total + description: Total idle and non-idle connections + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT datname AS dbname + , checksum_failures AS count + , coalesce(extract(epoch from (clock_timestamp() - checksum_last_failure)), 0) AS time_since_last_failure_seconds + FROM pg_catalog.pg_stat_database + WHERE pg_stat_database.datname IS NOT NULL; + metrics: + - metric_name: ccp_data_checksum_failure_count + value_column: count + attribute_columns: ["dbname"] + description: Total number of checksum failures on this database + static_attributes: + server: "localhost:5432" + - metric_name: ccp_data_checksum_failure_time_since_last_failure_seconds + value_column: time_since_last_failure_seconds + value_type: double + attribute_columns: ["dbname"] + description: Time interval in seconds since the last checksum failure was encountered + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT pg_database.datname as dbname + , tmp.mode + , COALESCE(count,0) as count + FROM + ( + VALUES ('accesssharelock'), + ('rowsharelock'), + ('rowexclusivelock'), + ('shareupdateexclusivelock'), + ('sharelock'), +
('sharerowexclusivelock'), + ('exclusivelock'), + ('accessexclusivelock') + ) AS tmp(mode) CROSS JOIN pg_catalog.pg_database + LEFT JOIN + (SELECT database, lower(mode) AS mode,count(*) AS count + FROM pg_catalog.pg_locks WHERE database IS NOT NULL + GROUP BY database, lower(mode) + ) AS tmp2 + ON tmp.mode=tmp2.mode and pg_database.oid = tmp2.database; + metrics: + - metric_name: ccp_locks_count + value_column: count + attribute_columns: ["dbname", "mode"] + description: Number of locks held, by database and lock mode + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT monitor.kdapi_scalar_bigint('cpu_request') AS request + , monitor.kdapi_scalar_bigint('cpu_limit') AS limit + metrics: + - metric_name: ccp_nodemx_cpu_limit + value_column: limit + description: CPU limit value in milli cores + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_cpu_request + value_column: request + description: CPU request value in milli cores + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT CASE WHEN monitor.cgroup_mode() = 'legacy' + THEN monitor.cgroup_scalar_bigint('cpuacct.usage') + ELSE (SELECT val FROM monitor.cgroup_setof_kv('cpu.stat') where key = 'usage_usec') * 1000 + END AS usage, + extract(epoch from clock_timestamp()) AS usage_ts; + metrics: + - metric_name: ccp_nodemx_cpuacct_usage + value_column: usage + value_type: double + description: CPU usage in nanoseconds + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_cpuacct_usage_ts + value_column: usage_ts + value_type: double + description: CPU usage snapshot timestamp + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT + CASE + WHEN monitor.cgroup_mode() = 'legacy' THEN + monitor.cgroup_scalar_bigint('cpu.cfs_period_us') + ELSE + (monitor.cgroup_array_bigint('cpu.max'))[2] + END AS period_us, + CASE + WHEN monitor.cgroup_mode() = 'legacy' THEN + GREATEST(monitor.cgroup_scalar_bigint('cpu.cfs_quota_us'), 0) + ELSE + GREATEST((monitor.cgroup_array_bigint('cpu.max'))[1], 0) + END AS quota_us; + metrics: + - metric_name: ccp_nodemx_cpucfs_period_us + value_column: period_us + description: The length of a period (in microseconds) + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_cpucfs_quota_us + value_column: quota_us + value_type: double + description: The total available run-time within a period (in microseconds) + static_attributes: + server: "localhost:5432" + + # NOTE: cgroup v2 has throttled_usec, vs. throttled_time.
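+ # For reference (illustrative values), cgroup v2 exposes cpu.stat as key/value pairs such as:
+ #   nr_periods 1320
+ #   nr_throttled 12
+ #   throttled_usec 48231
+ # monitor.cgroup_setof_kv('cpu.stat') returns those pairs, and the query below pivots them into columns.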
+ - sql: > + WITH d(key, val) AS + (select key, val from monitor.cgroup_setof_kv('cpu.stat')) + SELECT + (SELECT val FROM d WHERE key='nr_periods') AS nr_periods, + (SELECT val FROM d WHERE key='nr_throttled') AS nr_throttled, + (SELECT val FROM d WHERE key='throttled_usec') AS throttled_time, + extract(epoch from clock_timestamp()) as snap_ts; + metrics: + - metric_name: ccp_nodemx_cpustat_nr_periods + value_column: nr_periods + value_type: double + description: Number of periods that any thread was runnable + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_cpustat_nr_throttled + value_column: nr_throttled + description: Number of runnable periods in which the application used its entire quota and was throttled + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_cpustat_snap_ts + value_column: snap_ts + value_type: double + description: CPU stat snapshot timestamp + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_cpustat_throttled_time + value_column: throttled_time + value_type: double # TODO: Is this right? + description: Sum total amount of time individual threads within the monitor.cgroup were throttled + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT mount_point,fs_type,total_bytes,available_bytes,total_file_nodes,free_file_nodes + FROM monitor.proc_mountinfo() m + JOIN monitor.fsinfo(m.mount_point) f USING (major_number, minor_number) + WHERE m.mount_point IN ('/pgdata', '/pgwal') OR + m.mount_point like '/tablespaces/%' + metrics: + - metric_name: ccp_nodemx_data_disk_available_bytes + value_column: available_bytes + value_type: double + description: Available size in bytes + attribute_columns: ["fs_type", "mount_point"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_data_disk_free_file_nodes + value_column: free_file_nodes + description: Available file nodes + attribute_columns: ["fs_type", "mount_point"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_data_disk_total_bytes + value_column: total_bytes + description: Size in bytes + attribute_columns: ["fs_type", "mount_point"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_data_disk_total_file_nodes + value_column: total_file_nodes + description: Total file nodes + attribute_columns: ["fs_type", "mount_point"] + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT mount_point,sectors_read,sectors_written + FROM monitor.proc_mountinfo() m + JOIN monitor.proc_diskstats() d USING (major_number, minor_number) + WHERE m.mount_point IN ('/pgdata', '/pgwal') OR + m.mount_point like '/tablespaces/%'; + metrics: + - metric_name: ccp_nodemx_disk_activity_sectors_read + value_column: sectors_read + description: Total sectors read + attribute_columns: ["mount_point"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_disk_activity_sectors_written + value_column: sectors_written + description: Total sectors written + attribute_columns: ["mount_point"] + static_attributes: + server: "localhost:5432" + + - sql: > + WITH d(key, val) as (SELECT key, val FROM monitor.cgroup_setof_kv('memory.stat')) + SELECT + monitor.kdapi_scalar_bigint('mem_request') AS request, + CASE + WHEN monitor.cgroup_mode() = 'legacy' THEN + (CASE WHEN monitor.cgroup_scalar_bigint('memory.limit_in_bytes') = 9223372036854771712 THEN 0 ELSE monitor.cgroup_scalar_bigint('memory.limit_in_bytes') END) + ELSE + (CASE WHEN 
monitor.cgroup_scalar_bigint('memory.max') = 9223372036854775807 THEN 0 ELSE monitor.cgroup_scalar_bigint('memory.max') END) + END AS limit, + CASE + WHEN monitor.cgroup_mode() = 'legacy' + THEN (SELECT val FROM d WHERE key='cache') + ELSE 0 + END as cache, + CASE + WHEN monitor.cgroup_mode() = 'legacy' + THEN (SELECT val FROM d WHERE key='rss') + ELSE 0 + END as RSS, + (SELECT val FROM d WHERE key='shmem') as shmem, + CASE + WHEN monitor.cgroup_mode() = 'legacy' + THEN (SELECT val FROM d WHERE key='mapped_file') + ELSE 0 + END as mapped_file, + CASE + WHEN monitor.cgroup_mode() = 'legacy' + THEN (SELECT val FROM d WHERE key='dirty') + ELSE (SELECT val FROM d WHERE key='file_dirty') + END as dirty, + (SELECT val FROM d WHERE key='active_anon') as active_anon, + (SELECT val FROM d WHERE key='inactive_anon') as inactive_anon, + (SELECT val FROM d WHERE key='active_file') as active_file, + (SELECT val FROM d WHERE key='inactive_file') as inactive_file, + CASE + WHEN monitor.cgroup_mode() = 'legacy' + THEN monitor.cgroup_scalar_bigint('memory.usage_in_bytes') + ELSE monitor.cgroup_scalar_bigint('memory.current') + END as usage_in_bytes, + CASE + WHEN monitor.cgroup_mode() = 'legacy' + THEN monitor.cgroup_scalar_bigint('memory.kmem.usage_in_bytes') + ELSE 0 + END as kmem_usage_in_byte; + metrics: + - metric_name: ccp_nodemx_mem_active_anon + value_column: active_anon + value_type: double + description: Total bytes of anonymous and swap cache memory on active LRU list + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_active_file + value_column: active_file + value_type: double + description: Total bytes of file-backed memory on active LRU list + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_cache + value_column: cache + value_type: double + description: Total bytes of page cache memory + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_dirty + value_column: dirty + description: Total bytes that are waiting to get written back to the disk + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_inactive_anon + value_column: inactive_anon + value_type: double + description: Total bytes of anonymous and swap cache memory on inactive LRU list + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_inactive_file + value_column: inactive_file + value_type: double + description: Total bytes of file-backed memory on inactive LRU list + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_kmem_usage_in_byte + value_column: kmem_usage_in_byte + description: Unknown metric from ccp_nodemx_mem + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_limit + value_column: limit + description: Memory limit value in bytes + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_mapped_file + value_column: mapped_file + description: Total bytes of mapped file (includes tmpfs/shmem) + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_request + value_column: request + description: Memory request value in bytes + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_rss + value_column: rss + value_type: double + description: Total bytes of anonymous and swap cache memory + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_shmem + value_column: shmem + value_type: double + description: Total bytes of shared memory + 
static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_mem_usage_in_bytes + value_column: usage_in_bytes + description: Total usage in bytes + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT interface + ,tx_bytes + ,tx_packets + ,rx_bytes + ,rx_packets from monitor.proc_network_stats() + metrics: + - metric_name: ccp_nodemx_network_rx_bytes + value_column: rx_bytes + description: Number of bytes received + attribute_columns: ["interface"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_network_rx_packets + value_column: rx_packets + description: Number of packets received + attribute_columns: ["interface"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_network_tx_bytes + value_column: tx_bytes + description: Number of bytes transmitted + attribute_columns: ["interface"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_nodemx_network_tx_packets + value_column: tx_packets + description: Number of packets transmitted + attribute_columns: ["interface"] + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT monitor.cgroup_process_count() as count; + metrics: + - metric_name: ccp_nodemx_process_count + value_column: count + description: Total number of database processes + static_attributes: + server: "localhost:5432" + + # Setting pg_stat_statements_reset_info to -1 means update as often as possible. + - sql: > + SELECT monitor.pg_stat_statements_reset_info(-1) as time; + metrics: + - metric_name: ccp_pg_stat_statements_reset_time + value_column: time + description: Epoch time when stats were reset + static_attributes: + server: "localhost:5432" + + + # This query against pg_stat_statements is compatible with PG 13 and later. + # https://github.com/CrunchyData/pgmonitor-extension/blob/main/sql/functions/functions.sql + # TODO: Double-check the sorting and the attribute values on the below. 
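+ # These queries assume the pg_stat_statements extension is installed in the "public" schema of the
+ # database the receiver connects to. A quick manual check (illustrative):
+ #   SELECT extname, extnamespace::regnamespace FROM pg_catalog.pg_extension WHERE extname = 'pg_stat_statements';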
+ - sql: > + WITH monitor AS ( + SELECT + pg_get_userbyid(s.userid) AS role + , d.datname AS dbname + , s.queryid AS queryid + , btrim(replace(left(s.query, 40), '\n', '')) AS query + , s.calls + , s.total_exec_time AS total_exec_time + , s.max_exec_time AS max_exec_time + , s.mean_exec_time AS mean_exec_time + , s.rows + , s.wal_records AS records + , s.wal_fpi AS fpi + , s.wal_bytes AS bytes + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d ON d.oid = s.dbid + ) + SELECT role + , dbname + , queryid + , query + , max(monitor.mean_exec_time) AS top_mean_exec_time_ms + FROM monitor + GROUP BY 1,2,3,4 + ORDER BY 5 DESC + LIMIT 20; + metrics: + - metric_name: ccp_pg_stat_statements_top_mean_exec_time_ms + value_column: top_mean_exec_time_ms + value_type: double + description: Average query runtime in milliseconds + attribute_columns: ["dbname", "query", "queryid", "role"] + static_attributes: + server: "localhost:5432" + + - sql: > + WITH monitor AS ( + SELECT + pg_get_userbyid(s.userid) AS role + , d.datname AS dbname + , s.calls + , s.total_exec_time + , s.mean_exec_time + , s.rows + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d ON d.oid = s.dbid + ) + SELECT role + , dbname + , sum(calls) AS calls_count + , sum(total_exec_time) AS exec_time_ms + , avg(mean_exec_time) AS mean_exec_time_ms + , sum(rows) AS row_count + FROM monitor + GROUP BY 1,2; + metrics: + - metric_name: ccp_pg_stat_statements_total_calls_count + value_column: calls_count + value_type: double + description: Total number of queries run per user/database + attribute_columns: ["dbname", "role"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_pg_stat_statements_total_exec_time_ms + value_column: exec_time_ms + value_type: double + description: Total runtime of all queries per user/database + attribute_columns: ["dbname", "role"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_pg_stat_statements_total_mean_exec_time_ms + value_column: mean_exec_time_ms + value_type: double + description: Average runtime of all queries per user/database + attribute_columns: ["dbname", "role"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_pg_stat_statements_total_row_count + value_column: row_count + value_type: double + description: Total rows returned from all queries per user/database + attribute_columns: ["dbname", "role"] + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT current_setting('server_version_num')::int AS current; + metrics: + - metric_name: ccp_postgresql_version_current + value_column: current + description: The current version of PostgreSQL that this exporter is running on as a 6 digit integer (######). + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT extract(epoch from (clock_timestamp() - pg_postmaster_start_time() )) AS seconds; + metrics: + - metric_name: ccp_postmaster_uptime_seconds + value_column: seconds + value_type: double + description: Time interval in seconds since PostgreSQL database was last restarted. + static_attributes: + server: "localhost:5432" + + # ccp_replication_lag_size_bytes will return NULL on a replica + - sql: > + SELECT pg_wal_lsn_diff(sent_lsn, replay_lsn) AS bytes + FROM pg_catalog.pg_stat_replication; + metrics: + - metric_name: ccp_replication_lag_size_bytes + value_column: bytes + value_type: double + description: Replication lag in bytes between the WAL sent to and replayed by each connected replica.
+ static_attributes: + server: "localhost:5432" + + - sql: > + SELECT + CASE + WHEN (pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn()) OR (pg_is_in_recovery() = false) THEN 0 + ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER + END + AS replay_time + , CASE + WHEN pg_is_in_recovery() = false THEN 0 + ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER + END + AS received_time + , CASE + WHEN pg_is_in_recovery() = true THEN 'replica' + ELSE 'primary' + END + AS role; + metrics: + - metric_name: ccp_replication_lag_received_time + value_column: received_time + value_type: double + description: | + Length of time since the last WAL file was received and replayed on replica. + Always increases, possibly causing false positives if the primary stops writing. + Monitors for replicas that stop receiving WAL all together. + attribute_columns: ["role"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_replication_lag_replay_time + value_column: replay_time + value_type: double + description: | + Length of time since the last transaction was replayed on replica. + Returns zero if last WAL received equals last WAL replayed. Avoids + false positives when primary stops writing. Monitors for replicas that + cannot keep up with primary WAL generation. + attribute_columns: ["role"] + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT count(*) AS count FROM pg_catalog.pg_settings WHERE pending_restart = true; + metrics: + - metric_name: ccp_settings_pending_restart_count + value_column: count + description: Number of settings from pg_settings catalog in a pending_restart state + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT + buffers_clean + , maxwritten_clean + , buffers_alloc + FROM pg_catalog.pg_stat_bgwriter; + metrics: + - metric_name: ccp_stat_bgwriter_buffers_alloc + value_column: buffers_alloc + description: Number of buffers allocated + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_buffers_clean + value_column: buffers_clean + data_type: sum + description: Number of buffers written by the background writer + static_attributes: + server: "localhost:5432" + - metric_name: ccp_stat_bgwriter_maxwritten_clean + value_column: maxwritten_clean + description: Number of times the background writer stopped a cleaning scan because it had written too many buffers + static_attributes: + server: "localhost:5432" + + - sql: > + WITH max_age AS ( + SELECT 2000000000 as max_old_xid + , setting AS autovacuum_freeze_max_age + FROM pg_catalog.pg_settings + WHERE name = 'autovacuum_freeze_max_age') + , per_database_stats AS ( + SELECT datname + , m.max_old_xid::int + , m.autovacuum_freeze_max_age::int + , age(d.datfrozenxid) AS oldest_current_xid + FROM pg_catalog.pg_database d + JOIN max_age m ON (true) + WHERE d.datallowconn) + SELECT max(oldest_current_xid) AS oldest_current_xid + , max(ROUND(100*(oldest_current_xid/max_old_xid::float))) AS percent_towards_wraparound + , max(ROUND(100*(oldest_current_xid/autovacuum_freeze_max_age::float))) AS percent_towards_emergency_autovac + FROM per_database_stats; + metrics: + - metric_name: ccp_transaction_wraparound_oldest_current_xid + value_column: oldest_current_xid + description: Oldest current transaction ID in cluster + static_attributes: + server: "localhost:5432" + - metric_name: ccp_transaction_wraparound_percent_towards_emergency_autovac + value_column: percent_towards_emergency_autovac + 
description: Percentage towards emergency autovacuum process starting + static_attributes: + server: "localhost:5432" + - metric_name: ccp_transaction_wraparound_percent_towards_wraparound + value_column: percent_towards_wraparound + description: Percentage towards transaction ID wraparound + static_attributes: + server: "localhost:5432" + + - sql: > + SELECT last_5_min_size_bytes, + (SELECT COALESCE(sum(size),0) FROM pg_catalog.pg_ls_waldir()) AS total_size_bytes + FROM (SELECT COALESCE(sum(size),0) AS last_5_min_size_bytes FROM pg_catalog.pg_ls_waldir() WHERE modification > CURRENT_TIMESTAMP - '5 minutes'::interval) x; + metrics: + - metric_name: ccp_wal_activity_total_size_bytes + value_column: total_size_bytes + description: Current size in bytes of the WAL directory + static_attributes: + server: "localhost:5432" + + - sql: > + WITH monitor AS ( + SELECT + pg_get_userbyid(s.userid) AS role + , d.datname AS dbname + , s.queryid AS queryid + , btrim(replace(left(s.query, 40), '\n', '')) AS query + , s.calls + , s.total_exec_time AS total_exec_time + , s.max_exec_time AS max_exec_time_ms + , s.rows + , s.wal_records AS records + , s.wal_fpi AS fpi + , s.wal_bytes AS bytes + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d ON d.oid = s.dbid + ) + SELECT role + , dbname + , queryid + , query + , max_exec_time_ms + , records + FROM monitor + ORDER BY 5 DESC + LIMIT 20; + metrics: + - metric_name: ccp_pg_stat_statements_top_max_exec_time_ms + value_column: max_exec_time_ms + value_type: double + description: Maximum time spent in the statement in milliseconds + attribute_columns: ["dbname", "query", "queryid", "role"] + static_attributes: + server: "localhost:5432" + + - sql: > + WITH monitor AS ( + SELECT + pg_get_userbyid(s.userid) AS role + , d.datname AS dbname + , s.queryid AS queryid + , btrim(replace(left(s.query, 40), '\n', '')) AS query + , s.calls + , s.total_exec_time AS total_exec_time_ms + , s.rows + , s.wal_records AS records + , s.wal_fpi AS fpi + , s.wal_bytes AS bytes + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d ON d.oid = s.dbid + ) + SELECT role + , dbname + , queryid + , query + , total_exec_time_ms + , records + FROM monitor + ORDER BY 5 DESC + LIMIT 20; + metrics: + - metric_name: ccp_pg_stat_statements_top_total_exec_time_ms + value_column: total_exec_time_ms + value_type: double + description: Total time spent in the statement in milliseconds + attribute_columns: ["dbname", "query", "queryid", "role"] + static_attributes: + server: "localhost:5432" + + - sql: > + WITH monitor AS ( + SELECT + pg_get_userbyid(s.userid) AS role + , d.datname AS dbname + , s.queryid AS queryid + , btrim(replace(left(s.query, 40), '\n', '')) AS query + , s.calls + , s.total_exec_time AS total_exec_time + , s.max_exec_time AS max_exec_time + , s.mean_exec_time AS mean_exec_time + , s.rows + , s.wal_records AS records + , s.wal_fpi AS fpi + , s.wal_bytes AS bytes + FROM public.pg_stat_statements s + JOIN pg_catalog.pg_database d ON d.oid = s.dbid + ) + SELECT role + , dbname + , query + , queryid + , records + , fpi + , bytes + FROM monitor + ORDER BY bytes DESC + LIMIT 20; + metrics: + - metric_name: ccp_pg_stat_statements_top_wal_bytes + value_column: bytes + value_type: double + description: Total amount of WAL generated by the statement in bytes + attribute_columns: ["dbname", "query", "queryid", "role"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_pg_stat_statements_top_wal_fpi + value_column: fpi + value_type: double + description: Total 
number of WAL full page images generated by the statement + attribute_columns: ["dbname", "query", "queryid", "role"] + static_attributes: + server: "localhost:5432" + - metric_name: ccp_pg_stat_statements_top_wal_records + value_column: records + value_type: double + description: Total number of WAL records generated by the statement + attribute_columns: ["dbname", "query", "queryid", "role"] + static_attributes: + server: "localhost:5432" + diff --git a/internal/collector/postgres_metrics.go b/internal/collector/postgres_metrics.go new file mode 100644 index 0000000000..d6902c2262 --- /dev/null +++ b/internal/collector/postgres_metrics.go @@ -0,0 +1,110 @@ +// Copyright 2024 - 2025 Crunchy Data Solutions, Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// TODO fold this back into postgres.go once the collector package stabilizes. +package collector + +import ( + "context" + _ "embed" + "encoding/json" + "fmt" + "slices" + + "github.com/crunchydata/postgres-operator/internal/feature" + "github.com/crunchydata/postgres-operator/internal/pgmonitor" + "github.com/crunchydata/postgres-operator/pkg/apis/postgres-operator.crunchydata.com/v1beta1" +) + +// https://pkg.go.dev/embed +// +//go:embed "generated/postgres_5s_metrics.json" +var defaultFiveSecondMetrics json.RawMessage + +//go:embed "generated/postgres_5m_metrics.json" +var fiveMinuteMetrics json.RawMessage + +//go:embed "generated/pgbackrest_metrics.json" +var pgBackRestMetrics json.RawMessage + +//go:embed "generated/gte_pg17_metrics.json" +var gtePG17 json.RawMessage + +//go:embed "generated/lt_pg17_metrics.json" +var ltPG17 json.RawMessage + +//go:embed "generated/gte_pg16_metrics.json" +var gtePG16 json.RawMessage + +//go:embed "generated/lt_pg16_metrics.json" +var ltPG16 json.RawMessage + +func EnablePostgresMetrics(ctx context.Context, inCluster *v1beta1.PostgresCluster, config *Config) { + if feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + var fiveSecondMetrics json.RawMessage + fiveSecondMetrics, _ = appendToJSONArray(defaultFiveSecondMetrics, pgBackRestMetrics) + + if inCluster.Spec.PostgresVersion >= 17 { + fiveSecondMetrics, _ = appendToJSONArray(fiveSecondMetrics, gtePG17) + } else { + fiveSecondMetrics, _ = appendToJSONArray(fiveSecondMetrics, ltPG17) + } + + if inCluster.Spec.PostgresVersion >= 16 { + fiveSecondMetrics, _ = appendToJSONArray(fiveSecondMetrics, gtePG16) + } else { + fiveSecondMetrics, _ = appendToJSONArray(fiveSecondMetrics, ltPG16) + } + // Add Prometheus exporter + config.Exporters[Prometheus] = map[string]any{ + "endpoint": "0.0.0.0:8889", + } + + config.Receivers[FiveSecondSqlQuery] = map[string]any{ + "driver": "postgres", + "datasource": fmt.Sprintf(`host=localhost dbname=postgres port=5432 user=%s password=${env:PGPASSWORD} sslmode=disable`, pgmonitor.MonitoringUser), + "collection_interval": "5s", + // Give Postgres time to finish setup. + "initial_delay": "10s", + "queries": slices.Clone(fiveSecondMetrics), + } + + config.Receivers[FiveMinuteSqlQuery] = map[string]any{ + "driver": "postgres", + "datasource": fmt.Sprintf(`host=localhost dbname=postgres port=5432 user=%s password=${env:PGPASSWORD} sslmode=disable`, pgmonitor.MonitoringUser), + "collection_interval": "300s", + // Give Postgres time to finish setup. 
+ "initial_delay": "10s", + "queries": slices.Clone(fiveMinuteMetrics), + } + // Add Metrics Pipeline + config.Pipelines[PostgresMetrics] = Pipeline{ + Receivers: []ComponentID{FiveSecondSqlQuery, FiveMinuteSqlQuery}, + Exporters: []ComponentID{Prometheus}, + } + } +} + +// appendToJSONArray appends elements of a json.RawMessage containing an array +// to another json.RawMessage containing an array. +func appendToJSONArray(a1, a2 json.RawMessage) (json.RawMessage, error) { + var slc1 []json.RawMessage + if err := json.Unmarshal(a1, &slc1); err != nil { + return nil, err + } + + var slc2 []json.RawMessage + if err := json.Unmarshal(a2, &slc2); err != nil { + return nil, err + } + + mergedSlice := append(slc1, slc2...) + + merged, err := json.Marshal(mergedSlice) + if err != nil { + return nil, err + } + + return merged, nil +} diff --git a/internal/controller/postgrescluster/controller.go b/internal/controller/postgrescluster/controller.go index 5af8ba89ee..8ccf88de00 100644 --- a/internal/controller/postgrescluster/controller.go +++ b/internal/controller/postgrescluster/controller.go @@ -30,6 +30,7 @@ import ( "github.com/crunchydata/postgres-operator/internal/collector" "github.com/crunchydata/postgres-operator/internal/config" "github.com/crunchydata/postgres-operator/internal/controller/runtime" + "github.com/crunchydata/postgres-operator/internal/feature" "github.com/crunchydata/postgres-operator/internal/initialize" "github.com/crunchydata/postgres-operator/internal/kubernetes" "github.com/crunchydata/postgres-operator/internal/logging" @@ -344,7 +345,7 @@ func (r *Reconciler) Reconcile( if err == nil { exporterQueriesConfig, err = r.reconcileExporterQueriesConfig(ctx, cluster) } - if err == nil { + if err == nil && !feature.Enabled(ctx, feature.OpenTelemetryMetrics) { exporterWebConfig, err = r.reconcileExporterWebConfig(ctx, cluster) } if err == nil { diff --git a/internal/controller/postgrescluster/instance.go b/internal/controller/postgrescluster/instance.go index 42e86e62cb..56b414439b 100644 --- a/internal/controller/postgrescluster/instance.go +++ b/internal/controller/postgrescluster/instance.go @@ -1201,16 +1201,32 @@ func (r *Reconciler) reconcileInstance( } if err == nil && - (feature.Enabled(ctx, feature.OpenTelemetryLogs) || feature.Enabled(ctx, feature.OpenTelemetryMetrics)) { + (feature.Enabled(ctx, feature.OpenTelemetryLogs) && !feature.Enabled(ctx, feature.OpenTelemetryMetrics)) { + // TODO: Setting the includeLogrotate argument to false for now. 
This // should be changed when we implement log rotation for postgres collector.AddToPod(ctx, cluster.Spec.Instrumentation, cluster.Spec.ImagePullPolicy, instanceConfigMap, &instance.Spec.Template.Spec, []corev1.VolumeMount{postgres.DataVolumeMount()}, "", false) } - // Add pgMonitor resources to the instance Pod spec - if err == nil { - err = addPGMonitorToInstancePodSpec(ctx, cluster, &instance.Spec.Template, exporterQueriesConfig, exporterWebConfig) + if err == nil && + feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + + monitoringUserSecret := &corev1.Secret{ObjectMeta: naming.MonitoringUserSecret(cluster)} + err := errors.WithStack( + r.Client.Get(ctx, client.ObjectKeyFromObject(monitoringUserSecret), monitoringUserSecret)) + + if client.IgnoreNotFound(err) != nil { + return err + } + + collector.AddToPod(ctx, cluster.Spec.Instrumentation, cluster.Spec.ImagePullPolicy, instanceConfigMap, &instance.Spec.Template.Spec, + []corev1.VolumeMount{postgres.DataVolumeMount()}, string(monitoringUserSecret.Data["password"]), false) + } + + // Add postgres-exporter to the instance Pod spec + if err == nil && !feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + err = addPGExporterToInstancePodSpec(ctx, cluster, &instance.Spec.Template, exporterQueriesConfig, exporterWebConfig) } // add nss_wrapper init container and add nss_wrapper env vars to the database and pgbackrest diff --git a/internal/controller/postgrescluster/metrics_setup.sql b/internal/controller/postgrescluster/metrics_setup.sql new file mode 100644 index 0000000000..b99290fe1c --- /dev/null +++ b/internal/controller/postgrescluster/metrics_setup.sql @@ -0,0 +1,72 @@ +-- +-- Copyright © 2017-2024 Crunchy Data Solutions, Inc. All Rights Reserved. +-- + +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'ccp_monitoring') THEN + CREATE ROLE ccp_monitoring WITH LOGIN; + END IF; + + -- The pgmonitor role is required by the pgnodemx extension in PostgreSQL versions 9.5 and 9.6 + -- and should be removed when upgrading to PostgreSQL 10 and above. 
+ IF EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'pgmonitor') THEN + DROP ROLE pgmonitor; + END IF; +END +$$; + +GRANT pg_monitor to ccp_monitoring; +GRANT pg_execute_server_program TO ccp_monitoring; + +ALTER ROLE ccp_monitoring SET lock_timeout TO '2min'; +ALTER ROLE ccp_monitoring SET jit TO 'off'; + +CREATE SCHEMA IF NOT EXISTS monitor AUTHORIZATION ccp_monitoring; + +DROP TABLE IF EXISTS monitor.pg_stat_statements_reset_info; +-- Table to store last reset time for pg_stat_statements +CREATE TABLE monitor.pg_stat_statements_reset_info( + reset_time timestamptz +); + +DROP FUNCTION IF EXISTS monitor.pg_stat_statements_reset_info(int); +-- Function to reset pg_stat_statements periodically +CREATE FUNCTION monitor.pg_stat_statements_reset_info(p_throttle_minutes integer DEFAULT 1440) + RETURNS bigint + LANGUAGE plpgsql + SECURITY DEFINER + SET search_path TO pg_catalog, pg_temp +AS $function$ +DECLARE + + v_reset_timestamp timestamptz; + v_throttle interval; + +BEGIN + + IF p_throttle_minutes < 0 THEN + RETURN 0; + END IF; + + v_throttle := make_interval(mins := p_throttle_minutes); + + SELECT COALESCE(max(reset_time), '1970-01-01'::timestamptz) INTO v_reset_timestamp FROM monitor.pg_stat_statements_reset_info; + + IF ((CURRENT_TIMESTAMP - v_reset_timestamp) > v_throttle) THEN + -- Ensure table is empty + DELETE FROM monitor.pg_stat_statements_reset_info; + PERFORM pg_stat_statements_reset(); + INSERT INTO monitor.pg_stat_statements_reset_info(reset_time) values (now()); + END IF; + + RETURN (SELECT extract(epoch from reset_time) FROM monitor.pg_stat_statements_reset_info); + +EXCEPTION + WHEN others then + RETURN 0; +END +$function$; + +GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA monitor TO ccp_monitoring; +GRANT ALL ON ALL TABLES IN SCHEMA monitor TO ccp_monitoring; diff --git a/internal/controller/postgrescluster/pgmonitor.go b/internal/controller/postgrescluster/pgmonitor.go index 956a99bffd..23ee1c9bc2 100644 --- a/internal/controller/postgrescluster/pgmonitor.go +++ b/internal/controller/postgrescluster/pgmonitor.go @@ -6,6 +6,7 @@ package postgrescluster import ( "context" + _ "embed" "fmt" "io" "os" @@ -27,6 +28,9 @@ import ( "github.com/crunchydata/postgres-operator/pkg/apis/postgres-operator.crunchydata.com/v1beta1" ) +//go:embed "metrics_setup.sql" +var metricsSetupForOTelCollector string + // If pgMonitor is enabled the pgMonitor sidecar(s) have been added to the // instance pod. reconcilePGMonitor will update the database to // create the necessary objects for the tool to run @@ -75,13 +79,17 @@ func (r *Reconciler) reconcilePGMonitorExporter(ctx context.Context, return err } - // TODO: Revisit how pgbackrest_info.sh is used with pgMonitor. - // pgMonitor queries expect a path to a script that runs pgBackRest - // info and provides json output. In the queries yaml for pgBackRest - // the default path is `/usr/bin/pgbackrest-info.sh`. We update - // the path to point to the script in our database image. - setup = strings.ReplaceAll(string(sql), "/usr/bin/pgbackrest-info.sh", - "/opt/crunchy/bin/postgres/pgbackrest_info.sh") + if feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + setup = metricsSetupForOTelCollector + } else { + // TODO: Revisit how pgbackrest_info.sh is used with pgMonitor. + // pgMonitor queries expect a path to a script that runs pgBackRest + // info and provides json output. In the queries yaml for pgBackRest + // the default path is `/usr/bin/pgbackrest-info.sh`. We update + // the path to point to the script in our database image. 
+ setup = strings.ReplaceAll(string(sql), "/usr/bin/pgbackrest-info.sh", + "/opt/crunchy/bin/postgres/pgbackrest_info.sh") + } for _, containerStatus := range writablePod.Status.ContainerStatuses { if containerStatus.Name == naming.ContainerDatabase { @@ -227,9 +235,9 @@ func (r *Reconciler) reconcileMonitoringSecret( return nil, err } -// addPGMonitorToInstancePodSpec performs the necessary setup to add -// pgMonitor resources on a PodTemplateSpec -func addPGMonitorToInstancePodSpec( +// addPGExporterToInstancePodSpec performs the necessary setup to add +// pgMonitor resources on a PodTemplateSpec for running postgres-exporter. +func addPGExporterToInstancePodSpec( ctx context.Context, cluster *v1beta1.PostgresCluster, template *corev1.PodTemplateSpec, @@ -456,7 +464,7 @@ func (r *Reconciler) reconcileExporterQueriesConfig(ctx context.Context, return nil, err } - if !pgmonitor.ExporterEnabled(cluster) { + if !pgmonitor.ExporterEnabled(cluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) { // We could still have a NotFound error here so check the err. // If no error that means the configmap is found and needs to be deleted if err == nil { From 476452ef92a2ad962bb45a27a32c3594e573a38d Mon Sep 17 00:00:00 2001 From: tony-landreth Date: Wed, 19 Feb 2025 09:31:13 -0500 Subject: [PATCH 02/22] Adds comments on NULL handling --- internal/collector/gte_pg16_metrics.yaml | 3 ++- internal/collector/lt_pg16_metrics.yaml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/internal/collector/gte_pg16_metrics.yaml b/internal/collector/gte_pg16_metrics.yaml index e5aeb7194e..319aad62dc 100644 --- a/internal/collector/gte_pg16_metrics.yaml +++ b/internal/collector/gte_pg16_metrics.yaml @@ -5,7 +5,8 @@ # https://github.com/CrunchyData/pgmonitor/blob/development/sql_exporter/common/crunchy_global_collector.yml # NOTE: Some of the columns below can return NULL values, for which sqlqueryreceiver will warn. -# Those columns are idx_scan and idx_tup_fetch. +# https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/sqlqueryreceiver#null-values +# Those columns are idx_scan and idx_tup_fetch and we avoid NULL by using COALESCE. - sql: > SELECT current_database() as dbname diff --git a/internal/collector/lt_pg16_metrics.yaml b/internal/collector/lt_pg16_metrics.yaml index e6dd086497..ca9fe8a0c8 100644 --- a/internal/collector/lt_pg16_metrics.yaml +++ b/internal/collector/lt_pg16_metrics.yaml @@ -5,7 +5,8 @@ # https://github.com/CrunchyData/pgmonitor/blob/development/sql_exporter/common/crunchy_global_collector.yml # NOTE: Some of the columns below can return NULL values, for which sqlqueryreceiver will warn. -# Those columns are idx_scan and idx_tup_fetch. We now use COALESCE to return 0 as a default. +# https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/sqlqueryreceiver#null-values +# Those columns are idx_scan and idx_tup_fetch and we avoid NULL by using COALESCE. 
- sql: > SELECT current_database() as dbname From ffb544593b45184d68584c06863093e0a96e43ca Mon Sep 17 00:00:00 2001 From: tony-landreth Date: Wed, 19 Feb 2025 09:48:55 -0500 Subject: [PATCH 03/22] Acknowledge NotFound error for monitor user --- internal/controller/postgrescluster/instance.go | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/internal/controller/postgrescluster/instance.go b/internal/controller/postgrescluster/instance.go index 56b414439b..1c32ba8f1d 100644 --- a/internal/controller/postgrescluster/instance.go +++ b/internal/controller/postgrescluster/instance.go @@ -1213,15 +1213,13 @@ func (r *Reconciler) reconcileInstance( feature.Enabled(ctx, feature.OpenTelemetryMetrics) { monitoringUserSecret := &corev1.Secret{ObjectMeta: naming.MonitoringUserSecret(cluster)} - err := errors.WithStack( + err = errors.WithStack( r.Client.Get(ctx, client.ObjectKeyFromObject(monitoringUserSecret), monitoringUserSecret)) - if client.IgnoreNotFound(err) != nil { - return err + if err == nil { + collector.AddToPod(ctx, cluster.Spec.Instrumentation, cluster.Spec.ImagePullPolicy, instanceConfigMap, &instance.Spec.Template.Spec, + []corev1.VolumeMount{postgres.DataVolumeMount()}, string(monitoringUserSecret.Data["password"]), false) } - - collector.AddToPod(ctx, cluster.Spec.Instrumentation, cluster.Spec.ImagePullPolicy, instanceConfigMap, &instance.Spec.Template.Spec, - []corev1.VolumeMount{postgres.DataVolumeMount()}, string(monitoringUserSecret.Data["password"]), false) } // Add postgres-exporter to the instance Pod spec From 04edb2c7b66d4004bb2a896a1f1ccb3229e92b13 Mon Sep 17 00:00:00 2001 From: Tony Landreth <56887169+tony-landreth@users.noreply.github.com> Date: Wed, 19 Feb 2025 09:52:46 -0500 Subject: [PATCH 04/22] Update internal/controller/postgrescluster/metrics_setup.sql Co-authored-by: Benjamin Blattberg --- internal/controller/postgrescluster/metrics_setup.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/controller/postgrescluster/metrics_setup.sql b/internal/controller/postgrescluster/metrics_setup.sql index b99290fe1c..fb22a7b7fd 100644 --- a/internal/controller/postgrescluster/metrics_setup.sql +++ b/internal/controller/postgrescluster/metrics_setup.sql @@ -1,5 +1,5 @@ -- --- Copyright © 2017-2024 Crunchy Data Solutions, Inc. All Rights Reserved. +-- Copyright © 2017-2025 Crunchy Data Solutions, Inc. All Rights Reserved. 
-- DO $$ From b99c5c50bb974ac2a1951ac30f8a770dbda493f6 Mon Sep 17 00:00:00 2001 From: tony-landreth Date: Wed, 19 Feb 2025 11:55:02 -0500 Subject: [PATCH 05/22] Adds provisional ccp_pg_stat_activity_count --- .../generated/postgres_5s_metrics.json | 2 +- internal/collector/postgres_5s_metrics.yaml | 37 +++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/internal/collector/generated/postgres_5s_metrics.json b/internal/collector/generated/postgres_5s_metrics.json index 02b79ce2ed..bc89b821e5 100644 --- a/internal/collector/generated/postgres_5s_metrics.json +++ b/internal/collector/generated/postgres_5s_metrics.json @@ -1 +1 @@ -[{"metrics":[{"description":"Seconds since the last successful archive operation","metric_name":"ccp_archive_command_status_seconds_since_last_archive","static_attributes":{"server":"localhost:5432"},"value_column":"seconds_since_last_archive","value_type":"double"}],"sql":"SELECT EXTRACT(epoch from (CURRENT_TIMESTAMP - last_archived_time)) AS seconds_since_last_archive FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Number of WAL files that have been successfully archived","metric_name":"ccp_archive_command_status_archived_count","static_attributes":{"server":"localhost:5432"},"value_column":"archived_count"}],"sql":"SELECT archived_count FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Number of failed attempts for archiving WAL files","metric_name":"ccp_archive_command_status_failed_count","static_attributes":{"server":"localhost:5432"},"value_column":"failed_count"}],"sql":"SELECT failed_count FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Seconds since the last recorded failure of the archive_command","metric_name":"ccp_archive_command_status_seconds_since_last_fail","static_attributes":{"server":"localhost:5432"},"value_column":"seconds_since_last_fail"}],"sql":"SELECT CASE\n WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) IS NULL THEN 0\n WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) \u003c 0 THEN 0\n ELSE EXTRACT(epoch from (last_failed_time - last_archived_time))\n END AS seconds_since_last_fail\nFROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Total non-idle connections","metric_name":"ccp_connection_stats_active","static_attributes":{"server":"localhost:5432"},"value_column":"active"},{"description":"Total idle connections","metric_name":"ccp_connection_stats_idle","static_attributes":{"server":"localhost:5432"},"value_column":"idle"},{"description":"Total idle in transaction connections","metric_name":"ccp_connection_stats_idle_in_txn","static_attributes":{"server":"localhost:5432"},"value_column":"idle_in_txn"},{"description":"Value of max_connections for the monitored database","metric_name":"ccp_connection_stats_max_blocked_query_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_blocked_query_time","value_type":"double"},{"description":"Value of max_connections for the monitored database","metric_name":"ccp_connection_stats_max_connections","static_attributes":{"server":"localhost:5432"},"value_column":"max_connections"},{"description":"Length of time in seconds of the longest idle in transaction session","metric_name":"ccp_connection_stats_max_idle_in_txn_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_idle_in_txn_time","value_type":"double"},{"description":"Length of time in seconds of the longest running 
query","metric_name":"ccp_connection_stats_max_query_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_query_time","value_type":"double"},{"description":"Total idle and non-idle connections","metric_name":"ccp_connection_stats_total","static_attributes":{"server":"localhost:5432"},"value_column":"total"}],"sql":"SELECT ((total - idle) - idle_in_txn) as active\n , total\n , idle\n , idle_in_txn\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - state_change))),0) FROM pg_catalog.pg_stat_activity WHERE state = 'idle in transaction') AS max_idle_in_txn_time\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND state \u003c\u003e 'idle' ) AS max_query_time\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND wait_event_type = 'Lock' ) AS max_blocked_query_time\n , max_connections\n FROM (\n SELECT COUNT(*) as total\n , COALESCE(SUM(CASE WHEN state = 'idle' THEN 1 ELSE 0 END),0) AS idle\n , COALESCE(SUM(CASE WHEN state = 'idle in transaction' THEN 1 ELSE 0 END),0) AS idle_in_txn FROM pg_catalog.pg_stat_activity) x\n JOIN (SELECT setting::float AS max_connections FROM pg_settings WHERE name = 'max_connections') xx ON (true);\n"},{"metrics":[{"attribute_columns":["dbname"],"description":"Total number of checksum failures on this database","metric_name":"ccp_data_checksum_failure_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"},{"attribute_columns":["dbname"],"description":"Time interval in seconds since the last checksum failure was encountered","metric_name":"ccp_data_checksum_failure_time_since_last_failure_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"time_since_last_failure_seconds","value_type":"double"}],"sql":"SELECT datname AS dbname , checksum_failures AS count , coalesce(extract(epoch from (clock_timestamp() - checksum_last_failure)), 0) AS time_since_last_failure_seconds FROM pg_catalog.pg_stat_database WHERE pg_stat_database.datname IS NOT NULL;\n"},{"metrics":[{"attribute_columns":["dbname","mode"],"description":"Return value of 1 means database is in recovery. 
Otherwise 2 it is a primary.","metric_name":"ccp_locks_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT pg_database.datname as dbname , tmp.mode , COALESCE(count,0) as count FROM (\n VALUES ('accesssharelock'),\n ('rowsharelock'),\n ('rowexclusivelock'),\n ('shareupdateexclusivelock'),\n ('sharelock'),\n ('sharerowexclusivelock'),\n ('exclusivelock'),\n ('accessexclusivelock')\n) AS tmp(mode) CROSS JOIN pg_catalog.pg_database LEFT JOIN\n (SELECT database, lower(mode) AS mode,count(*) AS count\n FROM pg_catalog.pg_locks WHERE database IS NOT NULL\n GROUP BY database, lower(mode)\n) AS tmp2 ON tmp.mode=tmp2.mode and pg_database.oid = tmp2.database;\n"},{"metrics":[{"description":"CPU limit value in milli cores","metric_name":"ccp_nodemx_cpu_limit","static_attributes":{"server":"localhost:5432"},"value_column":"limit"},{"description":"CPU request value in milli cores","metric_name":"ccp_nodemx_cpu_request","static_attributes":{"server":"localhost:5432"},"value_column":"request"}],"sql":"SELECT monitor.kdapi_scalar_bigint('cpu_request') AS request , monitor.kdapi_scalar_bigint('cpu_limit') AS limit\n"},{"metrics":[{"description":"CPU usage in nanoseconds","metric_name":"ccp_nodemx_cpuacct_usage","static_attributes":{"server":"localhost:5432"},"value_column":"usage","value_type":"double"},{"description":"CPU usage snapshot timestamp","metric_name":"ccp_nodemx_cpuacct_usage_ts","static_attributes":{"server":"localhost:5432"},"value_column":"usage_ts","value_type":"double"}],"sql":"SELECT CASE WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('cpuacct.usage')\n ELSE (SELECT val FROM monitor.cgroup_setof_kv('cpu.stat') where key = 'usage_usec') * 1000\n END AS usage,\n extract(epoch from clock_timestamp()) AS usage_ts;\n"},{"metrics":[{"description":"The total available run-time within a period (in microseconds)","metric_name":"ccp_nodemx_cpucfs_period_us","static_attributes":{"server":"localhost:5432"},"value_column":"period_us"},{"description":"The length of a period (in microseconds)","metric_name":"ccp_nodemx_cpucfs_quota_us","static_attributes":{"server":"localhost:5432"},"value_column":"quota_us","value_type":"double"}],"sql":"SELECT\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n monitor.cgroup_scalar_bigint('cpu.cfs_period_us')\n ELSE\n (monitor.cgroup_array_bigint('cpu.max'))[2]\n END AS period_us,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n GREATEST(monitor.cgroup_scalar_bigint('cpu.cfs_quota_us'), 0)\n ELSE\n GREATEST((monitor.cgroup_array_bigint('cpu.max'))[1], 0)\n END AS quota_us;\n"},{"metrics":[{"description":"Number of periods that any thread was runnable","metric_name":"ccp_nodemx_cpustat_nr_periods","static_attributes":{"server":"localhost:5432"},"value_column":"nr_periods","value_type":"double"},{"description":"Number of runnable periods in which the application used its entire quota and was throttled","metric_name":"ccp_nodemx_cpustat_nr_throttled","static_attributes":{"server":"localhost:5432"},"value_column":"nr_throttled"},{"description":"CPU stat snapshot timestamp","metric_name":"ccp_nodemx_cpustat_snap_ts","static_attributes":{"server":"localhost:5432"},"value_column":"snap_ts","value_type":"double"},{"description":"Sum total amount of time individual threads within the monitor.cgroup were throttled","metric_name":"ccp_nodemx_cpustat_throttled_time","static_attributes":{"server":"localhost:5432"},"value_column":"throttled_time","value_type":"double"}],"sql":"WITH d(key, val) AS (select 
key, val from monitor.cgroup_setof_kv('cpu.stat')) SELECT\n (SELECT val FROM d WHERE key='nr_periods') AS nr_periods,\n (SELECT val FROM d WHERE key='nr_throttled') AS nr_throttled,\n (SELECT val FROM d WHERE key='throttled_usec') AS throttled_time,\n extract(epoch from clock_timestamp()) as snap_ts;\n"},{"metrics":[{"attribute_columns":["fs_type","mount_point"],"description":"Available size in bytes","metric_name":"ccp_nodemx_data_disk_available_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"available_bytes","value_type":"double"},{"attribute_columns":["fs_type","mount_point"],"description":"Available file nodes","metric_name":"ccp_nodemx_data_disk_free_file_nodes","static_attributes":{"server":"localhost:5432"},"value_column":"free_file_nodes"},{"attribute_columns":["fs_type","mount_point"],"description":"Size in bytes","metric_name":"ccp_nodemx_data_disk_total_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"total_bytes"},{"attribute_columns":["fs_type","mount_point"],"description":"Total file nodes","metric_name":"ccp_nodemx_data_disk_total_file_nodes","static_attributes":{"server":"localhost:5432"},"value_column":"total_file_nodes"}],"sql":"SELECT mount_point,fs_type,total_bytes,available_bytes,total_file_nodes,free_file_nodes\n FROM monitor.proc_mountinfo() m\n JOIN monitor.fsinfo(m.mount_point) f USING (major_number, minor_number)\n WHERE m.mount_point IN ('/pgdata', '/pgwal') OR\n m.mount_point like '/tablespaces/%'\n"},{"metrics":[{"attribute_columns":["mount_point"],"description":"Total sectors read","metric_name":"ccp_nodemx_disk_activity_sectors_read","static_attributes":{"server":"localhost:5432"},"value_column":"sectors_read"},{"attribute_columns":["mount_point"],"description":"Total sectors written","metric_name":"ccp_nodemx_disk_activity_sectors_written","static_attributes":{"server":"localhost:5432"},"value_column":"sectors_written"}],"sql":"SELECT mount_point,sectors_read,sectors_written\n FROM monitor.proc_mountinfo() m\n JOIN monitor.proc_diskstats() d USING (major_number, minor_number)\n WHERE m.mount_point IN ('/pgdata', '/pgwal') OR\n m.mount_point like '/tablespaces/%';\n"},{"metrics":[{"description":"Total bytes of anonymous and swap cache memory on active LRU list","metric_name":"ccp_nodemx_mem_active_anon","static_attributes":{"server":"localhost:5432"},"value_column":"active_anon","value_type":"double"},{"description":"Total bytes of file-backed memory on active LRU list","metric_name":"ccp_nodemx_mem_active_file","static_attributes":{"server":"localhost:5432"},"value_column":"active_file","value_type":"double"},{"description":"Total bytes of page cache memory","metric_name":"ccp_nodemx_mem_cache","static_attributes":{"server":"localhost:5432"},"value_column":"cache","value_type":"double"},{"description":"Total bytes that are waiting to get written back to the disk","metric_name":"ccp_nodemx_mem_dirty","static_attributes":{"server":"localhost:5432"},"value_column":"dirty"},{"description":"Total bytes of anonymous and swap cache memory on inactive LRU list","metric_name":"ccp_nodemx_mem_inactive_anon","static_attributes":{"server":"localhost:5432"},"value_column":"inactive_anon","value_type":"double"},{"description":"Total bytes of file-backed memory on inactive LRU list","metric_name":"ccp_nodemx_mem_inactive_file","static_attributes":{"server":"localhost:5432"},"value_column":"inactive_file","value_type":"double"},{"description":"Unknown metric from 
ccp_nodemx_mem","metric_name":"ccp_nodemx_mem_kmem_usage_in_byte","static_attributes":{"server":"localhost:5432"},"value_column":"kmem_usage_in_byte"},{"description":"Memory limit value in bytes","metric_name":"ccp_nodemx_mem_limit","static_attributes":{"server":"localhost:5432"},"value_column":"limit"},{"description":"Total bytes of mapped file (includes tmpfs/shmem)","metric_name":"ccp_nodemx_mem_mapped_file","static_attributes":{"server":"localhost:5432"},"value_column":"mapped_file"},{"description":"Memory request value in bytes","metric_name":"ccp_nodemx_mem_request","static_attributes":{"server":"localhost:5432"},"value_column":"request"},{"description":"Total bytes of anonymous and swap cache memory","metric_name":"ccp_nodemx_mem_rss","static_attributes":{"server":"localhost:5432"},"value_column":"rss","value_type":"double"},{"description":"Total bytes of shared memory","metric_name":"ccp_nodemx_mem_shmem","static_attributes":{"server":"localhost:5432"},"value_column":"shmem","value_type":"double"},{"description":"Total usage in bytes","metric_name":"ccp_nodemx_mem_usage_in_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"usage_in_bytes"}],"sql":"WITH d(key, val) as (SELECT key, val FROM monitor.cgroup_setof_kv('memory.stat')) SELECT\n monitor.kdapi_scalar_bigint('mem_request') AS request,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n (CASE WHEN monitor.cgroup_scalar_bigint('memory.limit_in_bytes') = 9223372036854771712 THEN 0 ELSE monitor.cgroup_scalar_bigint('memory.limit_in_bytes') END)\n ELSE\n (CASE WHEN monitor.cgroup_scalar_bigint('memory.max') = 9223372036854775807 THEN 0 ELSE monitor.cgroup_scalar_bigint('memory.max') END)\n END AS limit,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='cache')\n ELSE 0\n END as cache,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='rss')\n ELSE 0\n END as RSS,\n (SELECT val FROM d WHERE key='shmem') as shmem,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='mapped_file')\n ELSE 0\n END as mapped_file,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='dirty')\n ELSE (SELECT val FROM d WHERE key='file_dirty')\n END as dirty,\n (SELECT val FROM d WHERE key='active_anon') as active_anon,\n (SELECT val FROM d WHERE key='inactive_anon') as inactive_anon,\n (SELECT val FROM d WHERE key='active_file') as active_file,\n (SELECT val FROM d WHERE key='inactive_file') as inactive_file,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('memory.usage_in_bytes')\n ELSE monitor.cgroup_scalar_bigint('memory.current')\n END as usage_in_bytes,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('memory.kmem.usage_in_bytes')\n ELSE 0\n END as kmem_usage_in_byte;\n"},{"metrics":[{"attribute_columns":["interface"],"description":"Number of bytes received","metric_name":"ccp_nodemx_network_rx_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"rx_bytes"},{"attribute_columns":["interface"],"description":"Number of packets received","metric_name":"ccp_nodemx_network_rx_packets","static_attributes":{"server":"localhost:5432"},"value_column":"rx_packets"},{"attribute_columns":["interface"],"description":"Number of bytes transmitted","metric_name":"ccp_nodemx_network_tx_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"tx_bytes"},{"attribute_columns":["interface"],"description":"Number of packets 
transmitted","metric_name":"ccp_nodemx_network_tx_packets","static_attributes":{"server":"localhost:5432"},"value_column":"tx_packets"}],"sql":"SELECT interface\n ,tx_bytes\n ,tx_packets\n ,rx_bytes\n ,rx_packets from monitor.proc_network_stats()\n"},{"metrics":[{"description":"Total number of database processes","metric_name":"ccp_nodemx_process_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT monitor.cgroup_process_count() as count;\n"},{"metrics":[{"description":"Epoch time when stats were reset","metric_name":"ccp_pg_stat_statements_reset_time","static_attributes":{"server":"localhost:5432"},"value_column":"time"}],"sql":"SELECT monitor.pg_stat_statements_reset_info(-1) as time;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Average query runtime in milliseconds","metric_name":"ccp_pg_stat_statements_top_mean_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"top_mean_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time\n , s.mean_exec_time AS mean_exec_time\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , max(monitor.mean_exec_time) AS top_mean_exec_time_ms\nFROM monitor GROUP BY 1,2,3,4 ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","role"],"description":"Total number of queries run per user/database","metric_name":"ccp_pg_stat_statements_total_calls_count","static_attributes":{"server":"localhost:5432"},"value_column":"calls_count","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total runtime of all queries per user/database","metric_name":"ccp_pg_stat_statements_total_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"exec_time_ms","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total runtime of all queries per user/database","metric_name":"ccp_pg_stat_statements_total_mean_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"mean_exec_time_ms","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total rows returned from all queries per user/database","metric_name":"ccp_pg_stat_statements_total_row_count","static_attributes":{"server":"localhost:5432"},"value_column":"row_count","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.calls\n , s.total_exec_time\n , s.mean_exec_time\n , s.rows\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , sum(calls) AS calls_count\n , sum(total_exec_time) AS exec_time_ms\n , avg(mean_exec_time) AS mean_exec_time_ms\n , sum(rows) AS row_count\nFROM monitor GROUP BY 1,2;\n"},{"metrics":[{"description":"The current version of PostgreSQL that this exporter is running on as a 6 digit integer (######).","metric_name":"ccp_postgresql_version_current","static_attributes":{"server":"localhost:5432"},"value_column":"current"}],"sql":"SELECT current_setting('server_version_num')::int AS current;\n"},{"metrics":[{"description":"Time interval 
in seconds since PostgreSQL database was last restarted.","metric_name":"ccp_postmaster_uptime_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"seconds","value_type":"double"}],"sql":"SELECT extract(epoch from (clock_timestamp() - pg_postmaster_start_time() )) AS seconds;\n"},{"metrics":[{"description":"Time interval in seconds since PostgreSQL database was last restarted.","metric_name":"ccp_replication_lag_size_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"bytes","value_type":"double"}],"sql":"SELECT pg_wal_lsn_diff(sent_lsn, replay_lsn) AS bytes\n FROM pg_catalog.pg_stat_replication;\n"},{"metrics":[{"attribute_columns":["role"],"description":"Length of time since the last WAL file was received and replayed on replica.\nAlways increases, possibly causing false positives if the primary stops writing.\nMonitors for replicas that stop receiving WAL all together.\n","metric_name":"ccp_replication_lag_received_time","static_attributes":{"server":"localhost:5432"},"value_column":"received_time","value_type":"double"},{"attribute_columns":["role"],"description":"Length of time since the last transaction was replayed on replica.\nReturns zero if last WAL received equals last WAL replayed. Avoids\nfalse positives when primary stops writing. Monitors for replicas that\ncannot keep up with primary WAL generation.\n","metric_name":"ccp_replication_lag_replay_time","static_attributes":{"server":"localhost:5432"},"value_column":"replay_time","value_type":"double"}],"sql":"SELECT\n CASE\n WHEN (pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn()) OR (pg_is_in_recovery() = false) THEN 0\n ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER\n END\nAS replay_time , CASE\n WHEN pg_is_in_recovery() = false THEN 0\n ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER\n END\nAS received_time , CASE\n WHEN pg_is_in_recovery() = true THEN 'replica'\n ELSE 'primary'\n END\nAS role;\n"},{"metrics":[{"description":"Number of settings from pg_settings catalog in a pending_restart state","metric_name":"ccp_settings_pending_restart_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT count(*) AS count FROM pg_catalog.pg_settings WHERE pending_restart = true;\n"},{"metrics":[{"description":"Number of buffers allocated","metric_name":"ccp_stat_bgwriter_buffers_alloc","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_alloc"},{"data_type":"sum","description":"Number of buffers written by the background writer","metric_name":"ccp_stat_bgwriter_buffers_clean","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_clean"},{"description":"Number of times the background writer stopped a cleaning scan because it had written too many buffers","metric_name":"ccp_stat_bgwriter_maxwritten_clean","static_attributes":{"server":"localhost:5432"},"value_column":"maxwritten_clean"}],"sql":"SELECT\n buffers_clean\n , maxwritten_clean\n , buffers_alloc\nFROM pg_catalog.pg_stat_bgwriter;\n"},{"metrics":[{"description":"Oldest current transaction ID in cluster","metric_name":"ccp_transaction_wraparound_oldest_current_xid","static_attributes":{"server":"localhost:5432"},"value_column":"oldest_current_xid"},{"description":"Percentage towards emergency autovacuum process 
starting","metric_name":"ccp_transaction_wraparound_percent_towards_emergency_autovac","static_attributes":{"server":"localhost:5432"},"value_column":"percent_towards_emergency_autovac"},{"description":"Percentage towards transaction ID wraparound","metric_name":"ccp_transaction_wraparound_percent_towards_wraparound","static_attributes":{"server":"localhost:5432"},"value_column":"percent_towards_wraparound"}],"sql":"WITH max_age AS (\n SELECT 2000000000 as max_old_xid\n , setting AS autovacuum_freeze_max_age\n FROM pg_catalog.pg_settings\n WHERE name = 'autovacuum_freeze_max_age')\n, per_database_stats AS (\n SELECT datname\n , m.max_old_xid::int\n , m.autovacuum_freeze_max_age::int\n , age(d.datfrozenxid) AS oldest_current_xid\n FROM pg_catalog.pg_database d\n JOIN max_age m ON (true)\n WHERE d.datallowconn)\nSELECT max(oldest_current_xid) AS oldest_current_xid , max(ROUND(100*(oldest_current_xid/max_old_xid::float))) AS percent_towards_wraparound , max(ROUND(100*(oldest_current_xid/autovacuum_freeze_max_age::float))) AS percent_towards_emergency_autovac FROM per_database_stats;\n"},{"metrics":[{"description":"Current size in bytes of the WAL directory","metric_name":"ccp_wal_activity_total_size_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"total_size_bytes"}],"sql":"SELECT last_5_min_size_bytes,\n (SELECT COALESCE(sum(size),0) FROM pg_catalog.pg_ls_waldir()) AS total_size_bytes\n FROM (SELECT COALESCE(sum(size),0) AS last_5_min_size_bytes FROM pg_catalog.pg_ls_waldir() WHERE modification \u003e CURRENT_TIMESTAMP - '5 minutes'::interval) x;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Epoch time when stats were reset","metric_name":"ccp_pg_stat_statements_top_max_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"max_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time_ms\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , max_exec_time_ms\n , records\nFROM monitor ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Total time spent in the statement in milliseconds","metric_name":"ccp_pg_stat_statements_top_total_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"total_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time_ms\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , total_exec_time_ms\n , records\nFROM monitor ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Total amount of WAL generated by the statement in 
bytes","metric_name":"ccp_pg_stat_statements_top_wal_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"bytes","value_type":"double"},{"attribute_columns":["dbname","query","queryid","role"],"description":"Total number of WAL full page images generated by the statement","metric_name":"ccp_pg_stat_statements_top_wal_fpi","static_attributes":{"server":"localhost:5432"},"value_column":"fpi","value_type":"double"},{"attribute_columns":["dbname","query","queryid","role"],"description":"Total number of WAL records generated by the statement","metric_name":"ccp_pg_stat_statements_top_wal_records","static_attributes":{"server":"localhost:5432"},"value_column":"records","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time\n , s.mean_exec_time AS mean_exec_time\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , query\n , queryid\n , records\n , fpi\n , bytes\nFROM monitor ORDER BY bytes DESC LIMIT 20;\n"}] +[{"metrics":[{"attribute_columns":["application_name","datname","state","usename"],"description":"number of connections in this state","metric_name":"ccp_pg_stat_activity_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT\n pg_database.datname,\n tmp.state,\n tmp2.usename,\n tmp2.application_name,\n COALESCE(count,0) as count,\n COALESCE(max_tx_duration,0) as max_tx_duration\nFROM\n (\n VALUES ('active'),\n ('idle'),\n ('idle in transaction'),\n ('idle in transaction (aborted)'),\n ('fastpath function call'),\n ('disabled')\n ) AS tmp(state) CROSS JOIN pg_database\nLEFT JOIN (\n SELECT\n datname,\n state,\n usename,\n application_name,\n count(*) AS count,\n MAX(EXTRACT(EPOCH FROM now() - xact_start))::float AS max_tx_duration\n FROM pg_stat_activity GROUP BY datname,state,usename,application_name) AS tmp2\n ON tmp.state = tmp2.state AND pg_database.datname = tmp2.datname;\n"},{"metrics":[{"description":"Seconds since the last successful archive operation","metric_name":"ccp_archive_command_status_seconds_since_last_archive","static_attributes":{"server":"localhost:5432"},"value_column":"seconds_since_last_archive","value_type":"double"}],"sql":"SELECT EXTRACT(epoch from (CURRENT_TIMESTAMP - last_archived_time)) AS seconds_since_last_archive FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Number of WAL files that have been successfully archived","metric_name":"ccp_archive_command_status_archived_count","static_attributes":{"server":"localhost:5432"},"value_column":"archived_count"}],"sql":"SELECT archived_count FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Number of failed attempts for archiving WAL files","metric_name":"ccp_archive_command_status_failed_count","static_attributes":{"server":"localhost:5432"},"value_column":"failed_count"}],"sql":"SELECT failed_count FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Seconds since the last recorded failure of the archive_command","metric_name":"ccp_archive_command_status_seconds_since_last_fail","static_attributes":{"server":"localhost:5432"},"value_column":"seconds_since_last_fail"}],"sql":"SELECT CASE\n WHEN EXTRACT(epoch from (last_failed_time - 
last_archived_time)) IS NULL THEN 0\n WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) \u003c 0 THEN 0\n ELSE EXTRACT(epoch from (last_failed_time - last_archived_time))\n END AS seconds_since_last_fail\nFROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Total non-idle connections","metric_name":"ccp_connection_stats_active","static_attributes":{"server":"localhost:5432"},"value_column":"active"},{"description":"Total idle connections","metric_name":"ccp_connection_stats_idle","static_attributes":{"server":"localhost:5432"},"value_column":"idle"},{"description":"Total idle in transaction connections","metric_name":"ccp_connection_stats_idle_in_txn","static_attributes":{"server":"localhost:5432"},"value_column":"idle_in_txn"},{"description":"Value of max_connections for the monitored database","metric_name":"ccp_connection_stats_max_blocked_query_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_blocked_query_time","value_type":"double"},{"description":"Value of max_connections for the monitored database","metric_name":"ccp_connection_stats_max_connections","static_attributes":{"server":"localhost:5432"},"value_column":"max_connections"},{"description":"Length of time in seconds of the longest idle in transaction session","metric_name":"ccp_connection_stats_max_idle_in_txn_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_idle_in_txn_time","value_type":"double"},{"description":"Length of time in seconds of the longest running query","metric_name":"ccp_connection_stats_max_query_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_query_time","value_type":"double"},{"description":"Total idle and non-idle connections","metric_name":"ccp_connection_stats_total","static_attributes":{"server":"localhost:5432"},"value_column":"total"}],"sql":"SELECT ((total - idle) - idle_in_txn) as active\n , total\n , idle\n , idle_in_txn\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - state_change))),0) FROM pg_catalog.pg_stat_activity WHERE state = 'idle in transaction') AS max_idle_in_txn_time\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND state \u003c\u003e 'idle' ) AS max_query_time\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND wait_event_type = 'Lock' ) AS max_blocked_query_time\n , max_connections\n FROM (\n SELECT COUNT(*) as total\n , COALESCE(SUM(CASE WHEN state = 'idle' THEN 1 ELSE 0 END),0) AS idle\n , COALESCE(SUM(CASE WHEN state = 'idle in transaction' THEN 1 ELSE 0 END),0) AS idle_in_txn FROM pg_catalog.pg_stat_activity) x\n JOIN (SELECT setting::float AS max_connections FROM pg_settings WHERE name = 'max_connections') xx ON (true);\n"},{"metrics":[{"attribute_columns":["dbname"],"description":"Total number of checksum failures on this database","metric_name":"ccp_data_checksum_failure_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"},{"attribute_columns":["dbname"],"description":"Time interval in seconds since the last checksum failure was encountered","metric_name":"ccp_data_checksum_failure_time_since_last_failure_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"time_since_last_failure_seconds","value_type":"double"}],"sql":"SELECT datname AS dbname , checksum_failures AS count , coalesce(extract(epoch from 
(clock_timestamp() - checksum_last_failure)), 0) AS time_since_last_failure_seconds FROM pg_catalog.pg_stat_database WHERE pg_stat_database.datname IS NOT NULL;\n"},{"metrics":[{"attribute_columns":["dbname","mode"],"description":"Return value of 1 means database is in recovery. Otherwise 2 it is a primary.","metric_name":"ccp_locks_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT pg_database.datname as dbname , tmp.mode , COALESCE(count,0) as count FROM (\n VALUES ('accesssharelock'),\n ('rowsharelock'),\n ('rowexclusivelock'),\n ('shareupdateexclusivelock'),\n ('sharelock'),\n ('sharerowexclusivelock'),\n ('exclusivelock'),\n ('accessexclusivelock')\n) AS tmp(mode) CROSS JOIN pg_catalog.pg_database LEFT JOIN\n (SELECT database, lower(mode) AS mode,count(*) AS count\n FROM pg_catalog.pg_locks WHERE database IS NOT NULL\n GROUP BY database, lower(mode)\n) AS tmp2 ON tmp.mode=tmp2.mode and pg_database.oid = tmp2.database;\n"},{"metrics":[{"description":"CPU limit value in milli cores","metric_name":"ccp_nodemx_cpu_limit","static_attributes":{"server":"localhost:5432"},"value_column":"limit"},{"description":"CPU request value in milli cores","metric_name":"ccp_nodemx_cpu_request","static_attributes":{"server":"localhost:5432"},"value_column":"request"}],"sql":"SELECT monitor.kdapi_scalar_bigint('cpu_request') AS request , monitor.kdapi_scalar_bigint('cpu_limit') AS limit\n"},{"metrics":[{"description":"CPU usage in nanoseconds","metric_name":"ccp_nodemx_cpuacct_usage","static_attributes":{"server":"localhost:5432"},"value_column":"usage","value_type":"double"},{"description":"CPU usage snapshot timestamp","metric_name":"ccp_nodemx_cpuacct_usage_ts","static_attributes":{"server":"localhost:5432"},"value_column":"usage_ts","value_type":"double"}],"sql":"SELECT CASE WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('cpuacct.usage')\n ELSE (SELECT val FROM monitor.cgroup_setof_kv('cpu.stat') where key = 'usage_usec') * 1000\n END AS usage,\n extract(epoch from clock_timestamp()) AS usage_ts;\n"},{"metrics":[{"description":"The total available run-time within a period (in microseconds)","metric_name":"ccp_nodemx_cpucfs_period_us","static_attributes":{"server":"localhost:5432"},"value_column":"period_us"},{"description":"The length of a period (in microseconds)","metric_name":"ccp_nodemx_cpucfs_quota_us","static_attributes":{"server":"localhost:5432"},"value_column":"quota_us","value_type":"double"}],"sql":"SELECT\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n monitor.cgroup_scalar_bigint('cpu.cfs_period_us')\n ELSE\n (monitor.cgroup_array_bigint('cpu.max'))[2]\n END AS period_us,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n GREATEST(monitor.cgroup_scalar_bigint('cpu.cfs_quota_us'), 0)\n ELSE\n GREATEST((monitor.cgroup_array_bigint('cpu.max'))[1], 0)\n END AS quota_us;\n"},{"metrics":[{"description":"Number of periods that any thread was runnable","metric_name":"ccp_nodemx_cpustat_nr_periods","static_attributes":{"server":"localhost:5432"},"value_column":"nr_periods","value_type":"double"},{"description":"Number of runnable periods in which the application used its entire quota and was throttled","metric_name":"ccp_nodemx_cpustat_nr_throttled","static_attributes":{"server":"localhost:5432"},"value_column":"nr_throttled"},{"description":"CPU stat snapshot 
timestamp","metric_name":"ccp_nodemx_cpustat_snap_ts","static_attributes":{"server":"localhost:5432"},"value_column":"snap_ts","value_type":"double"},{"description":"Sum total amount of time individual threads within the monitor.cgroup were throttled","metric_name":"ccp_nodemx_cpustat_throttled_time","static_attributes":{"server":"localhost:5432"},"value_column":"throttled_time","value_type":"double"}],"sql":"WITH d(key, val) AS (select key, val from monitor.cgroup_setof_kv('cpu.stat')) SELECT\n (SELECT val FROM d WHERE key='nr_periods') AS nr_periods,\n (SELECT val FROM d WHERE key='nr_throttled') AS nr_throttled,\n (SELECT val FROM d WHERE key='throttled_usec') AS throttled_time,\n extract(epoch from clock_timestamp()) as snap_ts;\n"},{"metrics":[{"attribute_columns":["fs_type","mount_point"],"description":"Available size in bytes","metric_name":"ccp_nodemx_data_disk_available_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"available_bytes","value_type":"double"},{"attribute_columns":["fs_type","mount_point"],"description":"Available file nodes","metric_name":"ccp_nodemx_data_disk_free_file_nodes","static_attributes":{"server":"localhost:5432"},"value_column":"free_file_nodes"},{"attribute_columns":["fs_type","mount_point"],"description":"Size in bytes","metric_name":"ccp_nodemx_data_disk_total_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"total_bytes"},{"attribute_columns":["fs_type","mount_point"],"description":"Total file nodes","metric_name":"ccp_nodemx_data_disk_total_file_nodes","static_attributes":{"server":"localhost:5432"},"value_column":"total_file_nodes"}],"sql":"SELECT mount_point,fs_type,total_bytes,available_bytes,total_file_nodes,free_file_nodes\n FROM monitor.proc_mountinfo() m\n JOIN monitor.fsinfo(m.mount_point) f USING (major_number, minor_number)\n WHERE m.mount_point IN ('/pgdata', '/pgwal') OR\n m.mount_point like '/tablespaces/%'\n"},{"metrics":[{"attribute_columns":["mount_point"],"description":"Total sectors read","metric_name":"ccp_nodemx_disk_activity_sectors_read","static_attributes":{"server":"localhost:5432"},"value_column":"sectors_read"},{"attribute_columns":["mount_point"],"description":"Total sectors written","metric_name":"ccp_nodemx_disk_activity_sectors_written","static_attributes":{"server":"localhost:5432"},"value_column":"sectors_written"}],"sql":"SELECT mount_point,sectors_read,sectors_written\n FROM monitor.proc_mountinfo() m\n JOIN monitor.proc_diskstats() d USING (major_number, minor_number)\n WHERE m.mount_point IN ('/pgdata', '/pgwal') OR\n m.mount_point like '/tablespaces/%';\n"},{"metrics":[{"description":"Total bytes of anonymous and swap cache memory on active LRU list","metric_name":"ccp_nodemx_mem_active_anon","static_attributes":{"server":"localhost:5432"},"value_column":"active_anon","value_type":"double"},{"description":"Total bytes of file-backed memory on active LRU list","metric_name":"ccp_nodemx_mem_active_file","static_attributes":{"server":"localhost:5432"},"value_column":"active_file","value_type":"double"},{"description":"Total bytes of page cache memory","metric_name":"ccp_nodemx_mem_cache","static_attributes":{"server":"localhost:5432"},"value_column":"cache","value_type":"double"},{"description":"Total bytes that are waiting to get written back to the disk","metric_name":"ccp_nodemx_mem_dirty","static_attributes":{"server":"localhost:5432"},"value_column":"dirty"},{"description":"Total bytes of anonymous and swap cache memory on inactive LRU 
list","metric_name":"ccp_nodemx_mem_inactive_anon","static_attributes":{"server":"localhost:5432"},"value_column":"inactive_anon","value_type":"double"},{"description":"Total bytes of file-backed memory on inactive LRU list","metric_name":"ccp_nodemx_mem_inactive_file","static_attributes":{"server":"localhost:5432"},"value_column":"inactive_file","value_type":"double"},{"description":"Unknown metric from ccp_nodemx_mem","metric_name":"ccp_nodemx_mem_kmem_usage_in_byte","static_attributes":{"server":"localhost:5432"},"value_column":"kmem_usage_in_byte"},{"description":"Memory limit value in bytes","metric_name":"ccp_nodemx_mem_limit","static_attributes":{"server":"localhost:5432"},"value_column":"limit"},{"description":"Total bytes of mapped file (includes tmpfs/shmem)","metric_name":"ccp_nodemx_mem_mapped_file","static_attributes":{"server":"localhost:5432"},"value_column":"mapped_file"},{"description":"Memory request value in bytes","metric_name":"ccp_nodemx_mem_request","static_attributes":{"server":"localhost:5432"},"value_column":"request"},{"description":"Total bytes of anonymous and swap cache memory","metric_name":"ccp_nodemx_mem_rss","static_attributes":{"server":"localhost:5432"},"value_column":"rss","value_type":"double"},{"description":"Total bytes of shared memory","metric_name":"ccp_nodemx_mem_shmem","static_attributes":{"server":"localhost:5432"},"value_column":"shmem","value_type":"double"},{"description":"Total usage in bytes","metric_name":"ccp_nodemx_mem_usage_in_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"usage_in_bytes"}],"sql":"WITH d(key, val) as (SELECT key, val FROM monitor.cgroup_setof_kv('memory.stat')) SELECT\n monitor.kdapi_scalar_bigint('mem_request') AS request,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n (CASE WHEN monitor.cgroup_scalar_bigint('memory.limit_in_bytes') = 9223372036854771712 THEN 0 ELSE monitor.cgroup_scalar_bigint('memory.limit_in_bytes') END)\n ELSE\n (CASE WHEN monitor.cgroup_scalar_bigint('memory.max') = 9223372036854775807 THEN 0 ELSE monitor.cgroup_scalar_bigint('memory.max') END)\n END AS limit,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='cache')\n ELSE 0\n END as cache,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='rss')\n ELSE 0\n END as RSS,\n (SELECT val FROM d WHERE key='shmem') as shmem,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='mapped_file')\n ELSE 0\n END as mapped_file,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='dirty')\n ELSE (SELECT val FROM d WHERE key='file_dirty')\n END as dirty,\n (SELECT val FROM d WHERE key='active_anon') as active_anon,\n (SELECT val FROM d WHERE key='inactive_anon') as inactive_anon,\n (SELECT val FROM d WHERE key='active_file') as active_file,\n (SELECT val FROM d WHERE key='inactive_file') as inactive_file,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('memory.usage_in_bytes')\n ELSE monitor.cgroup_scalar_bigint('memory.current')\n END as usage_in_bytes,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('memory.kmem.usage_in_bytes')\n ELSE 0\n END as kmem_usage_in_byte;\n"},{"metrics":[{"attribute_columns":["interface"],"description":"Number of bytes received","metric_name":"ccp_nodemx_network_rx_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"rx_bytes"},{"attribute_columns":["interface"],"description":"Number of 
packets received","metric_name":"ccp_nodemx_network_rx_packets","static_attributes":{"server":"localhost:5432"},"value_column":"rx_packets"},{"attribute_columns":["interface"],"description":"Number of bytes transmitted","metric_name":"ccp_nodemx_network_tx_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"tx_bytes"},{"attribute_columns":["interface"],"description":"Number of packets transmitted","metric_name":"ccp_nodemx_network_tx_packets","static_attributes":{"server":"localhost:5432"},"value_column":"tx_packets"}],"sql":"SELECT interface\n ,tx_bytes\n ,tx_packets\n ,rx_bytes\n ,rx_packets from monitor.proc_network_stats()\n"},{"metrics":[{"description":"Total number of database processes","metric_name":"ccp_nodemx_process_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT monitor.cgroup_process_count() as count;\n"},{"metrics":[{"description":"Epoch time when stats were reset","metric_name":"ccp_pg_stat_statements_reset_time","static_attributes":{"server":"localhost:5432"},"value_column":"time"}],"sql":"SELECT monitor.pg_stat_statements_reset_info(-1) as time;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Average query runtime in milliseconds","metric_name":"ccp_pg_stat_statements_top_mean_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"top_mean_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time\n , s.mean_exec_time AS mean_exec_time\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , max(monitor.mean_exec_time) AS top_mean_exec_time_ms\nFROM monitor GROUP BY 1,2,3,4 ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","role"],"description":"Total number of queries run per user/database","metric_name":"ccp_pg_stat_statements_total_calls_count","static_attributes":{"server":"localhost:5432"},"value_column":"calls_count","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total runtime of all queries per user/database","metric_name":"ccp_pg_stat_statements_total_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"exec_time_ms","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total runtime of all queries per user/database","metric_name":"ccp_pg_stat_statements_total_mean_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"mean_exec_time_ms","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total rows returned from all queries per user/database","metric_name":"ccp_pg_stat_statements_total_row_count","static_attributes":{"server":"localhost:5432"},"value_column":"row_count","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.calls\n , s.total_exec_time\n , s.mean_exec_time\n , s.rows\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , sum(calls) AS calls_count\n , sum(total_exec_time) AS exec_time_ms\n , avg(mean_exec_time) AS mean_exec_time_ms\n , sum(rows) 
AS row_count\nFROM monitor GROUP BY 1,2;\n"},{"metrics":[{"description":"The current version of PostgreSQL that this exporter is running on as a 6 digit integer (######).","metric_name":"ccp_postgresql_version_current","static_attributes":{"server":"localhost:5432"},"value_column":"current"}],"sql":"SELECT current_setting('server_version_num')::int AS current;\n"},{"metrics":[{"description":"Time interval in seconds since PostgreSQL database was last restarted.","metric_name":"ccp_postmaster_uptime_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"seconds","value_type":"double"}],"sql":"SELECT extract(epoch from (clock_timestamp() - pg_postmaster_start_time() )) AS seconds;\n"},{"metrics":[{"description":"Time interval in seconds since PostgreSQL database was last restarted.","metric_name":"ccp_replication_lag_size_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"bytes","value_type":"double"}],"sql":"SELECT pg_wal_lsn_diff(sent_lsn, replay_lsn) AS bytes\n FROM pg_catalog.pg_stat_replication;\n"},{"metrics":[{"attribute_columns":["role"],"description":"Length of time since the last WAL file was received and replayed on replica.\nAlways increases, possibly causing false positives if the primary stops writing.\nMonitors for replicas that stop receiving WAL all together.\n","metric_name":"ccp_replication_lag_received_time","static_attributes":{"server":"localhost:5432"},"value_column":"received_time","value_type":"double"},{"attribute_columns":["role"],"description":"Length of time since the last transaction was replayed on replica.\nReturns zero if last WAL received equals last WAL replayed. Avoids\nfalse positives when primary stops writing. Monitors for replicas that\ncannot keep up with primary WAL generation.\n","metric_name":"ccp_replication_lag_replay_time","static_attributes":{"server":"localhost:5432"},"value_column":"replay_time","value_type":"double"}],"sql":"SELECT\n CASE\n WHEN (pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn()) OR (pg_is_in_recovery() = false) THEN 0\n ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER\n END\nAS replay_time , CASE\n WHEN pg_is_in_recovery() = false THEN 0\n ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER\n END\nAS received_time , CASE\n WHEN pg_is_in_recovery() = true THEN 'replica'\n ELSE 'primary'\n END\nAS role;\n"},{"metrics":[{"description":"Number of settings from pg_settings catalog in a pending_restart state","metric_name":"ccp_settings_pending_restart_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT count(*) AS count FROM pg_catalog.pg_settings WHERE pending_restart = true;\n"},{"metrics":[{"description":"Number of buffers allocated","metric_name":"ccp_stat_bgwriter_buffers_alloc","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_alloc"},{"data_type":"sum","description":"Number of buffers written by the background writer","metric_name":"ccp_stat_bgwriter_buffers_clean","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_clean"},{"description":"Number of times the background writer stopped a cleaning scan because it had written too many buffers","metric_name":"ccp_stat_bgwriter_maxwritten_clean","static_attributes":{"server":"localhost:5432"},"value_column":"maxwritten_clean"}],"sql":"SELECT\n buffers_clean\n , maxwritten_clean\n , buffers_alloc\nFROM pg_catalog.pg_stat_bgwriter;\n"},{"metrics":[{"description":"Oldest current 
transaction ID in cluster","metric_name":"ccp_transaction_wraparound_oldest_current_xid","static_attributes":{"server":"localhost:5432"},"value_column":"oldest_current_xid"},{"description":"Percentage towards emergency autovacuum process starting","metric_name":"ccp_transaction_wraparound_percent_towards_emergency_autovac","static_attributes":{"server":"localhost:5432"},"value_column":"percent_towards_emergency_autovac"},{"description":"Percentage towards transaction ID wraparound","metric_name":"ccp_transaction_wraparound_percent_towards_wraparound","static_attributes":{"server":"localhost:5432"},"value_column":"percent_towards_wraparound"}],"sql":"WITH max_age AS (\n SELECT 2000000000 as max_old_xid\n , setting AS autovacuum_freeze_max_age\n FROM pg_catalog.pg_settings\n WHERE name = 'autovacuum_freeze_max_age')\n, per_database_stats AS (\n SELECT datname\n , m.max_old_xid::int\n , m.autovacuum_freeze_max_age::int\n , age(d.datfrozenxid) AS oldest_current_xid\n FROM pg_catalog.pg_database d\n JOIN max_age m ON (true)\n WHERE d.datallowconn)\nSELECT max(oldest_current_xid) AS oldest_current_xid , max(ROUND(100*(oldest_current_xid/max_old_xid::float))) AS percent_towards_wraparound , max(ROUND(100*(oldest_current_xid/autovacuum_freeze_max_age::float))) AS percent_towards_emergency_autovac FROM per_database_stats;\n"},{"metrics":[{"description":"Current size in bytes of the WAL directory","metric_name":"ccp_wal_activity_total_size_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"total_size_bytes"}],"sql":"SELECT last_5_min_size_bytes,\n (SELECT COALESCE(sum(size),0) FROM pg_catalog.pg_ls_waldir()) AS total_size_bytes\n FROM (SELECT COALESCE(sum(size),0) AS last_5_min_size_bytes FROM pg_catalog.pg_ls_waldir() WHERE modification \u003e CURRENT_TIMESTAMP - '5 minutes'::interval) x;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Epoch time when stats were reset","metric_name":"ccp_pg_stat_statements_top_max_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"max_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time_ms\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , max_exec_time_ms\n , records\nFROM monitor ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Total time spent in the statement in milliseconds","metric_name":"ccp_pg_stat_statements_top_total_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"total_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time_ms\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , total_exec_time_ms\n , records\nFROM monitor ORDER BY 5 DESC LIMIT 
20;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Total amount of WAL generated by the statement in bytes","metric_name":"ccp_pg_stat_statements_top_wal_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"bytes","value_type":"double"},{"attribute_columns":["dbname","query","queryid","role"],"description":"Total number of WAL full page images generated by the statement","metric_name":"ccp_pg_stat_statements_top_wal_fpi","static_attributes":{"server":"localhost:5432"},"value_column":"fpi","value_type":"double"},{"attribute_columns":["dbname","query","queryid","role"],"description":"Total number of WAL records generated by the statement","metric_name":"ccp_pg_stat_statements_top_wal_records","static_attributes":{"server":"localhost:5432"},"value_column":"records","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time\n , s.mean_exec_time AS mean_exec_time\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , query\n , queryid\n , records\n , fpi\n , bytes\nFROM monitor ORDER BY bytes DESC LIMIT 20;\n"}] diff --git a/internal/collector/postgres_5s_metrics.yaml b/internal/collector/postgres_5s_metrics.yaml index 29a0477343..3f09e8a8d2 100644 --- a/internal/collector/postgres_5s_metrics.yaml +++ b/internal/collector/postgres_5s_metrics.yaml @@ -4,6 +4,43 @@ # https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/-/receiver/sqlqueryreceiver#metrics-queries # https://github.com/CrunchyData/pgmonitor/blob/development/sql_exporter/common/crunchy_global_collector.yml # + # TODO ccp_pg_stat_activity can be removed after metrics are fully aligned with the latest pgMonitor + - sql: > + SELECT + pg_database.datname, + tmp.state, + tmp2.usename, + tmp2.application_name, + COALESCE(count,0) as count, + COALESCE(max_tx_duration,0) as max_tx_duration + FROM + ( + VALUES ('active'), + ('idle'), + ('idle in transaction'), + ('idle in transaction (aborted)'), + ('fastpath function call'), + ('disabled') + ) AS tmp(state) CROSS JOIN pg_database + LEFT JOIN + ( + SELECT + datname, + state, + usename, + application_name, + count(*) AS count, + MAX(EXTRACT(EPOCH FROM now() - xact_start))::float AS max_tx_duration + FROM pg_stat_activity GROUP BY datname,state,usename,application_name) AS tmp2 + ON tmp.state = tmp2.state AND pg_database.datname = tmp2.datname; + metrics: + - metric_name: ccp_pg_stat_activity_count + value_column: count + description: number of connections in this state + attribute_columns: ["application_name", "datname", "state", "usename"] + static_attributes: + server: "localhost:5432" + - sql: > SELECT EXTRACT(epoch from (CURRENT_TIMESTAMP - last_archived_time)) AS seconds_since_last_archive From 1ec914091b5a70b57e898bc62214fe0c50829b73 Mon Sep 17 00:00:00 2001 From: tony-landreth Date: Thu, 20 Feb 2025 14:11:57 -0500 Subject: [PATCH 06/22] Fixes ccp_stat_bgwriter_buffers_checkpoint --- internal/collector/generated/gte_pg17_metrics.json | 2 +- internal/collector/gte_pg17_metrics.yaml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/collector/generated/gte_pg17_metrics.json 
b/internal/collector/generated/gte_pg17_metrics.json index de39cf6cca..563abf01b3 100644 --- a/internal/collector/generated/gte_pg17_metrics.json +++ b/internal/collector/generated/gte_pg17_metrics.json @@ -1 +1 @@ -[{"metrics":[{"data_type":"sum","description":"Number of buffers written during checkpoints and restartpoints","metric_name":"ccp_stat_bgwriter_buffers_checkpoint","static_attributes":{"server":"localhost:5432"}}],"sql":"SELECT c.buffers_written FROM pg_catalog.pg_stat_checkpointer c;\n"},{"metrics":[{"data_type":"sum","description":"Number of write operations, each of the size specified in op_bytes.","metric_name":"ccp_stat_bgwriter_buffers_backend","static_attributes":{"server":"localhost:5432"},"value_column":"writes"},{"data_type":"sum","description":"Number of fsync calls. These are only tracked in context normal.","metric_name":"ccp_stat_bgwriter_buffers_backend_fsync","static_attributes":{"server":"localhost:5432"},"value_column":"fsyncs"}],"sql":"SELECT\n s.writes\n , s.fsyncs\nFROM pg_catalog.pg_stat_io s WHERE backend_type = 'background writer';\n"},{"metrics":[{"description":"Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in milliseconds","metric_name":"ccp_stat_bgwriter_checkpoint_sync_time","static_attributes":{"server":"localhost:5432"},"value_column":"sync_time"},{"description":"Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in milliseconds","metric_name":"ccp_stat_bgwriter_checkpoint_write_time","static_attributes":{"server":"localhost:5432"},"value_column":"write_time","value_type":"double"},{"description":"Number of requested checkpoints that have been performed","metric_name":"ccp_stat_bgwriter_checkpoints_req","static_attributes":{"server":"localhost:5432"},"value_column":"num_requested"},{"description":"Number of scheduled checkpoints that have been performed","metric_name":"ccp_stat_bgwriter_checkpoints_timed","static_attributes":{"server":"localhost:5432"},"value_column":"num_timed"},{"description":"Number of buffers written during checkpoints and restartpoints","metric_name":"ccp_stat_checkpointer_buffers_written","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_written"}],"sql":"SELECT\n c.num_timed\n , c.num_requested\n , c.write_time\n , c.sync_time\n , c.buffers_written\nFROM pg_catalog.pg_stat_checkpointer c;\n"}] +[{"metrics":[{"data_type":"sum","description":"Number of buffers written during checkpoints and restartpoints","metric_name":"ccp_stat_bgwriter_buffers_checkpoint","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_written"}],"sql":"SELECT c.buffers_written FROM pg_catalog.pg_stat_checkpointer c;\n"},{"metrics":[{"data_type":"sum","description":"Number of write operations, each of the size specified in op_bytes.","metric_name":"ccp_stat_bgwriter_buffers_backend","static_attributes":{"server":"localhost:5432"},"value_column":"writes"},{"data_type":"sum","description":"Number of fsync calls. 
These are only tracked in context normal.","metric_name":"ccp_stat_bgwriter_buffers_backend_fsync","static_attributes":{"server":"localhost:5432"},"value_column":"fsyncs"}],"sql":"SELECT\n s.writes\n , s.fsyncs\nFROM pg_catalog.pg_stat_io s WHERE backend_type = 'background writer';\n"},{"metrics":[{"description":"Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in milliseconds","metric_name":"ccp_stat_bgwriter_checkpoint_sync_time","static_attributes":{"server":"localhost:5432"},"value_column":"sync_time"},{"description":"Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in milliseconds","metric_name":"ccp_stat_bgwriter_checkpoint_write_time","static_attributes":{"server":"localhost:5432"},"value_column":"write_time","value_type":"double"},{"description":"Number of requested checkpoints that have been performed","metric_name":"ccp_stat_bgwriter_checkpoints_req","static_attributes":{"server":"localhost:5432"},"value_column":"num_requested"},{"description":"Number of scheduled checkpoints that have been performed","metric_name":"ccp_stat_bgwriter_checkpoints_timed","static_attributes":{"server":"localhost:5432"},"value_column":"num_timed"},{"description":"Number of buffers written during checkpoints and restartpoints","metric_name":"ccp_stat_checkpointer_buffers_written","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_written"}],"sql":"SELECT\n c.num_timed\n , c.num_requested\n , c.write_time\n , c.sync_time\n , c.buffers_written\nFROM pg_catalog.pg_stat_checkpointer c;\n"}]
diff --git a/internal/collector/gte_pg17_metrics.yaml
index c985ec8e4f..de8f6786f5 100644
--- a/internal/collector/gte_pg17_metrics.yaml
+++ b/internal/collector/gte_pg17_metrics.yaml
@@ -9,6 +9,7 @@
     FROM pg_catalog.pg_stat_checkpointer c;
   metrics:
     - metric_name: ccp_stat_bgwriter_buffers_checkpoint
+      value_column: buffers_written
      data_type: sum
      description: Number of buffers written during checkpoints and restartpoints
      static_attributes:

From 30e8e5bf760bf7497bce09dd8a1a04dc62a60995 Mon Sep 17 00:00:00 2001
From: tony-landreth
Date: Thu, 20 Feb 2025 14:13:58 -0500
Subject: [PATCH 07/22] Remove requirement of exporter block for OTel metrics

---
 internal/controller/postgrescluster/controller.go | 4 ++--
 internal/controller/postgrescluster/pgmonitor.go | 11 ++++++-----
 internal/pgmonitor/postgres.go | 9 +++++----
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/internal/controller/postgrescluster/controller.go
index 8ccf88de00..c5daef0c01 100644
--- a/internal/controller/postgrescluster/controller.go
+++ b/internal/controller/postgrescluster/controller.go
@@ -235,13 +235,13 @@ func (r *Reconciler) Reconcile(
     }
     pgHBAs := postgres.NewHBAs()
-    pgmonitor.PostgreSQLHBAs(cluster, &pgHBAs)
+    pgmonitor.PostgreSQLHBAs(ctx, cluster, &pgHBAs)
     pgbouncer.PostgreSQL(cluster, &pgHBAs)
     pgParameters := postgres.NewParameters()
     pgaudit.PostgreSQLParameters(&pgParameters)
     pgbackrest.PostgreSQL(cluster, &pgParameters, backupsSpecFound)
-    pgmonitor.PostgreSQLParameters(cluster, &pgParameters)
+    pgmonitor.PostgreSQLParameters(ctx, cluster, &pgParameters)
     otelConfig := collector.NewConfigForPostgresPod(ctx, cluster, &pgParameters)
diff --git a/internal/controller/postgrescluster/pgmonitor.go
index 23ee1c9bc2..70990f92e8 100644
---
a/internal/controller/postgrescluster/pgmonitor.go
+++ b/internal/controller/postgrescluster/pgmonitor.go
@@ -73,7 +73,8 @@ func (r *Reconciler) reconcilePGMonitorExporter(ctx context.Context,
    // We use this ImageID and the setup.sql file in the hash we make to see if the operator needs to rerun
    // the `EnableExporterInPostgreSQL` funcs; that way we are always running
    // that function against an updated and running pod.
-    if pgmonitor.ExporterEnabled(cluster) {
+
+    if pgmonitor.ExporterEnabled(cluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) {
        sql, err := os.ReadFile(fmt.Sprintf("%s/pg%d/setup.sql", pgmonitor.GetQueriesConfigDir(ctx), cluster.Spec.PostgresVersion))
        if err != nil {
            return err
@@ -110,7 +111,7 @@ func (r *Reconciler) reconcilePGMonitorExporter(ctx context.Context,
        return pgmonitor.EnableExporterInPostgreSQL(ctx, exec, monitoringSecret, pgmonitor.ExporterDB, setup)
    }
-    if !pgmonitor.ExporterEnabled(cluster) {
+    if !pgmonitor.ExporterEnabled(cluster) && !feature.Enabled(ctx, feature.OpenTelemetryMetrics) {
        action = func(ctx context.Context, exec postgres.Executor) error {
            return pgmonitor.DisableExporterInPostgreSQL(ctx, exec)
        }
@@ -168,7 +169,7 @@ func (r *Reconciler) reconcileMonitoringSecret(
        return nil, err
    }
-    if !pgmonitor.ExporterEnabled(cluster) {
+    if !pgmonitor.ExporterEnabled(cluster) && !feature.Enabled(ctx, feature.OpenTelemetryMetrics) {
        // TODO: Checking if the exporter is enabled to determine when monitoring
        // secret should be created. If more tools are added to the monitoring
        // suite, they could need the secret when the exporter is not enabled.
@@ -259,7 +260,7 @@ func addPGMonitorExporterToInstancePodSpec(
    template *corev1.PodTemplateSpec,
    exporterQueriesConfig, exporterWebConfig *corev1.ConfigMap) error {
-    if !pgmonitor.ExporterEnabled(cluster) {
+    if !pgmonitor.ExporterEnabled(cluster) && !feature.Enabled(ctx, feature.OpenTelemetryMetrics) {
        return nil
    }
@@ -407,7 +408,7 @@ func (r *Reconciler) reconcileExporterWebConfig(ctx context.Context,
        return nil, err
    }
-    if !pgmonitor.ExporterEnabled(cluster) || cluster.Spec.Monitoring.PGMonitor.Exporter.CustomTLSSecret == nil {
+    if !pgmonitor.ExporterEnabled(cluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) || cluster.Spec.Monitoring.PGMonitor.Exporter.CustomTLSSecret == nil {
        // We could still have a NotFound error here so check the err.
        // If no error that means the configmap is found and needs to be deleted
        if err == nil {
diff --git a/internal/pgmonitor/postgres.go
index 292d116e30..ae01614ab9 100644
--- a/internal/pgmonitor/postgres.go
+++ b/internal/pgmonitor/postgres.go
@@ -10,6 +10,7 @@ import (
    corev1 "k8s.io/api/core/v1"
+    "github.com/crunchydata/postgres-operator/internal/feature"
    "github.com/crunchydata/postgres-operator/internal/logging"
    "github.com/crunchydata/postgres-operator/internal/postgres"
    "github.com/crunchydata/postgres-operator/pkg/apis/postgres-operator.crunchydata.com/v1beta1"
@@ -22,8 +23,8 @@ const (
 // PostgreSQLHBAs provides the Postgres HBA rules for allowing the monitoring
 // exporter to be accessible
-func PostgreSQLHBAs(inCluster *v1beta1.PostgresCluster, outHBAs *postgres.HBAs) {
-    if ExporterEnabled(inCluster) {
+func PostgreSQLHBAs(ctx context.Context, inCluster *v1beta1.PostgresCluster, outHBAs *postgres.HBAs) {
+    if ExporterEnabled(inCluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) {
        // Limit the monitoring user to local connections using SCRAM.
        outHBAs.Mandatory = append(outHBAs.Mandatory, postgres.NewHBA().TCP().User(MonitoringUser).Method("scram-sha-256").Network("127.0.0.0/8"),
@@ -34,8 +35,8 @@ func PostgreSQLHBAs(inCluster *v1beta1.PostgresCluster, outHBAs *postgres.HBAs)
 // PostgreSQLParameters provides additional required configuration parameters
 // that Postgres needs to support monitoring
-func PostgreSQLParameters(inCluster *v1beta1.PostgresCluster, outParameters *postgres.Parameters) {
-    if ExporterEnabled(inCluster) {
+func PostgreSQLParameters(ctx context.Context, inCluster *v1beta1.PostgresCluster, outParameters *postgres.Parameters) {
+    if ExporterEnabled(inCluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) {
        // Exporter expects that shared_preload_libraries are installed
        // pg_stat_statements: https://access.crunchydata.com/documentation/pgmonitor/latest/exporter/
        // pgnodemx: https://github.com/CrunchyData/pgnodemx

From 89db6cd6303925b840e7fccd86766f5483936123 Mon Sep 17 00:00:00 2001
From: tony-landreth
Date: Thu, 20 Feb 2025 14:21:29 -0500
Subject: [PATCH 08/22] Updates test

---
 internal/pgmonitor/postgres_test.go | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/internal/pgmonitor/postgres_test.go
index b91e9ba125..3b6bff58de 100644
--- a/internal/pgmonitor/postgres_test.go
+++ b/internal/pgmonitor/postgres_test.go
@@ -5,6 +5,7 @@ package pgmonitor
 import (
+    "context"
    "strings"
    "testing"
@@ -15,10 +16,12 @@ import (
 )
 func TestPostgreSQLHBA(t *testing.T) {
+    ctx := context.Background()
+
    t.Run("ExporterDisabled", func(t *testing.T) {
        inCluster := &v1beta1.PostgresCluster{}
        outHBAs := postgres.HBAs{}
-        PostgreSQLHBAs(inCluster, &outHBAs)
+        PostgreSQLHBAs(ctx, inCluster, &outHBAs)
        assert.Equal(t, len(outHBAs.Mandatory), 0)
    })
@@ -33,7 +36,7 @@ func TestPostgreSQLHBA(t *testing.T) {
        }
        outHBAs := postgres.HBAs{}
-        PostgreSQLHBAs(inCluster, &outHBAs)
+        PostgreSQLHBAs(ctx, inCluster, &outHBAs)
        assert.Equal(t, len(outHBAs.Mandatory), 3)
        assert.Equal(t, outHBAs.Mandatory[0].String(), `host all "ccp_monitoring" "127.0.0.0/8" scram-sha-256`)
@@ -43,10 +46,12 @@ func TestPostgreSQLHBA(t *testing.T) {
 }
 func TestPostgreSQLParameters(t *testing.T) {
+    ctx := context.Background()
+
    t.Run("ExporterDisabled", func(t *testing.T) {
        inCluster := &v1beta1.PostgresCluster{}
        outParameters := postgres.NewParameters()
-        PostgreSQLParameters(inCluster, &outParameters)
+        PostgreSQLParameters(ctx, inCluster, &outParameters)
        assert.Assert(t, !outParameters.Mandatory.Has("shared_preload_libraries"))
    })
@@ -61,7 +66,7 @@ func TestPostgreSQLParameters(t *testing.T) {
        }
        outParameters := postgres.NewParameters()
-        PostgreSQLParameters(inCluster, &outParameters)
+        PostgreSQLParameters(ctx, inCluster, &outParameters)
        libs, found := outParameters.Mandatory.Get("shared_preload_libraries")
        assert.Assert(t, found)
        assert.Assert(t, strings.Contains(libs, "pg_stat_statements"))
@@ -80,7 +85,7 @@ func TestPostgreSQLParameters(t *testing.T) {
        outParameters := postgres.NewParameters()
        outParameters.Mandatory.Add("shared_preload_libraries", "daisy")
-        PostgreSQLParameters(inCluster, &outParameters)
+        PostgreSQLParameters(ctx, inCluster, &outParameters)
        libs, found := outParameters.Mandatory.Get("shared_preload_libraries")
        assert.Assert(t, found)
        assert.Assert(t, strings.Contains(libs, "pg_stat_statements"))

From b68efb6b03439bec6dcd8e368e8375f4d2b48ad3 Mon Sep 17 00:00:00 2001
From: tony-landreth
Date: Thu, 20 Feb 2025 17:19:47 -0500
Subject: [PATCH 09/22] Moves
 problematic queries into functions

---
 .../generated/pgbackrest_metrics.json | 2 +-
 .../generated/postgres_5s_metrics.json | 2 +-
 internal/collector/pgbackrest_metrics.yaml | 169 ------------------
 internal/collector/postgres_5s_metrics.yaml | 107 +++++++++--
 internal/collector/postgres_metrics.go | 9 +-
 .../postgrescluster/metrics_setup.sql | 150 ++++++++++++++++
 6 files changed, 241 insertions(+), 198 deletions(-)
 delete mode 100644 internal/collector/pgbackrest_metrics.yaml

diff --git a/internal/collector/generated/pgbackrest_metrics.json
index 713f0a8ac1..63114afc03 100644
--- a/internal/collector/generated/pgbackrest_metrics.json
+++ b/internal/collector/generated/pgbackrest_metrics.json
@@ -1 +1 @@
-[{"metrics":[{"attribute_columns":["repo"],"description":"Seconds since the last completed full or differential backup. Differential is always based off last full.","metric_name":"ccp_backrest_last_diff_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_diff_backup"},{"attribute_columns":["repo"],"description":"Seconds since the last completed full backup","metric_name":"ccp_backrest_last_full_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_full_backup"},{"attribute_columns":["repo"],"description":"Seconds since the last completed full, differential or incremental backup.\nIncremental is always based off last full or differential.\n","metric_name":"ccp_backrest_last_incr_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_incr_backup"},{"attribute_columns":["backup_type","repo"],"description":"pgBackRest version number when this backup was performed","metric_name":"ccp_backrest_last_info_backrest_repo_version","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_info_backrest_repo_version"},{"attribute_columns":["backup_type","repo"],"description":"An error has been encountered in the backup.
Check logs for more information.","metric_name":"ccp_backrest_last_info_backup_error","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_info_backup_error"},{"attribute_columns":["backup_type","repo"],"description":"Total runtime in seconds of this backup","metric_name":"ccp_backrest_last_info_backup_runtime_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"backup_runtime_seconds"},{"attribute_columns":["backup_type","repo"],"description":"Actual size of only this individual backup in the pgbackrest repository","metric_name":"ccp_backrest_last_info_repo_backup_size_bytes","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"repo_backup_size_bytes"},{"attribute_columns":["backup_type","repo"],"description":"Total size of this backup in the pgbackrest repository, including all required previous backups and WAL","metric_name":"ccp_backrest_last_info_repo_total_size_bytes","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"repo_total_size_bytes"},{"attribute_columns":["repo"],"description":"Seconds since the oldest completed full backup","metric_name":"ccp_backrest_oldest_full_backup_time_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"oldest_full_backup"}],"sql":"DROP TABLE IF EXISTS pgbackrest_info;\nCREATE TEMPORARY TABLE pgbackrest_info (data json);\n\nCOPY pgbackrest_info (data)\nFROM PROGRAM 'export LC_ALL=C \u0026\u0026 printf \"\\f\" \u0026\u0026 pgbackrest info --log-level-console=info --log-level-stderr=warn --output=json --stanza=db \u0026\u0026 printf \"\\f\"'\nWITH (FORMAT csv, HEADER false, QUOTE E'\\f');\n\nWITH\nall_backups (data) AS (\n SELECT jsonb_array_elements(to_jsonb(data)) FROM pgbackrest_info\n),\nstanza_backups (stanza, backup) AS (\n SELECT data-\u003e\u003e'name', jsonb_array_elements(data-\u003e'backup') FROM all_backups\n),\nordered_backups (stanza, backup, seq_oldest, seq_newest) AS (\n SELECT stanza, backup,\n ROW_NUMBER() OVER (\n PARTITION BY stanza, backup-\u003e'database'-\u003e\u003e'repo-key', backup-\u003e\u003e'type'\n ORDER BY backup-\u003e'timestamp'-\u003e\u003e'start' ASC, backup-\u003e'timestamp'-\u003e\u003e'stop' ASC\n ),\n ROW_NUMBER() OVER (\n PARTITION BY stanza, backup-\u003e'database'-\u003e\u003e'repo-key', backup-\u003e\u003e'type'\n ORDER BY backup-\u003e'timestamp'-\u003e\u003e'start' DESC, backup-\u003e'timestamp'-\u003e\u003e'stop' DESC\n )\n FROM stanza_backups\n),\n\nccp_backrest_last_info AS (\n SELECT\n stanza,\n split_part(backup-\u003e'backrest'-\u003e\u003e'version', '.', 1) || lpad(split_part(backup-\u003e'backrest'-\u003e\u003e'version', '.', 2), 2, '0') || lpad(coalesce(nullif(split_part(backup-\u003e'backrest'-\u003e\u003e'version', '.', 3), ''), '00'), 2, '0') AS backrest_repo_version,\n backup-\u003e'database'-\u003e\u003e'repo-key' AS repo,\n backup-\u003e\u003e'type' AS backup_type,\n backup-\u003e'info'-\u003e'repository'-\u003e\u003e'delta' AS repo_backup_size_bytes,\n backup-\u003e'info'-\u003e'repository'-\u003e\u003e'size' AS repo_total_size_bytes,\n (backup-\u003e'timestamp'-\u003e\u003e'stop')::bigint - (backup-\u003e'timestamp'-\u003e\u003e'start')::bigint AS backup_runtime_seconds,\n CASE WHEN backup-\u003e\u003e'error' = 'true' THEN 1 ELSE 0 END AS backup_error\n FROM ordered_backups\n WHERE seq_newest = 1\n),\n\nccp_backrest_oldest_full_backup AS (\n SELECT\n stanza,\n backup-\u003e'database'-\u003e\u003e'repo-key' AS repo,\n 
min((backup-\u003e'timestamp'-\u003e\u003e'stop')::bigint) AS time_seconds\n FROM ordered_backups\n WHERE seq_oldest = 1 AND backup-\u003e\u003e'type' IN ('full')\n GROUP BY 1,2\n),\n\nccp_backrest_last_full_backup AS (\n SELECT\n stanza,\n backup-\u003e'database'-\u003e\u003e'repo-key' AS repo,\n EXTRACT(EPOCH FROM CURRENT_TIMESTAMP)::bigint - max((backup-\u003e'timestamp'-\u003e\u003e'stop')::bigint) AS time_since_completion_seconds\n FROM ordered_backups\n WHERE seq_newest = 1 AND backup-\u003e\u003e'type' IN ('full')\n GROUP BY 1,2\n),\n\nccp_backrest_last_diff_backup AS (\n SELECT\n stanza,\n backup-\u003e'database'-\u003e\u003e'repo-key' AS repo,\n EXTRACT(EPOCH FROM CURRENT_TIMESTAMP)::bigint - max((backup-\u003e'timestamp'-\u003e\u003e'stop')::bigint) AS time_since_completion_seconds\n FROM ordered_backups\n WHERE seq_newest = 1 AND backup-\u003e\u003e'type' IN ('full','diff')\n GROUP BY 1,2\n),\n\nccp_backrest_last_incr_backup AS (\n SELECT\n stanza,\n backup-\u003e'database'-\u003e\u003e'repo-key' AS repo,\n EXTRACT(EPOCH FROM CURRENT_TIMESTAMP)::bigint - max((backup-\u003e'timestamp'-\u003e\u003e'stop')::bigint) AS time_since_completion_seconds\n FROM ordered_backups\n WHERE seq_newest = 1 AND backup-\u003e\u003e'type' IN ('full','diff','incr')\n GROUP BY 1,2\n)\n\nSELECT\n ccp_backrest_last_diff_backup.time_since_completion_seconds as last_diff_backup,\n ccp_backrest_last_full_backup.time_since_completion_seconds as last_full_backup,\n ccp_backrest_last_incr_backup.time_since_completion_seconds as last_incr_backup,\n ccp_backrest_last_info.backrest_repo_version as last_info_backrest_repo_version,\n ccp_backrest_last_info.backup_error as last_info_backup_error,\n ccp_backrest_last_info.backup_type as backup_type,\n ccp_backrest_last_info.backup_runtime_seconds as backup_runtime_seconds,\n ccp_backrest_last_info.repo_backup_size_bytes as repo_backup_size_bytes,\n ccp_backrest_last_info.repo_total_size_bytes as repo_total_size_bytes,\n ccp_backrest_oldest_full_backup.time_seconds as oldest_full_backup,\n ccp_backrest_last_incr_backup.repo as repo\n \nFROM\n ccp_backrest_last_diff_backup\n , ccp_backrest_last_full_backup\n , ccp_backrest_last_incr_backup\n , ccp_backrest_last_info\n , ccp_backrest_oldest_full_backup;\n"}] +[{"metrics":[{"attribute_columns":["repo"],"description":"Seconds since the last completed full or differential backup. 
Differential is always based off last full.","metric_name":"ccp_backrest_last_diff_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_diff_backup"},{"attribute_columns":["repo"],"description":"Seconds since the last completed full backup","metric_name":"ccp_backrest_last_full_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_full_backup"},{"attribute_columns":["repo"],"description":"Seconds since the last completed full, differential or incremental backup.\nIncremental is always based off last full or differential.\n","metric_name":"ccp_backrest_last_incr_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_incr_backup"},{"attribute_columns":["backup_type","repo"],"description":"pgBackRest version number when this backup was performed","metric_name":"ccp_backrest_last_info_backrest_repo_version","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_info_backrest_repo_version"},{"attribute_columns":["backup_type","repo"],"description":"An error has been encountered in the backup. Check logs for more information.","metric_name":"ccp_backrest_last_info_backup_error","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_info_backup_error"},{"attribute_columns":["backup_type","repo"],"description":"Total runtime in seconds of this backup","metric_name":"ccp_backrest_last_info_backup_runtime_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"backup_runtime_seconds"},{"attribute_columns":["backup_type","repo"],"description":"Actual size of only this individual backup in the pgbackrest repository","metric_name":"ccp_backrest_last_info_repo_backup_size_bytes","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"repo_backup_size_bytes"},{"attribute_columns":["backup_type","repo"],"description":"Total size of this backup in the pgbackrest repository, including all required previous backups and WAL","metric_name":"ccp_backrest_last_info_repo_total_size_bytes","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"repo_total_size_bytes"},{"attribute_columns":["repo"],"description":"Seconds since the oldest completed full backup","metric_name":"ccp_backrest_oldest_full_backup_time_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"oldest_full_backup"}],"sql":"SELECT * FROM get_pgbackrest_info();\n"}] diff --git a/internal/collector/generated/postgres_5s_metrics.json b/internal/collector/generated/postgres_5s_metrics.json index bc89b821e5..5990f1e743 100644 --- a/internal/collector/generated/postgres_5s_metrics.json +++ b/internal/collector/generated/postgres_5s_metrics.json @@ -1 +1 @@ -[{"metrics":[{"attribute_columns":["application_name","datname","state","usename"],"description":"number of connections in this state","metric_name":"ccp_pg_stat_activity_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT\n pg_database.datname,\n tmp.state,\n tmp2.usename,\n tmp2.application_name,\n COALESCE(count,0) as count,\n COALESCE(max_tx_duration,0) as max_tx_duration\nFROM\n (\n VALUES ('active'),\n ('idle'),\n ('idle in transaction'),\n ('idle in transaction (aborted)'),\n ('fastpath function call'),\n ('disabled')\n ) AS tmp(state) CROSS JOIN pg_database\nLEFT JOIN (\n SELECT\n datname,\n state,\n 
usename,\n application_name,\n count(*) AS count,\n MAX(EXTRACT(EPOCH FROM now() - xact_start))::float AS max_tx_duration\n FROM pg_stat_activity GROUP BY datname,state,usename,application_name) AS tmp2\n ON tmp.state = tmp2.state AND pg_database.datname = tmp2.datname;\n"},{"metrics":[{"description":"Seconds since the last successful archive operation","metric_name":"ccp_archive_command_status_seconds_since_last_archive","static_attributes":{"server":"localhost:5432"},"value_column":"seconds_since_last_archive","value_type":"double"}],"sql":"SELECT EXTRACT(epoch from (CURRENT_TIMESTAMP - last_archived_time)) AS seconds_since_last_archive FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Number of WAL files that have been successfully archived","metric_name":"ccp_archive_command_status_archived_count","static_attributes":{"server":"localhost:5432"},"value_column":"archived_count"}],"sql":"SELECT archived_count FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Number of failed attempts for archiving WAL files","metric_name":"ccp_archive_command_status_failed_count","static_attributes":{"server":"localhost:5432"},"value_column":"failed_count"}],"sql":"SELECT failed_count FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Seconds since the last recorded failure of the archive_command","metric_name":"ccp_archive_command_status_seconds_since_last_fail","static_attributes":{"server":"localhost:5432"},"value_column":"seconds_since_last_fail"}],"sql":"SELECT CASE\n WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) IS NULL THEN 0\n WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) \u003c 0 THEN 0\n ELSE EXTRACT(epoch from (last_failed_time - last_archived_time))\n END AS seconds_since_last_fail\nFROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Total non-idle connections","metric_name":"ccp_connection_stats_active","static_attributes":{"server":"localhost:5432"},"value_column":"active"},{"description":"Total idle connections","metric_name":"ccp_connection_stats_idle","static_attributes":{"server":"localhost:5432"},"value_column":"idle"},{"description":"Total idle in transaction connections","metric_name":"ccp_connection_stats_idle_in_txn","static_attributes":{"server":"localhost:5432"},"value_column":"idle_in_txn"},{"description":"Value of max_connections for the monitored database","metric_name":"ccp_connection_stats_max_blocked_query_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_blocked_query_time","value_type":"double"},{"description":"Value of max_connections for the monitored database","metric_name":"ccp_connection_stats_max_connections","static_attributes":{"server":"localhost:5432"},"value_column":"max_connections"},{"description":"Length of time in seconds of the longest idle in transaction session","metric_name":"ccp_connection_stats_max_idle_in_txn_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_idle_in_txn_time","value_type":"double"},{"description":"Length of time in seconds of the longest running query","metric_name":"ccp_connection_stats_max_query_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_query_time","value_type":"double"},{"description":"Total idle and non-idle connections","metric_name":"ccp_connection_stats_total","static_attributes":{"server":"localhost:5432"},"value_column":"total"}],"sql":"SELECT ((total - idle) - idle_in_txn) as active\n , total\n , idle\n , idle_in_txn\n , (SELECT COALESCE(EXTRACT(epoch 
FROM (MAX(clock_timestamp() - state_change))),0) FROM pg_catalog.pg_stat_activity WHERE state = 'idle in transaction') AS max_idle_in_txn_time\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND state \u003c\u003e 'idle' ) AS max_query_time\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND wait_event_type = 'Lock' ) AS max_blocked_query_time\n , max_connections\n FROM (\n SELECT COUNT(*) as total\n , COALESCE(SUM(CASE WHEN state = 'idle' THEN 1 ELSE 0 END),0) AS idle\n , COALESCE(SUM(CASE WHEN state = 'idle in transaction' THEN 1 ELSE 0 END),0) AS idle_in_txn FROM pg_catalog.pg_stat_activity) x\n JOIN (SELECT setting::float AS max_connections FROM pg_settings WHERE name = 'max_connections') xx ON (true);\n"},{"metrics":[{"attribute_columns":["dbname"],"description":"Total number of checksum failures on this database","metric_name":"ccp_data_checksum_failure_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"},{"attribute_columns":["dbname"],"description":"Time interval in seconds since the last checksum failure was encountered","metric_name":"ccp_data_checksum_failure_time_since_last_failure_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"time_since_last_failure_seconds","value_type":"double"}],"sql":"SELECT datname AS dbname , checksum_failures AS count , coalesce(extract(epoch from (clock_timestamp() - checksum_last_failure)), 0) AS time_since_last_failure_seconds FROM pg_catalog.pg_stat_database WHERE pg_stat_database.datname IS NOT NULL;\n"},{"metrics":[{"attribute_columns":["dbname","mode"],"description":"Return value of 1 means database is in recovery. 
Otherwise 2 it is a primary.","metric_name":"ccp_locks_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT pg_database.datname as dbname , tmp.mode , COALESCE(count,0) as count FROM (\n VALUES ('accesssharelock'),\n ('rowsharelock'),\n ('rowexclusivelock'),\n ('shareupdateexclusivelock'),\n ('sharelock'),\n ('sharerowexclusivelock'),\n ('exclusivelock'),\n ('accessexclusivelock')\n) AS tmp(mode) CROSS JOIN pg_catalog.pg_database LEFT JOIN\n (SELECT database, lower(mode) AS mode,count(*) AS count\n FROM pg_catalog.pg_locks WHERE database IS NOT NULL\n GROUP BY database, lower(mode)\n) AS tmp2 ON tmp.mode=tmp2.mode and pg_database.oid = tmp2.database;\n"},{"metrics":[{"description":"CPU limit value in milli cores","metric_name":"ccp_nodemx_cpu_limit","static_attributes":{"server":"localhost:5432"},"value_column":"limit"},{"description":"CPU request value in milli cores","metric_name":"ccp_nodemx_cpu_request","static_attributes":{"server":"localhost:5432"},"value_column":"request"}],"sql":"SELECT monitor.kdapi_scalar_bigint('cpu_request') AS request , monitor.kdapi_scalar_bigint('cpu_limit') AS limit\n"},{"metrics":[{"description":"CPU usage in nanoseconds","metric_name":"ccp_nodemx_cpuacct_usage","static_attributes":{"server":"localhost:5432"},"value_column":"usage","value_type":"double"},{"description":"CPU usage snapshot timestamp","metric_name":"ccp_nodemx_cpuacct_usage_ts","static_attributes":{"server":"localhost:5432"},"value_column":"usage_ts","value_type":"double"}],"sql":"SELECT CASE WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('cpuacct.usage')\n ELSE (SELECT val FROM monitor.cgroup_setof_kv('cpu.stat') where key = 'usage_usec') * 1000\n END AS usage,\n extract(epoch from clock_timestamp()) AS usage_ts;\n"},{"metrics":[{"description":"The total available run-time within a period (in microseconds)","metric_name":"ccp_nodemx_cpucfs_period_us","static_attributes":{"server":"localhost:5432"},"value_column":"period_us"},{"description":"The length of a period (in microseconds)","metric_name":"ccp_nodemx_cpucfs_quota_us","static_attributes":{"server":"localhost:5432"},"value_column":"quota_us","value_type":"double"}],"sql":"SELECT\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n monitor.cgroup_scalar_bigint('cpu.cfs_period_us')\n ELSE\n (monitor.cgroup_array_bigint('cpu.max'))[2]\n END AS period_us,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n GREATEST(monitor.cgroup_scalar_bigint('cpu.cfs_quota_us'), 0)\n ELSE\n GREATEST((monitor.cgroup_array_bigint('cpu.max'))[1], 0)\n END AS quota_us;\n"},{"metrics":[{"description":"Number of periods that any thread was runnable","metric_name":"ccp_nodemx_cpustat_nr_periods","static_attributes":{"server":"localhost:5432"},"value_column":"nr_periods","value_type":"double"},{"description":"Number of runnable periods in which the application used its entire quota and was throttled","metric_name":"ccp_nodemx_cpustat_nr_throttled","static_attributes":{"server":"localhost:5432"},"value_column":"nr_throttled"},{"description":"CPU stat snapshot timestamp","metric_name":"ccp_nodemx_cpustat_snap_ts","static_attributes":{"server":"localhost:5432"},"value_column":"snap_ts","value_type":"double"},{"description":"Sum total amount of time individual threads within the monitor.cgroup were throttled","metric_name":"ccp_nodemx_cpustat_throttled_time","static_attributes":{"server":"localhost:5432"},"value_column":"throttled_time","value_type":"double"}],"sql":"WITH d(key, val) AS (select 
key, val from monitor.cgroup_setof_kv('cpu.stat')) SELECT\n (SELECT val FROM d WHERE key='nr_periods') AS nr_periods,\n (SELECT val FROM d WHERE key='nr_throttled') AS nr_throttled,\n (SELECT val FROM d WHERE key='throttled_usec') AS throttled_time,\n extract(epoch from clock_timestamp()) as snap_ts;\n"},{"metrics":[{"attribute_columns":["fs_type","mount_point"],"description":"Available size in bytes","metric_name":"ccp_nodemx_data_disk_available_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"available_bytes","value_type":"double"},{"attribute_columns":["fs_type","mount_point"],"description":"Available file nodes","metric_name":"ccp_nodemx_data_disk_free_file_nodes","static_attributes":{"server":"localhost:5432"},"value_column":"free_file_nodes"},{"attribute_columns":["fs_type","mount_point"],"description":"Size in bytes","metric_name":"ccp_nodemx_data_disk_total_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"total_bytes"},{"attribute_columns":["fs_type","mount_point"],"description":"Total file nodes","metric_name":"ccp_nodemx_data_disk_total_file_nodes","static_attributes":{"server":"localhost:5432"},"value_column":"total_file_nodes"}],"sql":"SELECT mount_point,fs_type,total_bytes,available_bytes,total_file_nodes,free_file_nodes\n FROM monitor.proc_mountinfo() m\n JOIN monitor.fsinfo(m.mount_point) f USING (major_number, minor_number)\n WHERE m.mount_point IN ('/pgdata', '/pgwal') OR\n m.mount_point like '/tablespaces/%'\n"},{"metrics":[{"attribute_columns":["mount_point"],"description":"Total sectors read","metric_name":"ccp_nodemx_disk_activity_sectors_read","static_attributes":{"server":"localhost:5432"},"value_column":"sectors_read"},{"attribute_columns":["mount_point"],"description":"Total sectors written","metric_name":"ccp_nodemx_disk_activity_sectors_written","static_attributes":{"server":"localhost:5432"},"value_column":"sectors_written"}],"sql":"SELECT mount_point,sectors_read,sectors_written\n FROM monitor.proc_mountinfo() m\n JOIN monitor.proc_diskstats() d USING (major_number, minor_number)\n WHERE m.mount_point IN ('/pgdata', '/pgwal') OR\n m.mount_point like '/tablespaces/%';\n"},{"metrics":[{"description":"Total bytes of anonymous and swap cache memory on active LRU list","metric_name":"ccp_nodemx_mem_active_anon","static_attributes":{"server":"localhost:5432"},"value_column":"active_anon","value_type":"double"},{"description":"Total bytes of file-backed memory on active LRU list","metric_name":"ccp_nodemx_mem_active_file","static_attributes":{"server":"localhost:5432"},"value_column":"active_file","value_type":"double"},{"description":"Total bytes of page cache memory","metric_name":"ccp_nodemx_mem_cache","static_attributes":{"server":"localhost:5432"},"value_column":"cache","value_type":"double"},{"description":"Total bytes that are waiting to get written back to the disk","metric_name":"ccp_nodemx_mem_dirty","static_attributes":{"server":"localhost:5432"},"value_column":"dirty"},{"description":"Total bytes of anonymous and swap cache memory on inactive LRU list","metric_name":"ccp_nodemx_mem_inactive_anon","static_attributes":{"server":"localhost:5432"},"value_column":"inactive_anon","value_type":"double"},{"description":"Total bytes of file-backed memory on inactive LRU list","metric_name":"ccp_nodemx_mem_inactive_file","static_attributes":{"server":"localhost:5432"},"value_column":"inactive_file","value_type":"double"},{"description":"Unknown metric from 
ccp_nodemx_mem","metric_name":"ccp_nodemx_mem_kmem_usage_in_byte","static_attributes":{"server":"localhost:5432"},"value_column":"kmem_usage_in_byte"},{"description":"Memory limit value in bytes","metric_name":"ccp_nodemx_mem_limit","static_attributes":{"server":"localhost:5432"},"value_column":"limit"},{"description":"Total bytes of mapped file (includes tmpfs/shmem)","metric_name":"ccp_nodemx_mem_mapped_file","static_attributes":{"server":"localhost:5432"},"value_column":"mapped_file"},{"description":"Memory request value in bytes","metric_name":"ccp_nodemx_mem_request","static_attributes":{"server":"localhost:5432"},"value_column":"request"},{"description":"Total bytes of anonymous and swap cache memory","metric_name":"ccp_nodemx_mem_rss","static_attributes":{"server":"localhost:5432"},"value_column":"rss","value_type":"double"},{"description":"Total bytes of shared memory","metric_name":"ccp_nodemx_mem_shmem","static_attributes":{"server":"localhost:5432"},"value_column":"shmem","value_type":"double"},{"description":"Total usage in bytes","metric_name":"ccp_nodemx_mem_usage_in_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"usage_in_bytes"}],"sql":"WITH d(key, val) as (SELECT key, val FROM monitor.cgroup_setof_kv('memory.stat')) SELECT\n monitor.kdapi_scalar_bigint('mem_request') AS request,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n (CASE WHEN monitor.cgroup_scalar_bigint('memory.limit_in_bytes') = 9223372036854771712 THEN 0 ELSE monitor.cgroup_scalar_bigint('memory.limit_in_bytes') END)\n ELSE\n (CASE WHEN monitor.cgroup_scalar_bigint('memory.max') = 9223372036854775807 THEN 0 ELSE monitor.cgroup_scalar_bigint('memory.max') END)\n END AS limit,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='cache')\n ELSE 0\n END as cache,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='rss')\n ELSE 0\n END as RSS,\n (SELECT val FROM d WHERE key='shmem') as shmem,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='mapped_file')\n ELSE 0\n END as mapped_file,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='dirty')\n ELSE (SELECT val FROM d WHERE key='file_dirty')\n END as dirty,\n (SELECT val FROM d WHERE key='active_anon') as active_anon,\n (SELECT val FROM d WHERE key='inactive_anon') as inactive_anon,\n (SELECT val FROM d WHERE key='active_file') as active_file,\n (SELECT val FROM d WHERE key='inactive_file') as inactive_file,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('memory.usage_in_bytes')\n ELSE monitor.cgroup_scalar_bigint('memory.current')\n END as usage_in_bytes,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('memory.kmem.usage_in_bytes')\n ELSE 0\n END as kmem_usage_in_byte;\n"},{"metrics":[{"attribute_columns":["interface"],"description":"Number of bytes received","metric_name":"ccp_nodemx_network_rx_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"rx_bytes"},{"attribute_columns":["interface"],"description":"Number of packets received","metric_name":"ccp_nodemx_network_rx_packets","static_attributes":{"server":"localhost:5432"},"value_column":"rx_packets"},{"attribute_columns":["interface"],"description":"Number of bytes transmitted","metric_name":"ccp_nodemx_network_tx_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"tx_bytes"},{"attribute_columns":["interface"],"description":"Number of packets 
transmitted","metric_name":"ccp_nodemx_network_tx_packets","static_attributes":{"server":"localhost:5432"},"value_column":"tx_packets"}],"sql":"SELECT interface\n ,tx_bytes\n ,tx_packets\n ,rx_bytes\n ,rx_packets from monitor.proc_network_stats()\n"},{"metrics":[{"description":"Total number of database processes","metric_name":"ccp_nodemx_process_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT monitor.cgroup_process_count() as count;\n"},{"metrics":[{"description":"Epoch time when stats were reset","metric_name":"ccp_pg_stat_statements_reset_time","static_attributes":{"server":"localhost:5432"},"value_column":"time"}],"sql":"SELECT monitor.pg_stat_statements_reset_info(-1) as time;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Average query runtime in milliseconds","metric_name":"ccp_pg_stat_statements_top_mean_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"top_mean_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time\n , s.mean_exec_time AS mean_exec_time\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , max(monitor.mean_exec_time) AS top_mean_exec_time_ms\nFROM monitor GROUP BY 1,2,3,4 ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","role"],"description":"Total number of queries run per user/database","metric_name":"ccp_pg_stat_statements_total_calls_count","static_attributes":{"server":"localhost:5432"},"value_column":"calls_count","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total runtime of all queries per user/database","metric_name":"ccp_pg_stat_statements_total_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"exec_time_ms","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total runtime of all queries per user/database","metric_name":"ccp_pg_stat_statements_total_mean_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"mean_exec_time_ms","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total rows returned from all queries per user/database","metric_name":"ccp_pg_stat_statements_total_row_count","static_attributes":{"server":"localhost:5432"},"value_column":"row_count","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.calls\n , s.total_exec_time\n , s.mean_exec_time\n , s.rows\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , sum(calls) AS calls_count\n , sum(total_exec_time) AS exec_time_ms\n , avg(mean_exec_time) AS mean_exec_time_ms\n , sum(rows) AS row_count\nFROM monitor GROUP BY 1,2;\n"},{"metrics":[{"description":"The current version of PostgreSQL that this exporter is running on as a 6 digit integer (######).","metric_name":"ccp_postgresql_version_current","static_attributes":{"server":"localhost:5432"},"value_column":"current"}],"sql":"SELECT current_setting('server_version_num')::int AS current;\n"},{"metrics":[{"description":"Time interval 
in seconds since PostgreSQL database was last restarted.","metric_name":"ccp_postmaster_uptime_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"seconds","value_type":"double"}],"sql":"SELECT extract(epoch from (clock_timestamp() - pg_postmaster_start_time() )) AS seconds;\n"},{"metrics":[{"description":"Time interval in seconds since PostgreSQL database was last restarted.","metric_name":"ccp_replication_lag_size_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"bytes","value_type":"double"}],"sql":"SELECT pg_wal_lsn_diff(sent_lsn, replay_lsn) AS bytes\n FROM pg_catalog.pg_stat_replication;\n"},{"metrics":[{"attribute_columns":["role"],"description":"Length of time since the last WAL file was received and replayed on replica.\nAlways increases, possibly causing false positives if the primary stops writing.\nMonitors for replicas that stop receiving WAL all together.\n","metric_name":"ccp_replication_lag_received_time","static_attributes":{"server":"localhost:5432"},"value_column":"received_time","value_type":"double"},{"attribute_columns":["role"],"description":"Length of time since the last transaction was replayed on replica.\nReturns zero if last WAL received equals last WAL replayed. Avoids\nfalse positives when primary stops writing. Monitors for replicas that\ncannot keep up with primary WAL generation.\n","metric_name":"ccp_replication_lag_replay_time","static_attributes":{"server":"localhost:5432"},"value_column":"replay_time","value_type":"double"}],"sql":"SELECT\n CASE\n WHEN (pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn()) OR (pg_is_in_recovery() = false) THEN 0\n ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER\n END\nAS replay_time , CASE\n WHEN pg_is_in_recovery() = false THEN 0\n ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER\n END\nAS received_time , CASE\n WHEN pg_is_in_recovery() = true THEN 'replica'\n ELSE 'primary'\n END\nAS role;\n"},{"metrics":[{"description":"Number of settings from pg_settings catalog in a pending_restart state","metric_name":"ccp_settings_pending_restart_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT count(*) AS count FROM pg_catalog.pg_settings WHERE pending_restart = true;\n"},{"metrics":[{"description":"Number of buffers allocated","metric_name":"ccp_stat_bgwriter_buffers_alloc","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_alloc"},{"data_type":"sum","description":"Number of buffers written by the background writer","metric_name":"ccp_stat_bgwriter_buffers_clean","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_clean"},{"description":"Number of times the background writer stopped a cleaning scan because it had written too many buffers","metric_name":"ccp_stat_bgwriter_maxwritten_clean","static_attributes":{"server":"localhost:5432"},"value_column":"maxwritten_clean"}],"sql":"SELECT\n buffers_clean\n , maxwritten_clean\n , buffers_alloc\nFROM pg_catalog.pg_stat_bgwriter;\n"},{"metrics":[{"description":"Oldest current transaction ID in cluster","metric_name":"ccp_transaction_wraparound_oldest_current_xid","static_attributes":{"server":"localhost:5432"},"value_column":"oldest_current_xid"},{"description":"Percentage towards emergency autovacuum process 
starting","metric_name":"ccp_transaction_wraparound_percent_towards_emergency_autovac","static_attributes":{"server":"localhost:5432"},"value_column":"percent_towards_emergency_autovac"},{"description":"Percentage towards transaction ID wraparound","metric_name":"ccp_transaction_wraparound_percent_towards_wraparound","static_attributes":{"server":"localhost:5432"},"value_column":"percent_towards_wraparound"}],"sql":"WITH max_age AS (\n SELECT 2000000000 as max_old_xid\n , setting AS autovacuum_freeze_max_age\n FROM pg_catalog.pg_settings\n WHERE name = 'autovacuum_freeze_max_age')\n, per_database_stats AS (\n SELECT datname\n , m.max_old_xid::int\n , m.autovacuum_freeze_max_age::int\n , age(d.datfrozenxid) AS oldest_current_xid\n FROM pg_catalog.pg_database d\n JOIN max_age m ON (true)\n WHERE d.datallowconn)\nSELECT max(oldest_current_xid) AS oldest_current_xid , max(ROUND(100*(oldest_current_xid/max_old_xid::float))) AS percent_towards_wraparound , max(ROUND(100*(oldest_current_xid/autovacuum_freeze_max_age::float))) AS percent_towards_emergency_autovac FROM per_database_stats;\n"},{"metrics":[{"description":"Current size in bytes of the WAL directory","metric_name":"ccp_wal_activity_total_size_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"total_size_bytes"}],"sql":"SELECT last_5_min_size_bytes,\n (SELECT COALESCE(sum(size),0) FROM pg_catalog.pg_ls_waldir()) AS total_size_bytes\n FROM (SELECT COALESCE(sum(size),0) AS last_5_min_size_bytes FROM pg_catalog.pg_ls_waldir() WHERE modification \u003e CURRENT_TIMESTAMP - '5 minutes'::interval) x;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Epoch time when stats were reset","metric_name":"ccp_pg_stat_statements_top_max_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"max_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time_ms\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , max_exec_time_ms\n , records\nFROM monitor ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Total time spent in the statement in milliseconds","metric_name":"ccp_pg_stat_statements_top_total_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"total_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time_ms\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , total_exec_time_ms\n , records\nFROM monitor ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Total amount of WAL generated by the statement in 
bytes","metric_name":"ccp_pg_stat_statements_top_wal_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"bytes","value_type":"double"},{"attribute_columns":["dbname","query","queryid","role"],"description":"Total number of WAL full page images generated by the statement","metric_name":"ccp_pg_stat_statements_top_wal_fpi","static_attributes":{"server":"localhost:5432"},"value_column":"fpi","value_type":"double"},{"attribute_columns":["dbname","query","queryid","role"],"description":"Total number of WAL records generated by the statement","metric_name":"ccp_pg_stat_statements_top_wal_records","static_attributes":{"server":"localhost:5432"},"value_column":"records","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time\n , s.mean_exec_time AS mean_exec_time\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , query\n , queryid\n , records\n , fpi\n , bytes\nFROM monitor ORDER BY bytes DESC LIMIT 20;\n"}] +[{"metrics":[{"attribute_columns":["application_name","datname","state","usename"],"description":"number of connections in this state","metric_name":"ccp_pg_stat_activity_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT\n pg_database.datname,\n tmp.state,\n COALESCE(tmp2.usename, '') as usename,\n COALESCE(tmp2.application_name, '') as application_name,\n COALESCE(count,0) as count,\n COALESCE(max_tx_duration,0) as max_tx_duration\nFROM\n (\n VALUES ('active'),\n ('idle'),\n ('idle in transaction'),\n ('idle in transaction (aborted)'),\n ('fastpath function call'),\n ('disabled')\n ) AS tmp(state) CROSS JOIN pg_database\nLEFT JOIN (\n SELECT\n datname,\n state,\n usename,\n application_name,\n count(*) AS count,\n MAX(EXTRACT(EPOCH FROM now() - xact_start))::float AS max_tx_duration\n FROM pg_stat_activity GROUP BY datname,state,usename,application_name) AS tmp2\n ON tmp.state = tmp2.state AND pg_database.datname = tmp2.datname;\n"},{"metrics":[{"description":"Seconds since the last successful archive operation","metric_name":"ccp_archive_command_status_seconds_since_last_archive","static_attributes":{"server":"localhost:5432"},"value_column":"seconds_since_last_archive","value_type":"double"}],"sql":"SELECT EXTRACT(epoch from (CURRENT_TIMESTAMP - last_archived_time)) AS seconds_since_last_archive FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Number of WAL files that have been successfully archived","metric_name":"ccp_archive_command_status_archived_count","static_attributes":{"server":"localhost:5432"},"value_column":"archived_count"}],"sql":"SELECT archived_count FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Number of failed attempts for archiving WAL files","metric_name":"ccp_archive_command_status_failed_count","static_attributes":{"server":"localhost:5432"},"value_column":"failed_count"}],"sql":"SELECT failed_count FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Seconds since the last recorded failure of the archive_command","metric_name":"ccp_archive_command_status_seconds_since_last_fail","static_attributes":{"server":"localhost:5432"},"value_column":"seconds_since_last_fail"}],"sql":"SELECT 
CASE\n WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) IS NULL THEN 0\n WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) \u003c 0 THEN 0\n ELSE EXTRACT(epoch from (last_failed_time - last_archived_time))\n END AS seconds_since_last_fail\nFROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Total non-idle connections","metric_name":"ccp_connection_stats_active","static_attributes":{"server":"localhost:5432"},"value_column":"active"},{"description":"Total idle connections","metric_name":"ccp_connection_stats_idle","static_attributes":{"server":"localhost:5432"},"value_column":"idle"},{"description":"Total idle in transaction connections","metric_name":"ccp_connection_stats_idle_in_txn","static_attributes":{"server":"localhost:5432"},"value_column":"idle_in_txn"},{"description":"Value of max_connections for the monitored database","metric_name":"ccp_connection_stats_max_blocked_query_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_blocked_query_time","value_type":"double"},{"description":"Value of max_connections for the monitored database","metric_name":"ccp_connection_stats_max_connections","static_attributes":{"server":"localhost:5432"},"value_column":"max_connections"},{"description":"Length of time in seconds of the longest idle in transaction session","metric_name":"ccp_connection_stats_max_idle_in_txn_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_idle_in_txn_time","value_type":"double"},{"description":"Length of time in seconds of the longest running query","metric_name":"ccp_connection_stats_max_query_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_query_time","value_type":"double"},{"description":"Total idle and non-idle connections","metric_name":"ccp_connection_stats_total","static_attributes":{"server":"localhost:5432"},"value_column":"total"}],"sql":"SELECT ((total - idle) - idle_in_txn) as active\n , total\n , idle\n , idle_in_txn\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - state_change))),0) FROM pg_catalog.pg_stat_activity WHERE state = 'idle in transaction') AS max_idle_in_txn_time\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND state \u003c\u003e 'idle' ) AS max_query_time\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND wait_event_type = 'Lock' ) AS max_blocked_query_time\n , max_connections\n FROM (\n SELECT COUNT(*) as total\n , COALESCE(SUM(CASE WHEN state = 'idle' THEN 1 ELSE 0 END),0) AS idle\n , COALESCE(SUM(CASE WHEN state = 'idle in transaction' THEN 1 ELSE 0 END),0) AS idle_in_txn FROM pg_catalog.pg_stat_activity) x\n JOIN (SELECT setting::float AS max_connections FROM pg_settings WHERE name = 'max_connections') xx ON (true);\n"},{"metrics":[{"attribute_columns":["dbname"],"description":"Total number of checksum failures on this database","metric_name":"ccp_data_checksum_failure_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"},{"attribute_columns":["dbname"],"description":"Time interval in seconds since the last checksum failure was encountered","metric_name":"ccp_data_checksum_failure_time_since_last_failure_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"time_since_last_failure_seconds","value_type":"double"}],"sql":"SELECT datname AS dbname , 
checksum_failures AS count , coalesce(extract(epoch from (clock_timestamp() - checksum_last_failure)), 0) AS time_since_last_failure_seconds FROM pg_catalog.pg_stat_database WHERE pg_stat_database.datname IS NOT NULL;\n"},{"metrics":[{"attribute_columns":["dbname","mode"],"description":"Return value of 1 means database is in recovery. Otherwise 2 it is a primary.","metric_name":"ccp_locks_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT pg_database.datname as dbname , tmp.mode , COALESCE(count,0) as count FROM (\n VALUES ('accesssharelock'),\n ('rowsharelock'),\n ('rowexclusivelock'),\n ('shareupdateexclusivelock'),\n ('sharelock'),\n ('sharerowexclusivelock'),\n ('exclusivelock'),\n ('accessexclusivelock')\n) AS tmp(mode) CROSS JOIN pg_catalog.pg_database LEFT JOIN\n (SELECT database, lower(mode) AS mode,count(*) AS count\n FROM pg_catalog.pg_locks WHERE database IS NOT NULL\n GROUP BY database, lower(mode)\n) AS tmp2 ON tmp.mode=tmp2.mode and pg_database.oid = tmp2.database;\n"},{"metrics":[{"description":"CPU limit value in milli cores","metric_name":"ccp_nodemx_cpu_limit","static_attributes":{"server":"localhost:5432"},"value_column":"limit"},{"description":"CPU request value in milli cores","metric_name":"ccp_nodemx_cpu_request","static_attributes":{"server":"localhost:5432"},"value_column":"request"}],"sql":"SELECT monitor.kdapi_scalar_bigint('cpu_request') AS request , monitor.kdapi_scalar_bigint('cpu_limit') AS limit\n"},{"metrics":[{"description":"CPU usage in nanoseconds","metric_name":"ccp_nodemx_cpuacct_usage","static_attributes":{"server":"localhost:5432"},"value_column":"usage","value_type":"double"},{"description":"CPU usage snapshot timestamp","metric_name":"ccp_nodemx_cpuacct_usage_ts","static_attributes":{"server":"localhost:5432"},"value_column":"usage_ts","value_type":"double"}],"sql":"SELECT CASE WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('cpuacct.usage')\n ELSE (SELECT val FROM monitor.cgroup_setof_kv('cpu.stat') where key = 'usage_usec') * 1000\n END AS usage,\n extract(epoch from clock_timestamp()) AS usage_ts;\n"},{"metrics":[{"description":"The total available run-time within a period (in microseconds)","metric_name":"ccp_nodemx_cpucfs_period_us","static_attributes":{"server":"localhost:5432"},"value_column":"period_us"},{"description":"The length of a period (in microseconds)","metric_name":"ccp_nodemx_cpucfs_quota_us","static_attributes":{"server":"localhost:5432"},"value_column":"quota_us","value_type":"double"}],"sql":"SELECT\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n monitor.cgroup_scalar_bigint('cpu.cfs_period_us')\n ELSE\n (monitor.cgroup_array_bigint('cpu.max'))[2]\n END AS period_us,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n GREATEST(monitor.cgroup_scalar_bigint('cpu.cfs_quota_us'), 0)\n ELSE\n GREATEST((monitor.cgroup_array_bigint('cpu.max'))[1], 0)\n END AS quota_us;\n"},{"metrics":[{"description":"Number of periods that any thread was runnable","metric_name":"ccp_nodemx_cpustat_nr_periods","static_attributes":{"server":"localhost:5432"},"value_column":"nr_periods","value_type":"double"},{"description":"Number of runnable periods in which the application used its entire quota and was throttled","metric_name":"ccp_nodemx_cpustat_nr_throttled","static_attributes":{"server":"localhost:5432"},"value_column":"nr_throttled"},{"description":"CPU stat snapshot 
timestamp","metric_name":"ccp_nodemx_cpustat_snap_ts","static_attributes":{"server":"localhost:5432"},"value_column":"snap_ts","value_type":"double"},{"description":"Sum total amount of time individual threads within the monitor.cgroup were throttled","metric_name":"ccp_nodemx_cpustat_throttled_time","static_attributes":{"server":"localhost:5432"},"value_column":"throttled_time","value_type":"double"}],"sql":"WITH d(key, val) AS (select key, val from monitor.cgroup_setof_kv('cpu.stat')) SELECT\n (SELECT val FROM d WHERE key='nr_periods') AS nr_periods,\n (SELECT val FROM d WHERE key='nr_throttled') AS nr_throttled,\n (SELECT val FROM d WHERE key='throttled_usec') AS throttled_time,\n extract(epoch from clock_timestamp()) as snap_ts;\n"},{"metrics":[{"attribute_columns":["fs_type","mount_point"],"description":"Available size in bytes","metric_name":"ccp_nodemx_data_disk_available_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"available_bytes","value_type":"double"},{"attribute_columns":["fs_type","mount_point"],"description":"Available file nodes","metric_name":"ccp_nodemx_data_disk_free_file_nodes","static_attributes":{"server":"localhost:5432"},"value_column":"free_file_nodes"},{"attribute_columns":["fs_type","mount_point"],"description":"Size in bytes","metric_name":"ccp_nodemx_data_disk_total_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"total_bytes"},{"attribute_columns":["fs_type","mount_point"],"description":"Total file nodes","metric_name":"ccp_nodemx_data_disk_total_file_nodes","static_attributes":{"server":"localhost:5432"},"value_column":"total_file_nodes"}],"sql":"SELECT mount_point,fs_type,total_bytes,available_bytes,total_file_nodes,free_file_nodes\n FROM monitor.proc_mountinfo() m\n JOIN monitor.fsinfo(m.mount_point) f USING (major_number, minor_number)\n WHERE m.mount_point IN ('/pgdata', '/pgwal') OR\n m.mount_point like '/tablespaces/%'\n"},{"metrics":[{"attribute_columns":["mount_point"],"description":"Total sectors read","metric_name":"ccp_nodemx_disk_activity_sectors_read","static_attributes":{"server":"localhost:5432"},"value_column":"sectors_read"},{"attribute_columns":["mount_point"],"description":"Total sectors written","metric_name":"ccp_nodemx_disk_activity_sectors_written","static_attributes":{"server":"localhost:5432"},"value_column":"sectors_written"}],"sql":"SELECT mount_point,sectors_read,sectors_written\n FROM monitor.proc_mountinfo() m\n JOIN monitor.proc_diskstats() d USING (major_number, minor_number)\n WHERE m.mount_point IN ('/pgdata', '/pgwal') OR\n m.mount_point like '/tablespaces/%';\n"},{"metrics":[{"description":"Total bytes of anonymous and swap cache memory on active LRU list","metric_name":"ccp_nodemx_mem_active_anon","static_attributes":{"server":"localhost:5432"},"value_column":"active_anon","value_type":"double"},{"description":"Total bytes of file-backed memory on active LRU list","metric_name":"ccp_nodemx_mem_active_file","static_attributes":{"server":"localhost:5432"},"value_column":"active_file","value_type":"double"},{"description":"Total bytes of page cache memory","metric_name":"ccp_nodemx_mem_cache","static_attributes":{"server":"localhost:5432"},"value_column":"cache","value_type":"double"},{"description":"Total bytes that are waiting to get written back to the disk","metric_name":"ccp_nodemx_mem_dirty","static_attributes":{"server":"localhost:5432"},"value_column":"dirty"},{"description":"Total bytes of anonymous and swap cache memory on inactive LRU 
list","metric_name":"ccp_nodemx_mem_inactive_anon","static_attributes":{"server":"localhost:5432"},"value_column":"inactive_anon","value_type":"double"},{"description":"Total bytes of file-backed memory on inactive LRU list","metric_name":"ccp_nodemx_mem_inactive_file","static_attributes":{"server":"localhost:5432"},"value_column":"inactive_file","value_type":"double"},{"description":"Unknown metric from ccp_nodemx_mem","metric_name":"ccp_nodemx_mem_kmem_usage_in_byte","static_attributes":{"server":"localhost:5432"},"value_column":"kmem_usage_in_byte"},{"description":"Memory limit value in bytes","metric_name":"ccp_nodemx_mem_limit","static_attributes":{"server":"localhost:5432"},"value_column":"limit"},{"description":"Total bytes of mapped file (includes tmpfs/shmem)","metric_name":"ccp_nodemx_mem_mapped_file","static_attributes":{"server":"localhost:5432"},"value_column":"mapped_file"},{"description":"Memory request value in bytes","metric_name":"ccp_nodemx_mem_request","static_attributes":{"server":"localhost:5432"},"value_column":"request"},{"description":"Total bytes of anonymous and swap cache memory","metric_name":"ccp_nodemx_mem_rss","static_attributes":{"server":"localhost:5432"},"value_column":"rss","value_type":"double"},{"description":"Total bytes of shared memory","metric_name":"ccp_nodemx_mem_shmem","static_attributes":{"server":"localhost:5432"},"value_column":"shmem","value_type":"double"},{"description":"Total usage in bytes","metric_name":"ccp_nodemx_mem_usage_in_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"usage_in_bytes"}],"sql":"WITH d(key, val) as (SELECT key, val FROM monitor.cgroup_setof_kv('memory.stat')) SELECT\n monitor.kdapi_scalar_bigint('mem_request') AS request,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n (CASE WHEN monitor.cgroup_scalar_bigint('memory.limit_in_bytes') = 9223372036854771712 THEN 0 ELSE monitor.cgroup_scalar_bigint('memory.limit_in_bytes') END)\n ELSE\n (CASE WHEN monitor.cgroup_scalar_bigint('memory.max') = 9223372036854775807 THEN 0 ELSE monitor.cgroup_scalar_bigint('memory.max') END)\n END AS limit,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='cache')\n ELSE 0\n END as cache,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='rss')\n ELSE 0\n END as RSS,\n (SELECT val FROM d WHERE key='shmem') as shmem,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='mapped_file')\n ELSE 0\n END as mapped_file,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='dirty')\n ELSE (SELECT val FROM d WHERE key='file_dirty')\n END as dirty,\n (SELECT val FROM d WHERE key='active_anon') as active_anon,\n (SELECT val FROM d WHERE key='inactive_anon') as inactive_anon,\n (SELECT val FROM d WHERE key='active_file') as active_file,\n (SELECT val FROM d WHERE key='inactive_file') as inactive_file,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('memory.usage_in_bytes')\n ELSE monitor.cgroup_scalar_bigint('memory.current')\n END as usage_in_bytes,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('memory.kmem.usage_in_bytes')\n ELSE 0\n END as kmem_usage_in_byte;\n"},{"metrics":[{"attribute_columns":["interface"],"description":"Number of bytes received","metric_name":"ccp_nodemx_network_rx_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"rx_bytes"},{"attribute_columns":["interface"],"description":"Number of 
packets received","metric_name":"ccp_nodemx_network_rx_packets","static_attributes":{"server":"localhost:5432"},"value_column":"rx_packets"},{"attribute_columns":["interface"],"description":"Number of bytes transmitted","metric_name":"ccp_nodemx_network_tx_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"tx_bytes"},{"attribute_columns":["interface"],"description":"Number of packets transmitted","metric_name":"ccp_nodemx_network_tx_packets","static_attributes":{"server":"localhost:5432"},"value_column":"tx_packets"}],"sql":"SELECT interface\n ,tx_bytes\n ,tx_packets\n ,rx_bytes\n ,rx_packets from monitor.proc_network_stats()\n"},{"metrics":[{"description":"Total number of database processes","metric_name":"ccp_nodemx_process_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT monitor.cgroup_process_count() as count;\n"},{"metrics":[{"description":"Epoch time when stats were reset","metric_name":"ccp_pg_stat_statements_reset_time","static_attributes":{"server":"localhost:5432"},"value_column":"time"}],"sql":"SELECT monitor.pg_stat_statements_reset_info(-1) as time;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Average query runtime in milliseconds","metric_name":"ccp_pg_stat_statements_top_mean_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"top_mean_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time\n , s.mean_exec_time AS mean_exec_time\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , max(monitor.mean_exec_time) AS top_mean_exec_time_ms\nFROM monitor GROUP BY 1,2,3,4 ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","role"],"description":"Total number of queries run per user/database","metric_name":"ccp_pg_stat_statements_total_calls_count","static_attributes":{"server":"localhost:5432"},"value_column":"calls_count","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total runtime of all queries per user/database","metric_name":"ccp_pg_stat_statements_total_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"exec_time_ms","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total runtime of all queries per user/database","metric_name":"ccp_pg_stat_statements_total_mean_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"mean_exec_time_ms","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total rows returned from all queries per user/database","metric_name":"ccp_pg_stat_statements_total_row_count","static_attributes":{"server":"localhost:5432"},"value_column":"row_count","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.calls\n , s.total_exec_time\n , s.mean_exec_time\n , s.rows\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , sum(calls) AS calls_count\n , sum(total_exec_time) AS exec_time_ms\n , avg(mean_exec_time) AS mean_exec_time_ms\n , sum(rows) 
AS row_count\nFROM monitor GROUP BY 1,2;\n"},{"metrics":[{"description":"The current version of PostgreSQL that this exporter is running on as a 6 digit integer (######).","metric_name":"ccp_postgresql_version_current","static_attributes":{"server":"localhost:5432"},"value_column":"current"}],"sql":"SELECT current_setting('server_version_num')::int AS current;\n"},{"metrics":[{"description":"Time interval in seconds since PostgreSQL database was last restarted.","metric_name":"ccp_postmaster_uptime_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"seconds","value_type":"double"}],"sql":"SELECT extract(epoch from (clock_timestamp() - pg_postmaster_start_time() )) AS seconds;\n"},{"metrics":[{"description":"Time interval in seconds since PostgreSQL database was last restarted.","metric_name":"ccp_replication_lag_size_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"bytes","value_type":"double"}],"sql":"SELECT * FROM get_replication_lag();\n"},{"metrics":[{"attribute_columns":["role"],"description":"Length of time since the last WAL file was received and replayed on replica.\nAlways increases, possibly causing false positives if the primary stops writing.\nMonitors for replicas that stop receiving WAL all together.\n","metric_name":"ccp_replication_lag_received_time","static_attributes":{"server":"localhost:5432"},"value_column":"received_time","value_type":"double"},{"attribute_columns":["role"],"description":"Length of time since the last transaction was replayed on replica.\nReturns zero if last WAL received equals last WAL replayed. Avoids\nfalse positives when primary stops writing. Monitors for replicas that\ncannot keep up with primary WAL generation.\n","metric_name":"ccp_replication_lag_replay_time","static_attributes":{"server":"localhost:5432"},"value_column":"replay_time","value_type":"double"}],"sql":"SELECT\n COALESCE(\n CASE\n WHEN (pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn()) OR (pg_is_in_recovery() = false) THEN 0\n ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER\n END,\n 0\n ) AS replay_time,\n COALESCE(\n CASE\n WHEN pg_is_in_recovery() = false THEN 0\n ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER\n END,\n 0\n ) AS received_time,\n CASE\n WHEN pg_is_in_recovery() = true THEN 'replica'\n ELSE 'primary'\n END AS role;\n"},{"metrics":[{"description":"Number of settings from pg_settings catalog in a pending_restart state","metric_name":"ccp_settings_pending_restart_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT count(*) AS count FROM pg_catalog.pg_settings WHERE pending_restart = true;\n"},{"metrics":[{"description":"Number of buffers allocated","metric_name":"ccp_stat_bgwriter_buffers_alloc","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_alloc"},{"data_type":"sum","description":"Number of buffers written by the background writer","metric_name":"ccp_stat_bgwriter_buffers_clean","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_clean"},{"description":"Number of times the background writer stopped a cleaning scan because it had written too many buffers","metric_name":"ccp_stat_bgwriter_maxwritten_clean","static_attributes":{"server":"localhost:5432"},"value_column":"maxwritten_clean"}],"sql":"SELECT\n buffers_clean\n , maxwritten_clean\n , buffers_alloc\nFROM pg_catalog.pg_stat_bgwriter;\n"},{"metrics":[{"description":"Oldest current transaction ID in 
cluster","metric_name":"ccp_transaction_wraparound_oldest_current_xid","static_attributes":{"server":"localhost:5432"},"value_column":"oldest_current_xid"},{"description":"Percentage towards emergency autovacuum process starting","metric_name":"ccp_transaction_wraparound_percent_towards_emergency_autovac","static_attributes":{"server":"localhost:5432"},"value_column":"percent_towards_emergency_autovac"},{"description":"Percentage towards transaction ID wraparound","metric_name":"ccp_transaction_wraparound_percent_towards_wraparound","static_attributes":{"server":"localhost:5432"},"value_column":"percent_towards_wraparound"}],"sql":"WITH max_age AS (\n SELECT 2000000000 as max_old_xid\n , setting AS autovacuum_freeze_max_age\n FROM pg_catalog.pg_settings\n WHERE name = 'autovacuum_freeze_max_age')\n, per_database_stats AS (\n SELECT datname\n , m.max_old_xid::int\n , m.autovacuum_freeze_max_age::int\n , age(d.datfrozenxid) AS oldest_current_xid\n FROM pg_catalog.pg_database d\n JOIN max_age m ON (true)\n WHERE d.datallowconn)\nSELECT max(oldest_current_xid) AS oldest_current_xid , max(ROUND(100*(oldest_current_xid/max_old_xid::float))) AS percent_towards_wraparound , max(ROUND(100*(oldest_current_xid/autovacuum_freeze_max_age::float))) AS percent_towards_emergency_autovac FROM per_database_stats;\n"},{"metrics":[{"description":"Current size in bytes of the WAL directory","metric_name":"ccp_wal_activity_total_size_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"total_size_bytes"}],"sql":"SELECT last_5_min_size_bytes,\n (SELECT COALESCE(sum(size),0) FROM pg_catalog.pg_ls_waldir()) AS total_size_bytes\n FROM (SELECT COALESCE(sum(size),0) AS last_5_min_size_bytes FROM pg_catalog.pg_ls_waldir() WHERE modification \u003e CURRENT_TIMESTAMP - '5 minutes'::interval) x;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Epoch time when stats were reset","metric_name":"ccp_pg_stat_statements_top_max_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"max_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time_ms\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , max_exec_time_ms\n , records\nFROM monitor ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Total time spent in the statement in milliseconds","metric_name":"ccp_pg_stat_statements_top_total_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"total_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time_ms\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , total_exec_time_ms\n , records\nFROM monitor ORDER BY 5 DESC LIMIT 
20;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Total amount of WAL generated by the statement in bytes","metric_name":"ccp_pg_stat_statements_top_wal_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"bytes","value_type":"double"},{"attribute_columns":["dbname","query","queryid","role"],"description":"Total number of WAL full page images generated by the statement","metric_name":"ccp_pg_stat_statements_top_wal_fpi","static_attributes":{"server":"localhost:5432"},"value_column":"fpi","value_type":"double"},{"attribute_columns":["dbname","query","queryid","role"],"description":"Total number of WAL records generated by the statement","metric_name":"ccp_pg_stat_statements_top_wal_records","static_attributes":{"server":"localhost:5432"},"value_column":"records","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time\n , s.mean_exec_time AS mean_exec_time\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , query\n , queryid\n , records\n , fpi\n , bytes\nFROM monitor ORDER BY bytes DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["repo"],"description":"Seconds since the last completed full or differential backup. Differential is always based off last full.","metric_name":"ccp_backrest_last_diff_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_diff_backup"},{"attribute_columns":["repo"],"description":"Seconds since the last completed full backup","metric_name":"ccp_backrest_last_full_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_full_backup"},{"attribute_columns":["repo"],"description":"Seconds since the last completed full, differential or incremental backup.\nIncremental is always based off last full or differential.\n","metric_name":"ccp_backrest_last_incr_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_incr_backup"},{"attribute_columns":["backup_type","repo"],"description":"pgBackRest version number when this backup was performed","metric_name":"ccp_backrest_last_info_backrest_repo_version","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_info_backrest_repo_version"},{"attribute_columns":["backup_type","repo"],"description":"An error has been encountered in the backup. 
Check logs for more information.","metric_name":"ccp_backrest_last_info_backup_error","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_info_backup_error"},{"attribute_columns":["backup_type","repo"],"description":"Total runtime in seconds of this backup","metric_name":"ccp_backrest_last_info_backup_runtime_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"backup_runtime_seconds"},{"attribute_columns":["backup_type","repo"],"description":"Actual size of only this individual backup in the pgbackrest repository","metric_name":"ccp_backrest_last_info_repo_backup_size_bytes","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"repo_backup_size_bytes"},{"attribute_columns":["backup_type","repo"],"description":"Total size of this backup in the pgbackrest repository, including all required previous backups and WAL","metric_name":"ccp_backrest_last_info_repo_total_size_bytes","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"repo_total_size_bytes"},{"attribute_columns":["repo"],"description":"Seconds since the oldest completed full backup","metric_name":"ccp_backrest_oldest_full_backup_time_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"oldest_full_backup"}],"sql":"SELECT * FROM get_pgbackrest_info();\n"}] diff --git a/internal/collector/pgbackrest_metrics.yaml b/internal/collector/pgbackrest_metrics.yaml deleted file mode 100644 index cb5dbba5f8..0000000000 --- a/internal/collector/pgbackrest_metrics.yaml +++ /dev/null @@ -1,169 +0,0 @@ - # FIXME: The repo key is obtained inelegantly. - # The query below runs pgbackrest info and parses the output. - # The --stanza argument matches DefaultStanzaName, defined in internal/pgbackrest/config.go. 
- - sql: | - DROP TABLE IF EXISTS pgbackrest_info; - CREATE TEMPORARY TABLE pgbackrest_info (data json); - - COPY pgbackrest_info (data) - FROM PROGRAM 'export LC_ALL=C && printf "\f" && pgbackrest info --log-level-console=info --log-level-stderr=warn --output=json --stanza=db && printf "\f"' - WITH (FORMAT csv, HEADER false, QUOTE E'\f'); - - WITH - all_backups (data) AS ( - SELECT jsonb_array_elements(to_jsonb(data)) FROM pgbackrest_info - ), - stanza_backups (stanza, backup) AS ( - SELECT data->>'name', jsonb_array_elements(data->'backup') FROM all_backups - ), - ordered_backups (stanza, backup, seq_oldest, seq_newest) AS ( - SELECT stanza, backup, - ROW_NUMBER() OVER ( - PARTITION BY stanza, backup->'database'->>'repo-key', backup->>'type' - ORDER BY backup->'timestamp'->>'start' ASC, backup->'timestamp'->>'stop' ASC - ), - ROW_NUMBER() OVER ( - PARTITION BY stanza, backup->'database'->>'repo-key', backup->>'type' - ORDER BY backup->'timestamp'->>'start' DESC, backup->'timestamp'->>'stop' DESC - ) - FROM stanza_backups - ), - - ccp_backrest_last_info AS ( - SELECT - stanza, - split_part(backup->'backrest'->>'version', '.', 1) || lpad(split_part(backup->'backrest'->>'version', '.', 2), 2, '0') || lpad(coalesce(nullif(split_part(backup->'backrest'->>'version', '.', 3), ''), '00'), 2, '0') AS backrest_repo_version, - backup->'database'->>'repo-key' AS repo, - backup->>'type' AS backup_type, - backup->'info'->'repository'->>'delta' AS repo_backup_size_bytes, - backup->'info'->'repository'->>'size' AS repo_total_size_bytes, - (backup->'timestamp'->>'stop')::bigint - (backup->'timestamp'->>'start')::bigint AS backup_runtime_seconds, - CASE WHEN backup->>'error' = 'true' THEN 1 ELSE 0 END AS backup_error - FROM ordered_backups - WHERE seq_newest = 1 - ), - - ccp_backrest_oldest_full_backup AS ( - SELECT - stanza, - backup->'database'->>'repo-key' AS repo, - min((backup->'timestamp'->>'stop')::bigint) AS time_seconds - FROM ordered_backups - WHERE seq_oldest = 1 AND backup->>'type' IN ('full') - GROUP BY 1,2 - ), - - ccp_backrest_last_full_backup AS ( - SELECT - stanza, - backup->'database'->>'repo-key' AS repo, - EXTRACT(EPOCH FROM CURRENT_TIMESTAMP)::bigint - max((backup->'timestamp'->>'stop')::bigint) AS time_since_completion_seconds - FROM ordered_backups - WHERE seq_newest = 1 AND backup->>'type' IN ('full') - GROUP BY 1,2 - ), - - ccp_backrest_last_diff_backup AS ( - SELECT - stanza, - backup->'database'->>'repo-key' AS repo, - EXTRACT(EPOCH FROM CURRENT_TIMESTAMP)::bigint - max((backup->'timestamp'->>'stop')::bigint) AS time_since_completion_seconds - FROM ordered_backups - WHERE seq_newest = 1 AND backup->>'type' IN ('full','diff') - GROUP BY 1,2 - ), - - ccp_backrest_last_incr_backup AS ( - SELECT - stanza, - backup->'database'->>'repo-key' AS repo, - EXTRACT(EPOCH FROM CURRENT_TIMESTAMP)::bigint - max((backup->'timestamp'->>'stop')::bigint) AS time_since_completion_seconds - FROM ordered_backups - WHERE seq_newest = 1 AND backup->>'type' IN ('full','diff','incr') - GROUP BY 1,2 - ) - - SELECT - ccp_backrest_last_diff_backup.time_since_completion_seconds as last_diff_backup, - ccp_backrest_last_full_backup.time_since_completion_seconds as last_full_backup, - ccp_backrest_last_incr_backup.time_since_completion_seconds as last_incr_backup, - ccp_backrest_last_info.backrest_repo_version as last_info_backrest_repo_version, - ccp_backrest_last_info.backup_error as last_info_backup_error, - ccp_backrest_last_info.backup_type as backup_type, - ccp_backrest_last_info.backup_runtime_seconds as 
backup_runtime_seconds, - ccp_backrest_last_info.repo_backup_size_bytes as repo_backup_size_bytes, - ccp_backrest_last_info.repo_total_size_bytes as repo_total_size_bytes, - ccp_backrest_oldest_full_backup.time_seconds as oldest_full_backup, - ccp_backrest_last_incr_backup.repo as repo - - FROM - ccp_backrest_last_diff_backup - , ccp_backrest_last_full_backup - , ccp_backrest_last_incr_backup - , ccp_backrest_last_info - , ccp_backrest_oldest_full_backup; - metrics: - - metric_name: ccp_backrest_last_diff_backup_time_since_completion_seconds - description: Seconds since the last completed full or differential backup. Differential is always based off last full. - value_column: last_diff_backup - attribute_columns: ["repo"] - static_attributes: - server: "localhost:5432" - stanza: "db" - - metric_name: ccp_backrest_last_full_backup_time_since_completion_seconds - description: Seconds since the last completed full backup - value_column: last_full_backup - attribute_columns: ["repo"] - static_attributes: - server: "localhost:5432" - stanza: "db" - - metric_name: ccp_backrest_last_incr_backup_time_since_completion_seconds - description: | - Seconds since the last completed full, differential or incremental backup. - Incremental is always based off last full or differential. - value_column: last_incr_backup - attribute_columns: ["repo"] - static_attributes: - server: "localhost:5432" - stanza: "db" - - metric_name: ccp_backrest_last_info_backrest_repo_version - description: pgBackRest version number when this backup was performed - value_column: last_info_backrest_repo_version - attribute_columns: ["backup_type", "repo"] - static_attributes: - server: "localhost:5432" - stanza: "db" - - metric_name: ccp_backrest_last_info_backup_error - description: An error has been encountered in the backup. Check logs for more information. 
- value_column: last_info_backup_error - attribute_columns: ["backup_type", "repo"] - static_attributes: - server: "localhost:5432" - stanza: "db" - - metric_name: ccp_backrest_last_info_backup_runtime_seconds - description: Total runtime in seconds of this backup - value_column: backup_runtime_seconds - attribute_columns: ["backup_type", "repo"] - static_attributes: - server: "localhost:5432" - stanza: "db" - - metric_name: ccp_backrest_last_info_repo_backup_size_bytes - description: Actual size of only this individual backup in the pgbackrest repository - value_column: repo_backup_size_bytes - attribute_columns: ["backup_type", "repo"] - static_attributes: - server: "localhost:5432" - stanza: "db" - - metric_name: ccp_backrest_last_info_repo_total_size_bytes - description: Total size of this backup in the pgbackrest repository, including all required previous backups and WAL - value_column: repo_total_size_bytes - attribute_columns: ["backup_type", "repo"] - static_attributes: - server: "localhost:5432" - stanza: "db" - - metric_name: ccp_backrest_oldest_full_backup_time_seconds - description: Seconds since the oldest completed full backup - value_column: oldest_full_backup - attribute_columns: ["repo"] - static_attributes: - server: "localhost:5432" diff --git a/internal/collector/postgres_5s_metrics.yaml b/internal/collector/postgres_5s_metrics.yaml index 3f09e8a8d2..835c9d4337 100644 --- a/internal/collector/postgres_5s_metrics.yaml +++ b/internal/collector/postgres_5s_metrics.yaml @@ -9,8 +9,8 @@ SELECT pg_database.datname, tmp.state, - tmp2.usename, - tmp2.application_name, + COALESCE(tmp2.usename, '') as usename, + COALESCE(tmp2.application_name, '') as application_name, COALESCE(count,0) as count, COALESCE(max_tx_duration,0) as max_tx_duration FROM @@ -626,10 +626,9 @@ static_attributes: server: "localhost:5432" - # ccp_replication_lag_size_bytes will return NULL on a replica + # get_replication_lag is created in metrics_setup.sql - sql: > - SELECT pg_wal_lsn_diff(sent_lsn, replay_lsn) AS bytes - FROM pg_catalog.pg_stat_replication; + SELECT * FROM get_replication_lag(); metrics: - metric_name: ccp_replication_lag_size_bytes value_column: bytes @@ -640,21 +639,24 @@ - sql: > SELECT + COALESCE( + CASE + WHEN (pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn()) OR (pg_is_in_recovery() = false) THEN 0 + ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER + END, + 0 + ) AS replay_time, + COALESCE( + CASE + WHEN pg_is_in_recovery() = false THEN 0 + ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER + END, + 0 + ) AS received_time, CASE - WHEN (pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn()) OR (pg_is_in_recovery() = false) THEN 0 - ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER - END - AS replay_time - , CASE - WHEN pg_is_in_recovery() = false THEN 0 - ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER - END - AS received_time - , CASE - WHEN pg_is_in_recovery() = true THEN 'replica' - ELSE 'primary' - END - AS role; + WHEN pg_is_in_recovery() = true THEN 'replica' + ELSE 'primary' + END AS role; metrics: - metric_name: ccp_replication_lag_received_time value_column: received_time @@ -877,3 +879,70 @@ static_attributes: server: "localhost:5432" + - sql: | + SELECT * FROM get_pgbackrest_info(); + metrics: + - metric_name: ccp_backrest_last_diff_backup_time_since_completion_seconds + description: Seconds since the last completed full 
or differential backup. Differential is always based off last full. + value_column: last_diff_backup + attribute_columns: ["repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_full_backup_time_since_completion_seconds + description: Seconds since the last completed full backup + value_column: last_full_backup + attribute_columns: ["repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_incr_backup_time_since_completion_seconds + description: | + Seconds since the last completed full, differential or incremental backup. + Incremental is always based off last full or differential. + value_column: last_incr_backup + attribute_columns: ["repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_info_backrest_repo_version + description: pgBackRest version number when this backup was performed + value_column: last_info_backrest_repo_version + attribute_columns: ["backup_type", "repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_info_backup_error + description: An error has been encountered in the backup. Check logs for more information. + value_column: last_info_backup_error + attribute_columns: ["backup_type", "repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_info_backup_runtime_seconds + description: Total runtime in seconds of this backup + value_column: backup_runtime_seconds + attribute_columns: ["backup_type", "repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_info_repo_backup_size_bytes + description: Actual size of only this individual backup in the pgbackrest repository + value_column: repo_backup_size_bytes + attribute_columns: ["backup_type", "repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_last_info_repo_total_size_bytes + description: Total size of this backup in the pgbackrest repository, including all required previous backups and WAL + value_column: repo_total_size_bytes + attribute_columns: ["backup_type", "repo"] + static_attributes: + server: "localhost:5432" + stanza: "db" + - metric_name: ccp_backrest_oldest_full_backup_time_seconds + description: Seconds since the oldest completed full backup + value_column: oldest_full_backup + attribute_columns: ["repo"] + static_attributes: + server: "localhost:5432" diff --git a/internal/collector/postgres_metrics.go b/internal/collector/postgres_metrics.go index d6902c2262..92aed244f5 100644 --- a/internal/collector/postgres_metrics.go +++ b/internal/collector/postgres_metrics.go @@ -2,7 +2,6 @@ // // SPDX-License-Identifier: Apache-2.0 -// TODO fold this back into postgres.go once the collector package stabilizes. 
package collector import ( @@ -20,14 +19,11 @@ import ( // https://pkg.go.dev/embed // //go:embed "generated/postgres_5s_metrics.json" -var defaultFiveSecondMetrics json.RawMessage +var fiveSecondMetrics json.RawMessage //go:embed "generated/postgres_5m_metrics.json" var fiveMinuteMetrics json.RawMessage -//go:embed "generated/pgbackrest_metrics.json" -var pgBackRestMetrics json.RawMessage - //go:embed "generated/gte_pg17_metrics.json" var gtePG17 json.RawMessage @@ -42,9 +38,6 @@ var ltPG16 json.RawMessage func EnablePostgresMetrics(ctx context.Context, inCluster *v1beta1.PostgresCluster, config *Config) { if feature.Enabled(ctx, feature.OpenTelemetryMetrics) { - var fiveSecondMetrics json.RawMessage - fiveSecondMetrics, _ = appendToJSONArray(defaultFiveSecondMetrics, pgBackRestMetrics) - if inCluster.Spec.PostgresVersion >= 17 { fiveSecondMetrics, _ = appendToJSONArray(fiveSecondMetrics, gtePG17) } else { diff --git a/internal/controller/postgrescluster/metrics_setup.sql b/internal/controller/postgrescluster/metrics_setup.sql index fb22a7b7fd..728de80c3e 100644 --- a/internal/controller/postgrescluster/metrics_setup.sql +++ b/internal/controller/postgrescluster/metrics_setup.sql @@ -70,3 +70,153 @@ $function$; GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA monitor TO ccp_monitoring; GRANT ALL ON ALL TABLES IN SCHEMA monitor TO ccp_monitoring; + +--- get_replication_lag is used by the OTel collector. +--- get_replication_lag is created as a function so that we can query it without a warning on a replica. +CREATE OR REPLACE FUNCTION get_replication_lag() RETURNS TABLE(bytes NUMERIC) AS $$ +BEGIN + IF pg_is_in_recovery() THEN + RETURN QUERY SELECT 0::NUMERIC AS bytes; + ELSE + RETURN QUERY SELECT pg_wal_lsn_diff(sent_lsn, replay_lsn) AS bytes + FROM pg_catalog.pg_stat_replication; + END IF; +END; +$$ LANGUAGE plpgsql; + +--- get_pgbackrest_info is used by the OTel collector. +--- get_pgbackrest_info is created as a function so that no DDL runs on a replica. +--- In the query, the --stanza argument matches DefaultStanzaName, defined in internal/pgbackrest/config.go.
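+--- Illustrative note (editorial sketch, not part of the original query; example values only):
+--- "pgbackrest info --output=json" emits an array of stanza objects, each carrying a "backup" array.
+--- The CTEs below read paths such as backup->'database'->>'repo-key', backup->'timestamp'->>'stop',
+--- and backup->'info'->'repository'->>'size', which assumes output roughly shaped like:
+---   [{"name": "db",
+---     "backup": [{"type": "full", "error": false,
+---                 "backrest": {"version": "2.54.1"},
+---                 "database": {"repo-key": 1},
+---                 "timestamp": {"start": 1739900000, "stop": 1739900100},
+---                 "info": {"repository": {"delta": 12345, "size": 67890}}}]}]
+--- The COPY ... FROM PROGRAM invocation wraps that output in form-feed characters and sets QUOTE E'\f',
+--- so the CSV parser treats the entire multi-line JSON document as a single quoted value in one row.
+--- ccp_backrest_last_info then flattens the version string into a numeric-style metric value,
+--- e.g. '2.54.1' becomes '25401' and '2.54' becomes '25400'.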
+CREATE OR REPLACE FUNCTION get_pgbackrest_info() +RETURNS TABLE ( + last_diff_backup BIGINT, + last_full_backup BIGINT, + last_incr_backup BIGINT, + last_info_backrest_repo_version TEXT, + last_info_backup_error INT, + backup_type TEXT, + backup_runtime_seconds BIGINT, + repo_backup_size_bytes TEXT, + repo_total_size_bytes TEXT, + oldest_full_backup BIGINT, + repo TEXT +) AS $$ +BEGIN + IF pg_is_in_recovery() THEN + RETURN QUERY + SELECT + 0::bigint AS last_diff_backup, + 0::bigint AS last_full_backup, + 0::bigint AS last_incr_backup, + '0' AS last_info_backrest_repo_version, + 0::int AS last_info_backup_error, + 'n/a'::text AS backup_type, + 0::bigint AS backup_runtime_seconds, + '0'::text AS repo_backup_size_bytes, + '0'::text AS repo_total_size_bytes, + 0::bigint AS oldest_full_backup, + 'n/a' AS repo; + ELSE + DROP TABLE IF EXISTS pgbackrest_info; + CREATE TEMPORARY TABLE pgbackrest_info (data json); + COPY pgbackrest_info (data) + FROM PROGRAM 'export LC_ALL=C && printf "\f" && pgbackrest info --log-level-console=info --log-level-stderr=warn --output=json --stanza=db && printf "\f"' + WITH (FORMAT csv, HEADER false, QUOTE E'\f'); + + RETURN QUERY + WITH + all_backups (data) AS ( + SELECT jsonb_array_elements(to_jsonb(data)) FROM pgbackrest_info + ), + stanza_backups (stanza, backup) AS ( + SELECT data->>'name', jsonb_array_elements(data->'backup') FROM all_backups + ), + ordered_backups (stanza, backup, seq_oldest, seq_newest) AS ( + SELECT stanza, backup, + ROW_NUMBER() OVER ( + PARTITION BY stanza, backup->'database'->>'repo-key', backup->>'type' + ORDER BY backup->'timestamp'->>'start' ASC, backup->'timestamp'->>'stop' ASC + ), + ROW_NUMBER() OVER ( + PARTITION BY stanza, backup->'database'->>'repo-key', backup->>'type' + ORDER BY backup->'timestamp'->>'start' DESC, backup->'timestamp'->>'stop' DESC + ) + FROM stanza_backups + ), + + ccp_backrest_last_info AS ( + SELECT + stanza, + split_part(backup->'backrest'->>'version', '.', 1) || lpad(split_part(backup->'backrest'->>'version', '.', 2), 2, '0') || lpad(coalesce(nullif(split_part(backup->'backrest'->>'version', '.', 3), ''), '00'), 2, '0') AS backrest_repo_version, + backup->'database'->>'repo-key' AS repo, + backup->>'type' AS backup_type, + backup->'info'->'repository'->>'delta' AS repo_backup_size_bytes, + backup->'info'->'repository'->>'size' AS repo_total_size_bytes, + (backup->'timestamp'->>'stop')::bigint - (backup->'timestamp'->>'start')::bigint AS backup_runtime_seconds, + CASE WHEN backup->>'error' = 'true' THEN 1 ELSE 0 END AS backup_error + FROM ordered_backups + WHERE seq_newest = 1 + ), + + ccp_backrest_oldest_full_backup AS ( + SELECT + stanza, + backup->'database'->>'repo-key' AS repo, + min((backup->'timestamp'->>'stop')::bigint) AS time_seconds + FROM ordered_backups + WHERE seq_oldest = 1 AND backup->>'type' IN ('full') + GROUP BY 1,2 + ), + + ccp_backrest_last_full_backup AS ( + SELECT + stanza, + backup->'database'->>'repo-key' AS repo, + EXTRACT(EPOCH FROM CURRENT_TIMESTAMP)::bigint - max((backup->'timestamp'->>'stop')::bigint) AS time_since_completion_seconds + FROM ordered_backups + WHERE seq_newest = 1 AND backup->>'type' IN ('full') + GROUP BY 1,2 + ), + + ccp_backrest_last_diff_backup AS ( + SELECT + stanza, + backup->'database'->>'repo-key' AS repo, + EXTRACT(EPOCH FROM CURRENT_TIMESTAMP)::bigint - max((backup->'timestamp'->>'stop')::bigint) AS time_since_completion_seconds + FROM ordered_backups + WHERE seq_newest = 1 AND backup->>'type' IN ('full','diff') + GROUP BY 1,2 + ), + + 
ccp_backrest_last_incr_backup AS ( + SELECT + stanza, + backup->'database'->>'repo-key' AS repo, + EXTRACT(EPOCH FROM CURRENT_TIMESTAMP)::bigint - max((backup->'timestamp'->>'stop')::bigint) AS time_since_completion_seconds + FROM ordered_backups + WHERE seq_newest = 1 AND backup->>'type' IN ('full','diff','incr') + GROUP BY 1,2 + ) + + SELECT + ccp_backrest_last_diff_backup.time_since_completion_seconds, + ccp_backrest_last_full_backup.time_since_completion_seconds, + ccp_backrest_last_incr_backup.time_since_completion_seconds, + ccp_backrest_last_info.backrest_repo_version, + ccp_backrest_last_info.backup_error, + ccp_backrest_last_info.backup_type, + ccp_backrest_last_info.backup_runtime_seconds, + ccp_backrest_last_info.repo_backup_size_bytes, + ccp_backrest_last_info.repo_total_size_bytes, + ccp_backrest_oldest_full_backup.time_seconds, + ccp_backrest_last_incr_backup.repo + FROM + ccp_backrest_last_diff_backup + JOIN ccp_backrest_last_full_backup ON ccp_backrest_last_diff_backup.stanza = ccp_backrest_last_full_backup.stanza AND ccp_backrest_last_diff_backup.repo = ccp_backrest_last_full_backup.repo + JOIN ccp_backrest_last_incr_backup ON ccp_backrest_last_diff_backup.stanza = ccp_backrest_last_incr_backup.stanza AND ccp_backrest_last_diff_backup.repo = ccp_backrest_last_incr_backup.repo + JOIN ccp_backrest_last_info ON ccp_backrest_last_diff_backup.stanza = ccp_backrest_last_info.stanza AND ccp_backrest_last_diff_backup.repo = ccp_backrest_last_info.repo + JOIN ccp_backrest_oldest_full_backup ON ccp_backrest_last_diff_backup.stanza = ccp_backrest_oldest_full_backup.stanza AND ccp_backrest_last_diff_backup.repo = ccp_backrest_oldest_full_backup.repo; + END IF; +END; +$$ LANGUAGE plpgsql; + From 86ad298f8358038678869ca90317f223b762f434 Mon Sep 17 00:00:00 2001 From: tony-landreth Date: Fri, 21 Feb 2025 09:58:04 -0500 Subject: [PATCH 10/22] Handles NULL for query on replica --- internal/collector/generated/postgres_5s_metrics.json | 2 +- internal/collector/postgres_5s_metrics.yaml | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/internal/collector/generated/postgres_5s_metrics.json b/internal/collector/generated/postgres_5s_metrics.json index 5990f1e743..09ea77846b 100644 --- a/internal/collector/generated/postgres_5s_metrics.json +++ b/internal/collector/generated/postgres_5s_metrics.json @@ -1 +1 @@ -[{"metrics":[{"attribute_columns":["application_name","datname","state","usename"],"description":"number of connections in this state","metric_name":"ccp_pg_stat_activity_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT\n pg_database.datname,\n tmp.state,\n COALESCE(tmp2.usename, '') as usename,\n COALESCE(tmp2.application_name, '') as application_name,\n COALESCE(count,0) as count,\n COALESCE(max_tx_duration,0) as max_tx_duration\nFROM\n (\n VALUES ('active'),\n ('idle'),\n ('idle in transaction'),\n ('idle in transaction (aborted)'),\n ('fastpath function call'),\n ('disabled')\n ) AS tmp(state) CROSS JOIN pg_database\nLEFT JOIN (\n SELECT\n datname,\n state,\n usename,\n application_name,\n count(*) AS count,\n MAX(EXTRACT(EPOCH FROM now() - xact_start))::float AS max_tx_duration\n FROM pg_stat_activity GROUP BY datname,state,usename,application_name) AS tmp2\n ON tmp.state = tmp2.state AND pg_database.datname = tmp2.datname;\n"},{"metrics":[{"description":"Seconds since the last successful archive 
operation","metric_name":"ccp_archive_command_status_seconds_since_last_archive","static_attributes":{"server":"localhost:5432"},"value_column":"seconds_since_last_archive","value_type":"double"}],"sql":"SELECT EXTRACT(epoch from (CURRENT_TIMESTAMP - last_archived_time)) AS seconds_since_last_archive FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Number of WAL files that have been successfully archived","metric_name":"ccp_archive_command_status_archived_count","static_attributes":{"server":"localhost:5432"},"value_column":"archived_count"}],"sql":"SELECT archived_count FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Number of failed attempts for archiving WAL files","metric_name":"ccp_archive_command_status_failed_count","static_attributes":{"server":"localhost:5432"},"value_column":"failed_count"}],"sql":"SELECT failed_count FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Seconds since the last recorded failure of the archive_command","metric_name":"ccp_archive_command_status_seconds_since_last_fail","static_attributes":{"server":"localhost:5432"},"value_column":"seconds_since_last_fail"}],"sql":"SELECT CASE\n WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) IS NULL THEN 0\n WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) \u003c 0 THEN 0\n ELSE EXTRACT(epoch from (last_failed_time - last_archived_time))\n END AS seconds_since_last_fail\nFROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Total non-idle connections","metric_name":"ccp_connection_stats_active","static_attributes":{"server":"localhost:5432"},"value_column":"active"},{"description":"Total idle connections","metric_name":"ccp_connection_stats_idle","static_attributes":{"server":"localhost:5432"},"value_column":"idle"},{"description":"Total idle in transaction connections","metric_name":"ccp_connection_stats_idle_in_txn","static_attributes":{"server":"localhost:5432"},"value_column":"idle_in_txn"},{"description":"Value of max_connections for the monitored database","metric_name":"ccp_connection_stats_max_blocked_query_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_blocked_query_time","value_type":"double"},{"description":"Value of max_connections for the monitored database","metric_name":"ccp_connection_stats_max_connections","static_attributes":{"server":"localhost:5432"},"value_column":"max_connections"},{"description":"Length of time in seconds of the longest idle in transaction session","metric_name":"ccp_connection_stats_max_idle_in_txn_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_idle_in_txn_time","value_type":"double"},{"description":"Length of time in seconds of the longest running query","metric_name":"ccp_connection_stats_max_query_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_query_time","value_type":"double"},{"description":"Total idle and non-idle connections","metric_name":"ccp_connection_stats_total","static_attributes":{"server":"localhost:5432"},"value_column":"total"}],"sql":"SELECT ((total - idle) - idle_in_txn) as active\n , total\n , idle\n , idle_in_txn\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - state_change))),0) FROM pg_catalog.pg_stat_activity WHERE state = 'idle in transaction') AS max_idle_in_txn_time\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND state \u003c\u003e 'idle' ) AS 
max_query_time\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND wait_event_type = 'Lock' ) AS max_blocked_query_time\n , max_connections\n FROM (\n SELECT COUNT(*) as total\n , COALESCE(SUM(CASE WHEN state = 'idle' THEN 1 ELSE 0 END),0) AS idle\n , COALESCE(SUM(CASE WHEN state = 'idle in transaction' THEN 1 ELSE 0 END),0) AS idle_in_txn FROM pg_catalog.pg_stat_activity) x\n JOIN (SELECT setting::float AS max_connections FROM pg_settings WHERE name = 'max_connections') xx ON (true);\n"},{"metrics":[{"attribute_columns":["dbname"],"description":"Total number of checksum failures on this database","metric_name":"ccp_data_checksum_failure_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"},{"attribute_columns":["dbname"],"description":"Time interval in seconds since the last checksum failure was encountered","metric_name":"ccp_data_checksum_failure_time_since_last_failure_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"time_since_last_failure_seconds","value_type":"double"}],"sql":"SELECT datname AS dbname , checksum_failures AS count , coalesce(extract(epoch from (clock_timestamp() - checksum_last_failure)), 0) AS time_since_last_failure_seconds FROM pg_catalog.pg_stat_database WHERE pg_stat_database.datname IS NOT NULL;\n"},{"metrics":[{"attribute_columns":["dbname","mode"],"description":"Return value of 1 means database is in recovery. Otherwise 2 it is a primary.","metric_name":"ccp_locks_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT pg_database.datname as dbname , tmp.mode , COALESCE(count,0) as count FROM (\n VALUES ('accesssharelock'),\n ('rowsharelock'),\n ('rowexclusivelock'),\n ('shareupdateexclusivelock'),\n ('sharelock'),\n ('sharerowexclusivelock'),\n ('exclusivelock'),\n ('accessexclusivelock')\n) AS tmp(mode) CROSS JOIN pg_catalog.pg_database LEFT JOIN\n (SELECT database, lower(mode) AS mode,count(*) AS count\n FROM pg_catalog.pg_locks WHERE database IS NOT NULL\n GROUP BY database, lower(mode)\n) AS tmp2 ON tmp.mode=tmp2.mode and pg_database.oid = tmp2.database;\n"},{"metrics":[{"description":"CPU limit value in milli cores","metric_name":"ccp_nodemx_cpu_limit","static_attributes":{"server":"localhost:5432"},"value_column":"limit"},{"description":"CPU request value in milli cores","metric_name":"ccp_nodemx_cpu_request","static_attributes":{"server":"localhost:5432"},"value_column":"request"}],"sql":"SELECT monitor.kdapi_scalar_bigint('cpu_request') AS request , monitor.kdapi_scalar_bigint('cpu_limit') AS limit\n"},{"metrics":[{"description":"CPU usage in nanoseconds","metric_name":"ccp_nodemx_cpuacct_usage","static_attributes":{"server":"localhost:5432"},"value_column":"usage","value_type":"double"},{"description":"CPU usage snapshot timestamp","metric_name":"ccp_nodemx_cpuacct_usage_ts","static_attributes":{"server":"localhost:5432"},"value_column":"usage_ts","value_type":"double"}],"sql":"SELECT CASE WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('cpuacct.usage')\n ELSE (SELECT val FROM monitor.cgroup_setof_kv('cpu.stat') where key = 'usage_usec') * 1000\n END AS usage,\n extract(epoch from clock_timestamp()) AS usage_ts;\n"},{"metrics":[{"description":"The total available run-time within a period (in 
microseconds)","metric_name":"ccp_nodemx_cpucfs_period_us","static_attributes":{"server":"localhost:5432"},"value_column":"period_us"},{"description":"The length of a period (in microseconds)","metric_name":"ccp_nodemx_cpucfs_quota_us","static_attributes":{"server":"localhost:5432"},"value_column":"quota_us","value_type":"double"}],"sql":"SELECT\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n monitor.cgroup_scalar_bigint('cpu.cfs_period_us')\n ELSE\n (monitor.cgroup_array_bigint('cpu.max'))[2]\n END AS period_us,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n GREATEST(monitor.cgroup_scalar_bigint('cpu.cfs_quota_us'), 0)\n ELSE\n GREATEST((monitor.cgroup_array_bigint('cpu.max'))[1], 0)\n END AS quota_us;\n"},{"metrics":[{"description":"Number of periods that any thread was runnable","metric_name":"ccp_nodemx_cpustat_nr_periods","static_attributes":{"server":"localhost:5432"},"value_column":"nr_periods","value_type":"double"},{"description":"Number of runnable periods in which the application used its entire quota and was throttled","metric_name":"ccp_nodemx_cpustat_nr_throttled","static_attributes":{"server":"localhost:5432"},"value_column":"nr_throttled"},{"description":"CPU stat snapshot timestamp","metric_name":"ccp_nodemx_cpustat_snap_ts","static_attributes":{"server":"localhost:5432"},"value_column":"snap_ts","value_type":"double"},{"description":"Sum total amount of time individual threads within the monitor.cgroup were throttled","metric_name":"ccp_nodemx_cpustat_throttled_time","static_attributes":{"server":"localhost:5432"},"value_column":"throttled_time","value_type":"double"}],"sql":"WITH d(key, val) AS (select key, val from monitor.cgroup_setof_kv('cpu.stat')) SELECT\n (SELECT val FROM d WHERE key='nr_periods') AS nr_periods,\n (SELECT val FROM d WHERE key='nr_throttled') AS nr_throttled,\n (SELECT val FROM d WHERE key='throttled_usec') AS throttled_time,\n extract(epoch from clock_timestamp()) as snap_ts;\n"},{"metrics":[{"attribute_columns":["fs_type","mount_point"],"description":"Available size in bytes","metric_name":"ccp_nodemx_data_disk_available_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"available_bytes","value_type":"double"},{"attribute_columns":["fs_type","mount_point"],"description":"Available file nodes","metric_name":"ccp_nodemx_data_disk_free_file_nodes","static_attributes":{"server":"localhost:5432"},"value_column":"free_file_nodes"},{"attribute_columns":["fs_type","mount_point"],"description":"Size in bytes","metric_name":"ccp_nodemx_data_disk_total_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"total_bytes"},{"attribute_columns":["fs_type","mount_point"],"description":"Total file nodes","metric_name":"ccp_nodemx_data_disk_total_file_nodes","static_attributes":{"server":"localhost:5432"},"value_column":"total_file_nodes"}],"sql":"SELECT mount_point,fs_type,total_bytes,available_bytes,total_file_nodes,free_file_nodes\n FROM monitor.proc_mountinfo() m\n JOIN monitor.fsinfo(m.mount_point) f USING (major_number, minor_number)\n WHERE m.mount_point IN ('/pgdata', '/pgwal') OR\n m.mount_point like '/tablespaces/%'\n"},{"metrics":[{"attribute_columns":["mount_point"],"description":"Total sectors read","metric_name":"ccp_nodemx_disk_activity_sectors_read","static_attributes":{"server":"localhost:5432"},"value_column":"sectors_read"},{"attribute_columns":["mount_point"],"description":"Total sectors 
written","metric_name":"ccp_nodemx_disk_activity_sectors_written","static_attributes":{"server":"localhost:5432"},"value_column":"sectors_written"}],"sql":"SELECT mount_point,sectors_read,sectors_written\n FROM monitor.proc_mountinfo() m\n JOIN monitor.proc_diskstats() d USING (major_number, minor_number)\n WHERE m.mount_point IN ('/pgdata', '/pgwal') OR\n m.mount_point like '/tablespaces/%';\n"},{"metrics":[{"description":"Total bytes of anonymous and swap cache memory on active LRU list","metric_name":"ccp_nodemx_mem_active_anon","static_attributes":{"server":"localhost:5432"},"value_column":"active_anon","value_type":"double"},{"description":"Total bytes of file-backed memory on active LRU list","metric_name":"ccp_nodemx_mem_active_file","static_attributes":{"server":"localhost:5432"},"value_column":"active_file","value_type":"double"},{"description":"Total bytes of page cache memory","metric_name":"ccp_nodemx_mem_cache","static_attributes":{"server":"localhost:5432"},"value_column":"cache","value_type":"double"},{"description":"Total bytes that are waiting to get written back to the disk","metric_name":"ccp_nodemx_mem_dirty","static_attributes":{"server":"localhost:5432"},"value_column":"dirty"},{"description":"Total bytes of anonymous and swap cache memory on inactive LRU list","metric_name":"ccp_nodemx_mem_inactive_anon","static_attributes":{"server":"localhost:5432"},"value_column":"inactive_anon","value_type":"double"},{"description":"Total bytes of file-backed memory on inactive LRU list","metric_name":"ccp_nodemx_mem_inactive_file","static_attributes":{"server":"localhost:5432"},"value_column":"inactive_file","value_type":"double"},{"description":"Unknown metric from ccp_nodemx_mem","metric_name":"ccp_nodemx_mem_kmem_usage_in_byte","static_attributes":{"server":"localhost:5432"},"value_column":"kmem_usage_in_byte"},{"description":"Memory limit value in bytes","metric_name":"ccp_nodemx_mem_limit","static_attributes":{"server":"localhost:5432"},"value_column":"limit"},{"description":"Total bytes of mapped file (includes tmpfs/shmem)","metric_name":"ccp_nodemx_mem_mapped_file","static_attributes":{"server":"localhost:5432"},"value_column":"mapped_file"},{"description":"Memory request value in bytes","metric_name":"ccp_nodemx_mem_request","static_attributes":{"server":"localhost:5432"},"value_column":"request"},{"description":"Total bytes of anonymous and swap cache memory","metric_name":"ccp_nodemx_mem_rss","static_attributes":{"server":"localhost:5432"},"value_column":"rss","value_type":"double"},{"description":"Total bytes of shared memory","metric_name":"ccp_nodemx_mem_shmem","static_attributes":{"server":"localhost:5432"},"value_column":"shmem","value_type":"double"},{"description":"Total usage in bytes","metric_name":"ccp_nodemx_mem_usage_in_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"usage_in_bytes"}],"sql":"WITH d(key, val) as (SELECT key, val FROM monitor.cgroup_setof_kv('memory.stat')) SELECT\n monitor.kdapi_scalar_bigint('mem_request') AS request,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n (CASE WHEN monitor.cgroup_scalar_bigint('memory.limit_in_bytes') = 9223372036854771712 THEN 0 ELSE monitor.cgroup_scalar_bigint('memory.limit_in_bytes') END)\n ELSE\n (CASE WHEN monitor.cgroup_scalar_bigint('memory.max') = 9223372036854775807 THEN 0 ELSE monitor.cgroup_scalar_bigint('memory.max') END)\n END AS limit,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='cache')\n ELSE 0\n END as cache,\n CASE\n WHEN 
monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='rss')\n ELSE 0\n END as RSS,\n (SELECT val FROM d WHERE key='shmem') as shmem,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='mapped_file')\n ELSE 0\n END as mapped_file,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='dirty')\n ELSE (SELECT val FROM d WHERE key='file_dirty')\n END as dirty,\n (SELECT val FROM d WHERE key='active_anon') as active_anon,\n (SELECT val FROM d WHERE key='inactive_anon') as inactive_anon,\n (SELECT val FROM d WHERE key='active_file') as active_file,\n (SELECT val FROM d WHERE key='inactive_file') as inactive_file,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('memory.usage_in_bytes')\n ELSE monitor.cgroup_scalar_bigint('memory.current')\n END as usage_in_bytes,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('memory.kmem.usage_in_bytes')\n ELSE 0\n END as kmem_usage_in_byte;\n"},{"metrics":[{"attribute_columns":["interface"],"description":"Number of bytes received","metric_name":"ccp_nodemx_network_rx_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"rx_bytes"},{"attribute_columns":["interface"],"description":"Number of packets received","metric_name":"ccp_nodemx_network_rx_packets","static_attributes":{"server":"localhost:5432"},"value_column":"rx_packets"},{"attribute_columns":["interface"],"description":"Number of bytes transmitted","metric_name":"ccp_nodemx_network_tx_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"tx_bytes"},{"attribute_columns":["interface"],"description":"Number of packets transmitted","metric_name":"ccp_nodemx_network_tx_packets","static_attributes":{"server":"localhost:5432"},"value_column":"tx_packets"}],"sql":"SELECT interface\n ,tx_bytes\n ,tx_packets\n ,rx_bytes\n ,rx_packets from monitor.proc_network_stats()\n"},{"metrics":[{"description":"Total number of database processes","metric_name":"ccp_nodemx_process_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT monitor.cgroup_process_count() as count;\n"},{"metrics":[{"description":"Epoch time when stats were reset","metric_name":"ccp_pg_stat_statements_reset_time","static_attributes":{"server":"localhost:5432"},"value_column":"time"}],"sql":"SELECT monitor.pg_stat_statements_reset_info(-1) as time;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Average query runtime in milliseconds","metric_name":"ccp_pg_stat_statements_top_mean_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"top_mean_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time\n , s.mean_exec_time AS mean_exec_time\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , max(monitor.mean_exec_time) AS top_mean_exec_time_ms\nFROM monitor GROUP BY 1,2,3,4 ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","role"],"description":"Total number of queries run per 
user/database","metric_name":"ccp_pg_stat_statements_total_calls_count","static_attributes":{"server":"localhost:5432"},"value_column":"calls_count","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total runtime of all queries per user/database","metric_name":"ccp_pg_stat_statements_total_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"exec_time_ms","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total runtime of all queries per user/database","metric_name":"ccp_pg_stat_statements_total_mean_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"mean_exec_time_ms","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total rows returned from all queries per user/database","metric_name":"ccp_pg_stat_statements_total_row_count","static_attributes":{"server":"localhost:5432"},"value_column":"row_count","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.calls\n , s.total_exec_time\n , s.mean_exec_time\n , s.rows\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , sum(calls) AS calls_count\n , sum(total_exec_time) AS exec_time_ms\n , avg(mean_exec_time) AS mean_exec_time_ms\n , sum(rows) AS row_count\nFROM monitor GROUP BY 1,2;\n"},{"metrics":[{"description":"The current version of PostgreSQL that this exporter is running on as a 6 digit integer (######).","metric_name":"ccp_postgresql_version_current","static_attributes":{"server":"localhost:5432"},"value_column":"current"}],"sql":"SELECT current_setting('server_version_num')::int AS current;\n"},{"metrics":[{"description":"Time interval in seconds since PostgreSQL database was last restarted.","metric_name":"ccp_postmaster_uptime_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"seconds","value_type":"double"}],"sql":"SELECT extract(epoch from (clock_timestamp() - pg_postmaster_start_time() )) AS seconds;\n"},{"metrics":[{"description":"Time interval in seconds since PostgreSQL database was last restarted.","metric_name":"ccp_replication_lag_size_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"bytes","value_type":"double"}],"sql":"SELECT * FROM get_replication_lag();\n"},{"metrics":[{"attribute_columns":["role"],"description":"Length of time since the last WAL file was received and replayed on replica.\nAlways increases, possibly causing false positives if the primary stops writing.\nMonitors for replicas that stop receiving WAL all together.\n","metric_name":"ccp_replication_lag_received_time","static_attributes":{"server":"localhost:5432"},"value_column":"received_time","value_type":"double"},{"attribute_columns":["role"],"description":"Length of time since the last transaction was replayed on replica.\nReturns zero if last WAL received equals last WAL replayed. Avoids\nfalse positives when primary stops writing. 
Monitors for replicas that\ncannot keep up with primary WAL generation.\n","metric_name":"ccp_replication_lag_replay_time","static_attributes":{"server":"localhost:5432"},"value_column":"replay_time","value_type":"double"}],"sql":"SELECT\n COALESCE(\n CASE\n WHEN (pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn()) OR (pg_is_in_recovery() = false) THEN 0\n ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER\n END,\n 0\n ) AS replay_time,\n COALESCE(\n CASE\n WHEN pg_is_in_recovery() = false THEN 0\n ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER\n END,\n 0\n ) AS received_time,\n CASE\n WHEN pg_is_in_recovery() = true THEN 'replica'\n ELSE 'primary'\n END AS role;\n"},{"metrics":[{"description":"Number of settings from pg_settings catalog in a pending_restart state","metric_name":"ccp_settings_pending_restart_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT count(*) AS count FROM pg_catalog.pg_settings WHERE pending_restart = true;\n"},{"metrics":[{"description":"Number of buffers allocated","metric_name":"ccp_stat_bgwriter_buffers_alloc","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_alloc"},{"data_type":"sum","description":"Number of buffers written by the background writer","metric_name":"ccp_stat_bgwriter_buffers_clean","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_clean"},{"description":"Number of times the background writer stopped a cleaning scan because it had written too many buffers","metric_name":"ccp_stat_bgwriter_maxwritten_clean","static_attributes":{"server":"localhost:5432"},"value_column":"maxwritten_clean"}],"sql":"SELECT\n buffers_clean\n , maxwritten_clean\n , buffers_alloc\nFROM pg_catalog.pg_stat_bgwriter;\n"},{"metrics":[{"description":"Oldest current transaction ID in cluster","metric_name":"ccp_transaction_wraparound_oldest_current_xid","static_attributes":{"server":"localhost:5432"},"value_column":"oldest_current_xid"},{"description":"Percentage towards emergency autovacuum process starting","metric_name":"ccp_transaction_wraparound_percent_towards_emergency_autovac","static_attributes":{"server":"localhost:5432"},"value_column":"percent_towards_emergency_autovac"},{"description":"Percentage towards transaction ID wraparound","metric_name":"ccp_transaction_wraparound_percent_towards_wraparound","static_attributes":{"server":"localhost:5432"},"value_column":"percent_towards_wraparound"}],"sql":"WITH max_age AS (\n SELECT 2000000000 as max_old_xid\n , setting AS autovacuum_freeze_max_age\n FROM pg_catalog.pg_settings\n WHERE name = 'autovacuum_freeze_max_age')\n, per_database_stats AS (\n SELECT datname\n , m.max_old_xid::int\n , m.autovacuum_freeze_max_age::int\n , age(d.datfrozenxid) AS oldest_current_xid\n FROM pg_catalog.pg_database d\n JOIN max_age m ON (true)\n WHERE d.datallowconn)\nSELECT max(oldest_current_xid) AS oldest_current_xid , max(ROUND(100*(oldest_current_xid/max_old_xid::float))) AS percent_towards_wraparound , max(ROUND(100*(oldest_current_xid/autovacuum_freeze_max_age::float))) AS percent_towards_emergency_autovac FROM per_database_stats;\n"},{"metrics":[{"description":"Current size in bytes of the WAL directory","metric_name":"ccp_wal_activity_total_size_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"total_size_bytes"}],"sql":"SELECT last_5_min_size_bytes,\n (SELECT COALESCE(sum(size),0) FROM pg_catalog.pg_ls_waldir()) AS total_size_bytes\n FROM 
(SELECT COALESCE(sum(size),0) AS last_5_min_size_bytes FROM pg_catalog.pg_ls_waldir() WHERE modification \u003e CURRENT_TIMESTAMP - '5 minutes'::interval) x;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Epoch time when stats were reset","metric_name":"ccp_pg_stat_statements_top_max_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"max_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time_ms\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , max_exec_time_ms\n , records\nFROM monitor ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Total time spent in the statement in milliseconds","metric_name":"ccp_pg_stat_statements_top_total_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"total_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time_ms\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , total_exec_time_ms\n , records\nFROM monitor ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Total amount of WAL generated by the statement in bytes","metric_name":"ccp_pg_stat_statements_top_wal_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"bytes","value_type":"double"},{"attribute_columns":["dbname","query","queryid","role"],"description":"Total number of WAL full page images generated by the statement","metric_name":"ccp_pg_stat_statements_top_wal_fpi","static_attributes":{"server":"localhost:5432"},"value_column":"fpi","value_type":"double"},{"attribute_columns":["dbname","query","queryid","role"],"description":"Total number of WAL records generated by the statement","metric_name":"ccp_pg_stat_statements_top_wal_records","static_attributes":{"server":"localhost:5432"},"value_column":"records","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time\n , s.mean_exec_time AS mean_exec_time\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , query\n , queryid\n , records\n , fpi\n , bytes\nFROM monitor ORDER BY bytes DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["repo"],"description":"Seconds since the last completed full or differential backup. 
Differential is always based off last full.","metric_name":"ccp_backrest_last_diff_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_diff_backup"},{"attribute_columns":["repo"],"description":"Seconds since the last completed full backup","metric_name":"ccp_backrest_last_full_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_full_backup"},{"attribute_columns":["repo"],"description":"Seconds since the last completed full, differential or incremental backup.\nIncremental is always based off last full or differential.\n","metric_name":"ccp_backrest_last_incr_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_incr_backup"},{"attribute_columns":["backup_type","repo"],"description":"pgBackRest version number when this backup was performed","metric_name":"ccp_backrest_last_info_backrest_repo_version","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_info_backrest_repo_version"},{"attribute_columns":["backup_type","repo"],"description":"An error has been encountered in the backup. Check logs for more information.","metric_name":"ccp_backrest_last_info_backup_error","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_info_backup_error"},{"attribute_columns":["backup_type","repo"],"description":"Total runtime in seconds of this backup","metric_name":"ccp_backrest_last_info_backup_runtime_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"backup_runtime_seconds"},{"attribute_columns":["backup_type","repo"],"description":"Actual size of only this individual backup in the pgbackrest repository","metric_name":"ccp_backrest_last_info_repo_backup_size_bytes","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"repo_backup_size_bytes"},{"attribute_columns":["backup_type","repo"],"description":"Total size of this backup in the pgbackrest repository, including all required previous backups and WAL","metric_name":"ccp_backrest_last_info_repo_total_size_bytes","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"repo_total_size_bytes"},{"attribute_columns":["repo"],"description":"Seconds since the oldest completed full backup","metric_name":"ccp_backrest_oldest_full_backup_time_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"oldest_full_backup"}],"sql":"SELECT * FROM get_pgbackrest_info();\n"}] +[{"metrics":[{"attribute_columns":["application_name","datname","state","usename"],"description":"number of connections in this state","metric_name":"ccp_pg_stat_activity_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT\n pg_database.datname,\n tmp.state,\n COALESCE(tmp2.usename, '') as usename,\n COALESCE(tmp2.application_name, '') as application_name,\n COALESCE(count,0) as count,\n COALESCE(max_tx_duration,0) as max_tx_duration\nFROM\n (\n VALUES ('active'),\n ('idle'),\n ('idle in transaction'),\n ('idle in transaction (aborted)'),\n ('fastpath function call'),\n ('disabled')\n ) AS tmp(state) CROSS JOIN pg_database\nLEFT JOIN (\n SELECT\n datname,\n state,\n usename,\n application_name,\n count(*) AS count,\n MAX(EXTRACT(EPOCH FROM now() - xact_start))::float AS max_tx_duration\n FROM pg_stat_activity GROUP BY datname,state,usename,application_name) AS tmp2\n ON tmp.state = tmp2.state AND 
pg_database.datname = tmp2.datname;\n"},{"metrics":[{"description":"Seconds since the last successful archive operation","metric_name":"ccp_archive_command_status_seconds_since_last_archive","static_attributes":{"server":"localhost:5432"},"value_column":"seconds_since_last_archive","value_type":"double"}],"sql":"SELECT COALESCE(EXTRACT(epoch from (CURRENT_TIMESTAMP - last_archived_time)), 0) AS seconds_since_last_archive FROM pg_catalog.pg_stat_archiver;\n"},{"metrics":[{"description":"Number of WAL files that have been successfully archived","metric_name":"ccp_archive_command_status_archived_count","static_attributes":{"server":"localhost:5432"},"value_column":"archived_count"}],"sql":"SELECT archived_count FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Number of failed attempts for archiving WAL files","metric_name":"ccp_archive_command_status_failed_count","static_attributes":{"server":"localhost:5432"},"value_column":"failed_count"}],"sql":"SELECT failed_count FROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Seconds since the last recorded failure of the archive_command","metric_name":"ccp_archive_command_status_seconds_since_last_fail","static_attributes":{"server":"localhost:5432"},"value_column":"seconds_since_last_fail"}],"sql":"SELECT CASE\n WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) IS NULL THEN 0\n WHEN EXTRACT(epoch from (last_failed_time - last_archived_time)) \u003c 0 THEN 0\n ELSE EXTRACT(epoch from (last_failed_time - last_archived_time))\n END AS seconds_since_last_fail\nFROM pg_catalog.pg_stat_archiver\n"},{"metrics":[{"description":"Total non-idle connections","metric_name":"ccp_connection_stats_active","static_attributes":{"server":"localhost:5432"},"value_column":"active"},{"description":"Total idle connections","metric_name":"ccp_connection_stats_idle","static_attributes":{"server":"localhost:5432"},"value_column":"idle"},{"description":"Total idle in transaction connections","metric_name":"ccp_connection_stats_idle_in_txn","static_attributes":{"server":"localhost:5432"},"value_column":"idle_in_txn"},{"description":"Value of max_connections for the monitored database","metric_name":"ccp_connection_stats_max_blocked_query_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_blocked_query_time","value_type":"double"},{"description":"Value of max_connections for the monitored database","metric_name":"ccp_connection_stats_max_connections","static_attributes":{"server":"localhost:5432"},"value_column":"max_connections"},{"description":"Length of time in seconds of the longest idle in transaction session","metric_name":"ccp_connection_stats_max_idle_in_txn_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_idle_in_txn_time","value_type":"double"},{"description":"Length of time in seconds of the longest running query","metric_name":"ccp_connection_stats_max_query_time","static_attributes":{"server":"localhost:5432"},"value_column":"max_query_time","value_type":"double"},{"description":"Total idle and non-idle connections","metric_name":"ccp_connection_stats_total","static_attributes":{"server":"localhost:5432"},"value_column":"total"}],"sql":"SELECT ((total - idle) - idle_in_txn) as active\n , total\n , idle\n , idle_in_txn\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - state_change))),0) FROM pg_catalog.pg_stat_activity WHERE state = 'idle in transaction') AS max_idle_in_txn_time\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - 
query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND state \u003c\u003e 'idle' ) AS max_query_time\n , (SELECT COALESCE(EXTRACT(epoch FROM (MAX(clock_timestamp() - query_start))),0) FROM pg_catalog.pg_stat_activity WHERE backend_type = 'client backend' AND wait_event_type = 'Lock' ) AS max_blocked_query_time\n , max_connections\n FROM (\n SELECT COUNT(*) as total\n , COALESCE(SUM(CASE WHEN state = 'idle' THEN 1 ELSE 0 END),0) AS idle\n , COALESCE(SUM(CASE WHEN state = 'idle in transaction' THEN 1 ELSE 0 END),0) AS idle_in_txn FROM pg_catalog.pg_stat_activity) x\n JOIN (SELECT setting::float AS max_connections FROM pg_settings WHERE name = 'max_connections') xx ON (true);\n"},{"metrics":[{"attribute_columns":["dbname"],"description":"Total number of checksum failures on this database","metric_name":"ccp_data_checksum_failure_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"},{"attribute_columns":["dbname"],"description":"Time interval in seconds since the last checksum failure was encountered","metric_name":"ccp_data_checksum_failure_time_since_last_failure_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"time_since_last_failure_seconds","value_type":"double"}],"sql":"SELECT datname AS dbname , checksum_failures AS count , coalesce(extract(epoch from (clock_timestamp() - checksum_last_failure)), 0) AS time_since_last_failure_seconds FROM pg_catalog.pg_stat_database WHERE pg_stat_database.datname IS NOT NULL;\n"},{"metrics":[{"attribute_columns":["dbname","mode"],"description":"Return value of 1 means database is in recovery. Otherwise 2 it is a primary.","metric_name":"ccp_locks_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT pg_database.datname as dbname , tmp.mode , COALESCE(count,0) as count FROM (\n VALUES ('accesssharelock'),\n ('rowsharelock'),\n ('rowexclusivelock'),\n ('shareupdateexclusivelock'),\n ('sharelock'),\n ('sharerowexclusivelock'),\n ('exclusivelock'),\n ('accessexclusivelock')\n) AS tmp(mode) CROSS JOIN pg_catalog.pg_database LEFT JOIN\n (SELECT database, lower(mode) AS mode,count(*) AS count\n FROM pg_catalog.pg_locks WHERE database IS NOT NULL\n GROUP BY database, lower(mode)\n) AS tmp2 ON tmp.mode=tmp2.mode and pg_database.oid = tmp2.database;\n"},{"metrics":[{"description":"CPU limit value in milli cores","metric_name":"ccp_nodemx_cpu_limit","static_attributes":{"server":"localhost:5432"},"value_column":"limit"},{"description":"CPU request value in milli cores","metric_name":"ccp_nodemx_cpu_request","static_attributes":{"server":"localhost:5432"},"value_column":"request"}],"sql":"SELECT monitor.kdapi_scalar_bigint('cpu_request') AS request , monitor.kdapi_scalar_bigint('cpu_limit') AS limit\n"},{"metrics":[{"description":"CPU usage in nanoseconds","metric_name":"ccp_nodemx_cpuacct_usage","static_attributes":{"server":"localhost:5432"},"value_column":"usage","value_type":"double"},{"description":"CPU usage snapshot timestamp","metric_name":"ccp_nodemx_cpuacct_usage_ts","static_attributes":{"server":"localhost:5432"},"value_column":"usage_ts","value_type":"double"}],"sql":"SELECT CASE WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('cpuacct.usage')\n ELSE (SELECT val FROM monitor.cgroup_setof_kv('cpu.stat') where key = 'usage_usec') * 1000\n END AS usage,\n extract(epoch from clock_timestamp()) AS usage_ts;\n"},{"metrics":[{"description":"The total available run-time within a period (in 
microseconds)","metric_name":"ccp_nodemx_cpucfs_period_us","static_attributes":{"server":"localhost:5432"},"value_column":"period_us"},{"description":"The length of a period (in microseconds)","metric_name":"ccp_nodemx_cpucfs_quota_us","static_attributes":{"server":"localhost:5432"},"value_column":"quota_us","value_type":"double"}],"sql":"SELECT\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n monitor.cgroup_scalar_bigint('cpu.cfs_period_us')\n ELSE\n (monitor.cgroup_array_bigint('cpu.max'))[2]\n END AS period_us,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n GREATEST(monitor.cgroup_scalar_bigint('cpu.cfs_quota_us'), 0)\n ELSE\n GREATEST((monitor.cgroup_array_bigint('cpu.max'))[1], 0)\n END AS quota_us;\n"},{"metrics":[{"description":"Number of periods that any thread was runnable","metric_name":"ccp_nodemx_cpustat_nr_periods","static_attributes":{"server":"localhost:5432"},"value_column":"nr_periods","value_type":"double"},{"description":"Number of runnable periods in which the application used its entire quota and was throttled","metric_name":"ccp_nodemx_cpustat_nr_throttled","static_attributes":{"server":"localhost:5432"},"value_column":"nr_throttled"},{"description":"CPU stat snapshot timestamp","metric_name":"ccp_nodemx_cpustat_snap_ts","static_attributes":{"server":"localhost:5432"},"value_column":"snap_ts","value_type":"double"},{"description":"Sum total amount of time individual threads within the monitor.cgroup were throttled","metric_name":"ccp_nodemx_cpustat_throttled_time","static_attributes":{"server":"localhost:5432"},"value_column":"throttled_time","value_type":"double"}],"sql":"WITH d(key, val) AS (select key, val from monitor.cgroup_setof_kv('cpu.stat')) SELECT\n (SELECT val FROM d WHERE key='nr_periods') AS nr_periods,\n (SELECT val FROM d WHERE key='nr_throttled') AS nr_throttled,\n (SELECT val FROM d WHERE key='throttled_usec') AS throttled_time,\n extract(epoch from clock_timestamp()) as snap_ts;\n"},{"metrics":[{"attribute_columns":["fs_type","mount_point"],"description":"Available size in bytes","metric_name":"ccp_nodemx_data_disk_available_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"available_bytes","value_type":"double"},{"attribute_columns":["fs_type","mount_point"],"description":"Available file nodes","metric_name":"ccp_nodemx_data_disk_free_file_nodes","static_attributes":{"server":"localhost:5432"},"value_column":"free_file_nodes"},{"attribute_columns":["fs_type","mount_point"],"description":"Size in bytes","metric_name":"ccp_nodemx_data_disk_total_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"total_bytes"},{"attribute_columns":["fs_type","mount_point"],"description":"Total file nodes","metric_name":"ccp_nodemx_data_disk_total_file_nodes","static_attributes":{"server":"localhost:5432"},"value_column":"total_file_nodes"}],"sql":"SELECT mount_point,fs_type,total_bytes,available_bytes,total_file_nodes,free_file_nodes\n FROM monitor.proc_mountinfo() m\n JOIN monitor.fsinfo(m.mount_point) f USING (major_number, minor_number)\n WHERE m.mount_point IN ('/pgdata', '/pgwal') OR\n m.mount_point like '/tablespaces/%'\n"},{"metrics":[{"attribute_columns":["mount_point"],"description":"Total sectors read","metric_name":"ccp_nodemx_disk_activity_sectors_read","static_attributes":{"server":"localhost:5432"},"value_column":"sectors_read"},{"attribute_columns":["mount_point"],"description":"Total sectors 
written","metric_name":"ccp_nodemx_disk_activity_sectors_written","static_attributes":{"server":"localhost:5432"},"value_column":"sectors_written"}],"sql":"SELECT mount_point,sectors_read,sectors_written\n FROM monitor.proc_mountinfo() m\n JOIN monitor.proc_diskstats() d USING (major_number, minor_number)\n WHERE m.mount_point IN ('/pgdata', '/pgwal') OR\n m.mount_point like '/tablespaces/%';\n"},{"metrics":[{"description":"Total bytes of anonymous and swap cache memory on active LRU list","metric_name":"ccp_nodemx_mem_active_anon","static_attributes":{"server":"localhost:5432"},"value_column":"active_anon","value_type":"double"},{"description":"Total bytes of file-backed memory on active LRU list","metric_name":"ccp_nodemx_mem_active_file","static_attributes":{"server":"localhost:5432"},"value_column":"active_file","value_type":"double"},{"description":"Total bytes of page cache memory","metric_name":"ccp_nodemx_mem_cache","static_attributes":{"server":"localhost:5432"},"value_column":"cache","value_type":"double"},{"description":"Total bytes that are waiting to get written back to the disk","metric_name":"ccp_nodemx_mem_dirty","static_attributes":{"server":"localhost:5432"},"value_column":"dirty"},{"description":"Total bytes of anonymous and swap cache memory on inactive LRU list","metric_name":"ccp_nodemx_mem_inactive_anon","static_attributes":{"server":"localhost:5432"},"value_column":"inactive_anon","value_type":"double"},{"description":"Total bytes of file-backed memory on inactive LRU list","metric_name":"ccp_nodemx_mem_inactive_file","static_attributes":{"server":"localhost:5432"},"value_column":"inactive_file","value_type":"double"},{"description":"Unknown metric from ccp_nodemx_mem","metric_name":"ccp_nodemx_mem_kmem_usage_in_byte","static_attributes":{"server":"localhost:5432"},"value_column":"kmem_usage_in_byte"},{"description":"Memory limit value in bytes","metric_name":"ccp_nodemx_mem_limit","static_attributes":{"server":"localhost:5432"},"value_column":"limit"},{"description":"Total bytes of mapped file (includes tmpfs/shmem)","metric_name":"ccp_nodemx_mem_mapped_file","static_attributes":{"server":"localhost:5432"},"value_column":"mapped_file"},{"description":"Memory request value in bytes","metric_name":"ccp_nodemx_mem_request","static_attributes":{"server":"localhost:5432"},"value_column":"request"},{"description":"Total bytes of anonymous and swap cache memory","metric_name":"ccp_nodemx_mem_rss","static_attributes":{"server":"localhost:5432"},"value_column":"rss","value_type":"double"},{"description":"Total bytes of shared memory","metric_name":"ccp_nodemx_mem_shmem","static_attributes":{"server":"localhost:5432"},"value_column":"shmem","value_type":"double"},{"description":"Total usage in bytes","metric_name":"ccp_nodemx_mem_usage_in_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"usage_in_bytes"}],"sql":"WITH d(key, val) as (SELECT key, val FROM monitor.cgroup_setof_kv('memory.stat')) SELECT\n monitor.kdapi_scalar_bigint('mem_request') AS request,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy' THEN\n (CASE WHEN monitor.cgroup_scalar_bigint('memory.limit_in_bytes') = 9223372036854771712 THEN 0 ELSE monitor.cgroup_scalar_bigint('memory.limit_in_bytes') END)\n ELSE\n (CASE WHEN monitor.cgroup_scalar_bigint('memory.max') = 9223372036854775807 THEN 0 ELSE monitor.cgroup_scalar_bigint('memory.max') END)\n END AS limit,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='cache')\n ELSE 0\n END as cache,\n CASE\n WHEN 
monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='rss')\n ELSE 0\n END as RSS,\n (SELECT val FROM d WHERE key='shmem') as shmem,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='mapped_file')\n ELSE 0\n END as mapped_file,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN (SELECT val FROM d WHERE key='dirty')\n ELSE (SELECT val FROM d WHERE key='file_dirty')\n END as dirty,\n (SELECT val FROM d WHERE key='active_anon') as active_anon,\n (SELECT val FROM d WHERE key='inactive_anon') as inactive_anon,\n (SELECT val FROM d WHERE key='active_file') as active_file,\n (SELECT val FROM d WHERE key='inactive_file') as inactive_file,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('memory.usage_in_bytes')\n ELSE monitor.cgroup_scalar_bigint('memory.current')\n END as usage_in_bytes,\n CASE\n WHEN monitor.cgroup_mode() = 'legacy'\n THEN monitor.cgroup_scalar_bigint('memory.kmem.usage_in_bytes')\n ELSE 0\n END as kmem_usage_in_byte;\n"},{"metrics":[{"attribute_columns":["interface"],"description":"Number of bytes received","metric_name":"ccp_nodemx_network_rx_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"rx_bytes"},{"attribute_columns":["interface"],"description":"Number of packets received","metric_name":"ccp_nodemx_network_rx_packets","static_attributes":{"server":"localhost:5432"},"value_column":"rx_packets"},{"attribute_columns":["interface"],"description":"Number of bytes transmitted","metric_name":"ccp_nodemx_network_tx_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"tx_bytes"},{"attribute_columns":["interface"],"description":"Number of packets transmitted","metric_name":"ccp_nodemx_network_tx_packets","static_attributes":{"server":"localhost:5432"},"value_column":"tx_packets"}],"sql":"SELECT interface\n ,tx_bytes\n ,tx_packets\n ,rx_bytes\n ,rx_packets from monitor.proc_network_stats()\n"},{"metrics":[{"description":"Total number of database processes","metric_name":"ccp_nodemx_process_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT monitor.cgroup_process_count() as count;\n"},{"metrics":[{"description":"Epoch time when stats were reset","metric_name":"ccp_pg_stat_statements_reset_time","static_attributes":{"server":"localhost:5432"},"value_column":"time"}],"sql":"SELECT monitor.pg_stat_statements_reset_info(-1) as time;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Average query runtime in milliseconds","metric_name":"ccp_pg_stat_statements_top_mean_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"top_mean_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time\n , s.mean_exec_time AS mean_exec_time\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , max(monitor.mean_exec_time) AS top_mean_exec_time_ms\nFROM monitor GROUP BY 1,2,3,4 ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","role"],"description":"Total number of queries run per 
user/database","metric_name":"ccp_pg_stat_statements_total_calls_count","static_attributes":{"server":"localhost:5432"},"value_column":"calls_count","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total runtime of all queries per user/database","metric_name":"ccp_pg_stat_statements_total_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"exec_time_ms","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total runtime of all queries per user/database","metric_name":"ccp_pg_stat_statements_total_mean_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"mean_exec_time_ms","value_type":"double"},{"attribute_columns":["dbname","role"],"description":"Total rows returned from all queries per user/database","metric_name":"ccp_pg_stat_statements_total_row_count","static_attributes":{"server":"localhost:5432"},"value_column":"row_count","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.calls\n , s.total_exec_time\n , s.mean_exec_time\n , s.rows\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , sum(calls) AS calls_count\n , sum(total_exec_time) AS exec_time_ms\n , avg(mean_exec_time) AS mean_exec_time_ms\n , sum(rows) AS row_count\nFROM monitor GROUP BY 1,2;\n"},{"metrics":[{"description":"The current version of PostgreSQL that this exporter is running on as a 6 digit integer (######).","metric_name":"ccp_postgresql_version_current","static_attributes":{"server":"localhost:5432"},"value_column":"current"}],"sql":"SELECT current_setting('server_version_num')::int AS current;\n"},{"metrics":[{"description":"Time interval in seconds since PostgreSQL database was last restarted.","metric_name":"ccp_postmaster_uptime_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"seconds","value_type":"double"}],"sql":"SELECT extract(epoch from (clock_timestamp() - pg_postmaster_start_time() )) AS seconds;\n"},{"metrics":[{"description":"Time interval in seconds since PostgreSQL database was last restarted.","metric_name":"ccp_replication_lag_size_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"bytes","value_type":"double"}],"sql":"SELECT * FROM get_replication_lag();\n"},{"metrics":[{"attribute_columns":["role"],"description":"Length of time since the last WAL file was received and replayed on replica.\nAlways increases, possibly causing false positives if the primary stops writing.\nMonitors for replicas that stop receiving WAL all together.\n","metric_name":"ccp_replication_lag_received_time","static_attributes":{"server":"localhost:5432"},"value_column":"received_time","value_type":"double"},{"attribute_columns":["role"],"description":"Length of time since the last transaction was replayed on replica.\nReturns zero if last WAL received equals last WAL replayed. Avoids\nfalse positives when primary stops writing. 
Monitors for replicas that\ncannot keep up with primary WAL generation.\n","metric_name":"ccp_replication_lag_replay_time","static_attributes":{"server":"localhost:5432"},"value_column":"replay_time","value_type":"double"}],"sql":"SELECT\n COALESCE(\n CASE\n WHEN (pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn()) OR (pg_is_in_recovery() = false) THEN 0\n ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER\n END,\n 0\n ) AS replay_time,\n COALESCE(\n CASE\n WHEN pg_is_in_recovery() = false THEN 0\n ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER\n END,\n 0\n ) AS received_time,\n CASE\n WHEN pg_is_in_recovery() = true THEN 'replica'\n ELSE 'primary'\n END AS role;\n"},{"metrics":[{"description":"Number of settings from pg_settings catalog in a pending_restart state","metric_name":"ccp_settings_pending_restart_count","static_attributes":{"server":"localhost:5432"},"value_column":"count"}],"sql":"SELECT count(*) AS count FROM pg_catalog.pg_settings WHERE pending_restart = true;\n"},{"metrics":[{"description":"Number of buffers allocated","metric_name":"ccp_stat_bgwriter_buffers_alloc","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_alloc"},{"data_type":"sum","description":"Number of buffers written by the background writer","metric_name":"ccp_stat_bgwriter_buffers_clean","static_attributes":{"server":"localhost:5432"},"value_column":"buffers_clean"},{"description":"Number of times the background writer stopped a cleaning scan because it had written too many buffers","metric_name":"ccp_stat_bgwriter_maxwritten_clean","static_attributes":{"server":"localhost:5432"},"value_column":"maxwritten_clean"}],"sql":"SELECT\n buffers_clean\n , maxwritten_clean\n , buffers_alloc\nFROM pg_catalog.pg_stat_bgwriter;\n"},{"metrics":[{"description":"Oldest current transaction ID in cluster","metric_name":"ccp_transaction_wraparound_oldest_current_xid","static_attributes":{"server":"localhost:5432"},"value_column":"oldest_current_xid"},{"description":"Percentage towards emergency autovacuum process starting","metric_name":"ccp_transaction_wraparound_percent_towards_emergency_autovac","static_attributes":{"server":"localhost:5432"},"value_column":"percent_towards_emergency_autovac"},{"description":"Percentage towards transaction ID wraparound","metric_name":"ccp_transaction_wraparound_percent_towards_wraparound","static_attributes":{"server":"localhost:5432"},"value_column":"percent_towards_wraparound"}],"sql":"WITH max_age AS (\n SELECT 2000000000 as max_old_xid\n , setting AS autovacuum_freeze_max_age\n FROM pg_catalog.pg_settings\n WHERE name = 'autovacuum_freeze_max_age')\n, per_database_stats AS (\n SELECT datname\n , m.max_old_xid::int\n , m.autovacuum_freeze_max_age::int\n , age(d.datfrozenxid) AS oldest_current_xid\n FROM pg_catalog.pg_database d\n JOIN max_age m ON (true)\n WHERE d.datallowconn)\nSELECT max(oldest_current_xid) AS oldest_current_xid , max(ROUND(100*(oldest_current_xid/max_old_xid::float))) AS percent_towards_wraparound , max(ROUND(100*(oldest_current_xid/autovacuum_freeze_max_age::float))) AS percent_towards_emergency_autovac FROM per_database_stats;\n"},{"metrics":[{"description":"Current size in bytes of the WAL directory","metric_name":"ccp_wal_activity_total_size_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"total_size_bytes"}],"sql":"SELECT last_5_min_size_bytes,\n (SELECT COALESCE(sum(size),0) FROM pg_catalog.pg_ls_waldir()) AS total_size_bytes\n FROM 
(SELECT COALESCE(sum(size),0) AS last_5_min_size_bytes FROM pg_catalog.pg_ls_waldir() WHERE modification \u003e CURRENT_TIMESTAMP - '5 minutes'::interval) x;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Maximum query runtime in milliseconds","metric_name":"ccp_pg_stat_statements_top_max_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"max_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time_ms\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , max_exec_time_ms\n , records\nFROM monitor ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Total time spent in the statement in milliseconds","metric_name":"ccp_pg_stat_statements_top_total_exec_time_ms","static_attributes":{"server":"localhost:5432"},"value_column":"total_exec_time_ms","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time_ms\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , queryid\n , query\n , total_exec_time_ms\n , records\nFROM monitor ORDER BY 5 DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["dbname","query","queryid","role"],"description":"Total amount of WAL generated by the statement in bytes","metric_name":"ccp_pg_stat_statements_top_wal_bytes","static_attributes":{"server":"localhost:5432"},"value_column":"bytes","value_type":"double"},{"attribute_columns":["dbname","query","queryid","role"],"description":"Total number of WAL full page images generated by the statement","metric_name":"ccp_pg_stat_statements_top_wal_fpi","static_attributes":{"server":"localhost:5432"},"value_column":"fpi","value_type":"double"},{"attribute_columns":["dbname","query","queryid","role"],"description":"Total number of WAL records generated by the statement","metric_name":"ccp_pg_stat_statements_top_wal_records","static_attributes":{"server":"localhost:5432"},"value_column":"records","value_type":"double"}],"sql":"WITH monitor AS (\n SELECT\n pg_get_userbyid(s.userid) AS role\n , d.datname AS dbname\n , s.queryid AS queryid\n , btrim(replace(left(s.query, 40), '\\n', '')) AS query\n , s.calls\n , s.total_exec_time AS total_exec_time\n , s.max_exec_time AS max_exec_time\n , s.mean_exec_time AS mean_exec_time\n , s.rows\n , s.wal_records AS records\n , s.wal_fpi AS fpi\n , s.wal_bytes AS bytes\n FROM public.pg_stat_statements s\n JOIN pg_catalog.pg_database d ON d.oid = s.dbid\n) SELECT role\n , dbname\n , query\n , queryid\n , records\n , fpi\n , bytes\nFROM monitor ORDER BY bytes DESC LIMIT 20;\n"},{"metrics":[{"attribute_columns":["repo"],"description":"Seconds since the last completed full or differential backup. 
Differential is always based off last full.","metric_name":"ccp_backrest_last_diff_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_diff_backup"},{"attribute_columns":["repo"],"description":"Seconds since the last completed full backup","metric_name":"ccp_backrest_last_full_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_full_backup"},{"attribute_columns":["repo"],"description":"Seconds since the last completed full, differential or incremental backup.\nIncremental is always based off last full or differential.\n","metric_name":"ccp_backrest_last_incr_backup_time_since_completion_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_incr_backup"},{"attribute_columns":["backup_type","repo"],"description":"pgBackRest version number when this backup was performed","metric_name":"ccp_backrest_last_info_backrest_repo_version","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_info_backrest_repo_version"},{"attribute_columns":["backup_type","repo"],"description":"An error has been encountered in the backup. Check logs for more information.","metric_name":"ccp_backrest_last_info_backup_error","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"last_info_backup_error"},{"attribute_columns":["backup_type","repo"],"description":"Total runtime in seconds of this backup","metric_name":"ccp_backrest_last_info_backup_runtime_seconds","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"backup_runtime_seconds"},{"attribute_columns":["backup_type","repo"],"description":"Actual size of only this individual backup in the pgbackrest repository","metric_name":"ccp_backrest_last_info_repo_backup_size_bytes","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"repo_backup_size_bytes"},{"attribute_columns":["backup_type","repo"],"description":"Total size of this backup in the pgbackrest repository, including all required previous backups and WAL","metric_name":"ccp_backrest_last_info_repo_total_size_bytes","static_attributes":{"server":"localhost:5432","stanza":"db"},"value_column":"repo_total_size_bytes"},{"attribute_columns":["repo"],"description":"Seconds since the oldest completed full backup","metric_name":"ccp_backrest_oldest_full_backup_time_seconds","static_attributes":{"server":"localhost:5432"},"value_column":"oldest_full_backup"}],"sql":"SELECT * FROM get_pgbackrest_info();\n"}] diff --git a/internal/collector/postgres_5s_metrics.yaml b/internal/collector/postgres_5s_metrics.yaml index 835c9d4337..4f1a142782 100644 --- a/internal/collector/postgres_5s_metrics.yaml +++ b/internal/collector/postgres_5s_metrics.yaml @@ -43,8 +43,9 @@ - sql: > SELECT - EXTRACT(epoch from (CURRENT_TIMESTAMP - last_archived_time)) AS seconds_since_last_archive - FROM pg_catalog.pg_stat_archiver + COALESCE(EXTRACT(epoch from (CURRENT_TIMESTAMP - last_archived_time)), 0) AS seconds_since_last_archive + FROM pg_catalog.pg_stat_archiver; + metrics: - metric_name: ccp_archive_command_status_seconds_since_last_archive value_column: seconds_since_last_archive From e043b156118a4e0dabd643a7324928544c274174 Mon Sep 17 00:00:00 2001 From: tony-landreth Date: Fri, 21 Feb 2025 09:58:39 -0500 Subject: [PATCH 11/22] Don't disable ssl_mode --- internal/collector/postgres_metrics.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/internal/collector/postgres_metrics.go b/internal/collector/postgres_metrics.go index 92aed244f5..16712dd00d 100644 --- a/internal/collector/postgres_metrics.go +++ b/internal/collector/postgres_metrics.go @@ -56,7 +56,7 @@ func EnablePostgresMetrics(ctx context.Context, inCluster *v1beta1.PostgresClust config.Receivers[FiveSecondSqlQuery] = map[string]any{ "driver": "postgres", - "datasource": fmt.Sprintf(`host=localhost dbname=postgres port=5432 user=%s password=${env:PGPASSWORD} sslmode=disable`, pgmonitor.MonitoringUser), + "datasource": fmt.Sprintf(`host=localhost dbname=postgres port=5432 user=%s password=${env:PGPASSWORD}`, pgmonitor.MonitoringUser), "collection_interval": "5s", // Give Postgres time to finish setup. "initial_delay": "10s", @@ -65,7 +65,7 @@ func EnablePostgresMetrics(ctx context.Context, inCluster *v1beta1.PostgresClust config.Receivers[FiveMinuteSqlQuery] = map[string]any{ "driver": "postgres", - "datasource": fmt.Sprintf(`host=localhost dbname=postgres port=5432 user=%s password=${env:PGPASSWORD} sslmode=disable`, pgmonitor.MonitoringUser), + "datasource": fmt.Sprintf(`host=localhost dbname=postgres port=5432 user=%s password=${env:PGPASSWORD}`, pgmonitor.MonitoringUser), "collection_interval": "300s", // Give Postgres time to finish setup. "initial_delay": "10s", From 9008cc5c576ac75b85575a81372d41931e3fe45e Mon Sep 17 00:00:00 2001 From: tony-landreth Date: Fri, 21 Feb 2025 10:04:56 -0500 Subject: [PATCH 12/22] Moves guard into reconcileExporterWebConfig --- internal/controller/postgrescluster/controller.go | 3 +-- internal/controller/postgrescluster/pgmonitor.go | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/internal/controller/postgrescluster/controller.go b/internal/controller/postgrescluster/controller.go index c5daef0c01..f884398902 100644 --- a/internal/controller/postgrescluster/controller.go +++ b/internal/controller/postgrescluster/controller.go @@ -30,7 +30,6 @@ import ( "github.com/crunchydata/postgres-operator/internal/collector" "github.com/crunchydata/postgres-operator/internal/config" "github.com/crunchydata/postgres-operator/internal/controller/runtime" - "github.com/crunchydata/postgres-operator/internal/feature" "github.com/crunchydata/postgres-operator/internal/initialize" "github.com/crunchydata/postgres-operator/internal/kubernetes" "github.com/crunchydata/postgres-operator/internal/logging" @@ -345,7 +344,7 @@ func (r *Reconciler) Reconcile( if err == nil { exporterQueriesConfig, err = r.reconcileExporterQueriesConfig(ctx, cluster) } - if err == nil && !feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + if err == nil { exporterWebConfig, err = r.reconcileExporterWebConfig(ctx, cluster) } if err == nil { diff --git a/internal/controller/postgrescluster/pgmonitor.go b/internal/controller/postgrescluster/pgmonitor.go index 70990f92e8..4ab37266fb 100644 --- a/internal/controller/postgrescluster/pgmonitor.go +++ b/internal/controller/postgrescluster/pgmonitor.go @@ -402,6 +402,10 @@ func addPGMonitorExporterToInstancePodSpec( func (r *Reconciler) reconcileExporterWebConfig(ctx context.Context, cluster *v1beta1.PostgresCluster) (*corev1.ConfigMap, error) { + if feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + return nil, nil + } + existing := &corev1.ConfigMap{ObjectMeta: naming.ExporterWebConfigMap(cluster)} err := errors.WithStack(r.Client.Get(ctx, client.ObjectKeyFromObject(existing), existing)) if client.IgnoreNotFound(err) != nil { From c75a6ba5d9063855cdcf741a72b96bc3466c13ec Mon Sep 17 00:00:00 2001 
From: tony-landreth Date: Fri, 21 Feb 2025 10:15:08 -0500 Subject: [PATCH 13/22] Removes superfluous function --- internal/controller/postgrescluster/instance.go | 2 +- internal/controller/postgrescluster/pgmonitor.go | 13 ------------- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/internal/controller/postgrescluster/instance.go b/internal/controller/postgrescluster/instance.go index 1c32ba8f1d..d7d3d5fc01 100644 --- a/internal/controller/postgrescluster/instance.go +++ b/internal/controller/postgrescluster/instance.go @@ -1224,7 +1224,7 @@ func (r *Reconciler) reconcileInstance( // Add postgres-exporter to the instance Pod spec if err == nil && !feature.Enabled(ctx, feature.OpenTelemetryMetrics) { - err = addPGExporterToInstancePodSpec(ctx, cluster, &instance.Spec.Template, exporterQueriesConfig, exporterWebConfig) + err = addPGMonitorExporterToInstancePodSpec(ctx, cluster, &instance.Spec.Template, exporterQueriesConfig, exporterWebConfig) } // add nss_wrapper init container and add nss_wrapper env vars to the database and pgbackrest diff --git a/internal/controller/postgrescluster/pgmonitor.go b/internal/controller/postgrescluster/pgmonitor.go index 4ab37266fb..598d3bb782 100644 --- a/internal/controller/postgrescluster/pgmonitor.go +++ b/internal/controller/postgrescluster/pgmonitor.go @@ -236,19 +236,6 @@ func (r *Reconciler) reconcileMonitoringSecret( return nil, err } -// addPGExporterToInstancePodSpec performs the necessary setup to add -// pgMonitor resources on a PodTemplateSpec for running postgres-exporter. -func addPGExporterToInstancePodSpec( - ctx context.Context, - cluster *v1beta1.PostgresCluster, - template *corev1.PodTemplateSpec, - exporterQueriesConfig, exporterWebConfig *corev1.ConfigMap) error { - - err := addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, exporterQueriesConfig, exporterWebConfig) - - return err -} - // addPGMonitorExporterToInstancePodSpec performs the necessary setup to // add pgMonitor exporter resources to a PodTemplateSpec // TODO (jmckulk): refactor to pass around monitoring secret; Without the secret From 50554ddecc8e9f8b66c5c5c56ab9d515c5f017dd Mon Sep 17 00:00:00 2001 From: tony-landreth Date: Fri, 21 Feb 2025 10:35:38 -0500 Subject: [PATCH 14/22] Applies processors --- internal/collector/postgres_metrics.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/internal/collector/postgres_metrics.go b/internal/collector/postgres_metrics.go index 16712dd00d..bf541c338d 100644 --- a/internal/collector/postgres_metrics.go +++ b/internal/collector/postgres_metrics.go @@ -74,6 +74,10 @@ func EnablePostgresMetrics(ctx context.Context, inCluster *v1beta1.PostgresClust // Add Metrics Pipeline config.Pipelines[PostgresMetrics] = Pipeline{ Receivers: []ComponentID{FiveSecondSqlQuery, FiveMinuteSqlQuery}, + Processors: []ComponentID{ + SubSecondBatchProcessor, + CompactingProcessor, + }, Exporters: []ComponentID{Prometheus}, } } From 7e61cfa6f2451b510632b470b4fbda206ac6b5f3 Mon Sep 17 00:00:00 2001 From: tony-landreth Date: Fri, 21 Feb 2025 10:59:14 -0500 Subject: [PATCH 15/22] More cleanup around exporter logic --- internal/controller/postgrescluster/controller.go | 2 +- internal/controller/postgrescluster/instance.go | 2 +- internal/controller/postgrescluster/pgmonitor.go | 14 +------------- 3 files changed, 3 insertions(+), 15 deletions(-) diff --git a/internal/controller/postgrescluster/controller.go b/internal/controller/postgrescluster/controller.go index f884398902..c200fa0e27 100644 --- 
a/internal/controller/postgrescluster/controller.go +++ b/internal/controller/postgrescluster/controller.go @@ -383,7 +383,7 @@ func (r *Reconciler) Reconcile( err = r.reconcilePGBouncer(ctx, cluster, instances, primaryCertificate, rootCA) } if err == nil { - err = r.reconcilePGMonitor(ctx, cluster, instances, monitoringSecret) + err = r.reconcilePGMonitorExporter(ctx, cluster, instances, monitoringSecret) } if err == nil { err = r.reconcileDatabaseInitSQL(ctx, cluster, instances) diff --git a/internal/controller/postgrescluster/instance.go b/internal/controller/postgrescluster/instance.go index d7d3d5fc01..363a51e8a0 100644 --- a/internal/controller/postgrescluster/instance.go +++ b/internal/controller/postgrescluster/instance.go @@ -1223,7 +1223,7 @@ func (r *Reconciler) reconcileInstance( } // Add postgres-exporter to the instance Pod spec - if err == nil && !feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + if err == nil { err = addPGMonitorExporterToInstancePodSpec(ctx, cluster, &instance.Spec.Template, exporterQueriesConfig, exporterWebConfig) } diff --git a/internal/controller/postgrescluster/pgmonitor.go b/internal/controller/postgrescluster/pgmonitor.go index 598d3bb782..eff67b166c 100644 --- a/internal/controller/postgrescluster/pgmonitor.go +++ b/internal/controller/postgrescluster/pgmonitor.go @@ -31,18 +31,6 @@ import ( //go:embed "metrics_setup.sql" var metricsSetupForOTelCollector string -// If pgMonitor is enabled the pgMonitor sidecar(s) have been added to the -// instance pod. reconcilePGMonitor will update the database to -// create the necessary objects for the tool to run -func (r *Reconciler) reconcilePGMonitor(ctx context.Context, - cluster *v1beta1.PostgresCluster, instances *observedInstances, - monitoringSecret *corev1.Secret) error { - - err := r.reconcilePGMonitorExporter(ctx, cluster, instances, monitoringSecret) - - return err -} - // reconcilePGMonitorExporter performs setup the postgres_exporter sidecar // - PodExec to run the sql in the primary database // Status.Monitoring.ExporterConfiguration is used to determine when the @@ -247,7 +235,7 @@ func addPGMonitorExporterToInstancePodSpec( template *corev1.PodTemplateSpec, exporterQueriesConfig, exporterWebConfig *corev1.ConfigMap) error { - if !pgmonitor.ExporterEnabled(cluster) && !feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + if !pgmonitor.ExporterEnabled(cluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) { return nil } From 420fe70af7d0ff53e281b31c85a69c8df8478ce0 Mon Sep 17 00:00:00 2001 From: tony-landreth Date: Fri, 21 Feb 2025 11:15:00 -0500 Subject: [PATCH 16/22] Removes unused error return type --- internal/controller/postgrescluster/instance.go | 2 +- internal/controller/postgrescluster/pgmonitor.go | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/internal/controller/postgrescluster/instance.go b/internal/controller/postgrescluster/instance.go index 363a51e8a0..3bbd10b0c3 100644 --- a/internal/controller/postgrescluster/instance.go +++ b/internal/controller/postgrescluster/instance.go @@ -1224,7 +1224,7 @@ func (r *Reconciler) reconcileInstance( // Add postgres-exporter to the instance Pod spec if err == nil { - err = addPGMonitorExporterToInstancePodSpec(ctx, cluster, &instance.Spec.Template, exporterQueriesConfig, exporterWebConfig) + addPGMonitorExporterToInstancePodSpec(ctx, cluster, &instance.Spec.Template, exporterQueriesConfig, exporterWebConfig) } // add nss_wrapper init container and add nss_wrapper env vars to the database and pgbackrest 
diff --git a/internal/controller/postgrescluster/pgmonitor.go b/internal/controller/postgrescluster/pgmonitor.go index eff67b166c..69ab0f38ed 100644 --- a/internal/controller/postgrescluster/pgmonitor.go +++ b/internal/controller/postgrescluster/pgmonitor.go @@ -233,10 +233,10 @@ func addPGMonitorExporterToInstancePodSpec( ctx context.Context, cluster *v1beta1.PostgresCluster, template *corev1.PodTemplateSpec, - exporterQueriesConfig, exporterWebConfig *corev1.ConfigMap) error { + exporterQueriesConfig, exporterWebConfig *corev1.ConfigMap) { if !pgmonitor.ExporterEnabled(cluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) { - return nil + return } certSecret := cluster.Spec.Monitoring.PGMonitor.Exporter.CustomTLSSecret @@ -369,8 +369,6 @@ func addPGMonitorExporterToInstancePodSpec( // add the proper label to support Pod discovery by Prometheus per pgMonitor configuration initialize.Labels(template) template.Labels[naming.LabelPGMonitorDiscovery] = "true" - - return nil } // reconcileExporterWebConfig reconciles the configmap containing the webconfig for exporter tls From 0faa18e1f9c2b92544e8fde60053567ecb3a7e4e Mon Sep 17 00:00:00 2001 From: tony-landreth Date: Fri, 21 Feb 2025 11:45:18 -0500 Subject: [PATCH 17/22] When OpenTelemetryMetrics enabled, ~ExporterEnabled --- .../controller/postgrescluster/pgmonitor.go | 12 +++++------ .../postgrescluster/pgmonitor_test.go | 20 ++++++++----------- internal/pgmonitor/postgres.go | 4 ++-- internal/pgmonitor/util.go | 6 +++++- internal/pgmonitor/util_test.go | 18 +++++++++++++---- 5 files changed, 35 insertions(+), 25 deletions(-) diff --git a/internal/controller/postgrescluster/pgmonitor.go b/internal/controller/postgrescluster/pgmonitor.go index 69ab0f38ed..9dae851db4 100644 --- a/internal/controller/postgrescluster/pgmonitor.go +++ b/internal/controller/postgrescluster/pgmonitor.go @@ -62,7 +62,7 @@ func (r *Reconciler) reconcilePGMonitorExporter(ctx context.Context, // the `EnableExporterInPostgreSQL` funcs; that way we are always running // that function against an updated and running pod. - if pgmonitor.ExporterEnabled(cluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + if pgmonitor.ExporterEnabled(ctx, cluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) { sql, err := os.ReadFile(fmt.Sprintf("%s/pg%d/setup.sql", pgmonitor.GetQueriesConfigDir(ctx), cluster.Spec.PostgresVersion)) if err != nil { return err @@ -99,7 +99,7 @@ func (r *Reconciler) reconcilePGMonitorExporter(ctx context.Context, return pgmonitor.EnableExporterInPostgreSQL(ctx, exec, monitoringSecret, pgmonitor.ExporterDB, setup) } - if !pgmonitor.ExporterEnabled(cluster) && !feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + if !pgmonitor.ExporterEnabled(ctx, cluster) && !feature.Enabled(ctx, feature.OpenTelemetryMetrics) { action = func(ctx context.Context, exec postgres.Executor) error { return pgmonitor.DisableExporterInPostgreSQL(ctx, exec) } @@ -157,7 +157,7 @@ func (r *Reconciler) reconcileMonitoringSecret( return nil, err } - if !pgmonitor.ExporterEnabled(cluster) && !feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + if !pgmonitor.ExporterEnabled(ctx, cluster) && !feature.Enabled(ctx, feature.OpenTelemetryMetrics) { // TODO: Checking if the exporter is enabled to determine when monitoring // secret should be created. If more tools are added to the monitoring // suite, they could need the secret when the exporter is not enabled. 
@@ -235,7 +235,7 @@ func addPGMonitorExporterToInstancePodSpec( template *corev1.PodTemplateSpec, exporterQueriesConfig, exporterWebConfig *corev1.ConfigMap) { - if !pgmonitor.ExporterEnabled(cluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + if !pgmonitor.ExporterEnabled(ctx, cluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) { return } @@ -385,7 +385,7 @@ func (r *Reconciler) reconcileExporterWebConfig(ctx context.Context, return nil, err } - if !pgmonitor.ExporterEnabled(cluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) || cluster.Spec.Monitoring.PGMonitor.Exporter.CustomTLSSecret == nil { + if !pgmonitor.ExporterEnabled(ctx, cluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) || cluster.Spec.Monitoring.PGMonitor.Exporter.CustomTLSSecret == nil { // We could still have a NotFound error here so check the err. // If no error that means the configmap is found and needs to be deleted if err == nil { @@ -442,7 +442,7 @@ func (r *Reconciler) reconcileExporterQueriesConfig(ctx context.Context, return nil, err } - if !pgmonitor.ExporterEnabled(cluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + if !pgmonitor.ExporterEnabled(ctx, cluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) { // We could still have a NotFound error here so check the err. // If no error that means the configmap is found and needs to be deleted if err == nil { diff --git a/internal/controller/postgrescluster/pgmonitor_test.go b/internal/controller/postgrescluster/pgmonitor_test.go index 36a5027aaa..bf46dd204b 100644 --- a/internal/controller/postgrescluster/pgmonitor_test.go +++ b/internal/controller/postgrescluster/pgmonitor_test.go @@ -39,7 +39,7 @@ func testExporterCollectorsAnnotation(t *testing.T, ctx context.Context, cluster naming.PostgresExporterCollectorsAnnotation: "wrong-value", }) - assert.NilError(t, addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, queriesConfig, webConfig)) + addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, queriesConfig, webConfig) assert.Equal(t, len(template.Spec.Containers), 1) container := template.Spec.Containers[0] @@ -56,7 +56,7 @@ func testExporterCollectorsAnnotation(t *testing.T, ctx context.Context, cluster naming.PostgresExporterCollectorsAnnotation: "None", }) - assert.NilError(t, addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, queriesConfig, webConfig)) + addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, queriesConfig, webConfig) assert.Equal(t, len(template.Spec.Containers), 1) container := template.Spec.Containers[0] @@ -71,7 +71,7 @@ func testExporterCollectorsAnnotation(t *testing.T, ctx context.Context, cluster naming.PostgresExporterCollectorsAnnotation: "none", }) - assert.NilError(t, addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, queriesConfig, webConfig)) + addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, queriesConfig, webConfig) assert.Assert(t, cmp.Contains(strings.Join(template.Spec.Containers[0].Command, "\n"), "--[no-]collector")) }) }) @@ -100,7 +100,7 @@ func TestAddPGMonitorExporterToInstancePodSpec(t *testing.T) { t.Run("ExporterDisabled", func(t *testing.T) { template := &corev1.PodTemplateSpec{} - assert.NilError(t, addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, nil, nil)) + addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, nil, nil) assert.DeepEqual(t, template, &corev1.PodTemplateSpec{}) }) @@ -121,8 +121,7 @@ func TestAddPGMonitorExporterToInstancePodSpec(t *testing.T) { }, 
} - assert.NilError(t, addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, exporterQueriesConfig, nil)) - + addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, exporterQueriesConfig, nil) assert.Equal(t, len(template.Spec.Containers), 2) container := template.Spec.Containers[1] @@ -205,8 +204,7 @@ volumeMounts: }, } - assert.NilError(t, addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, exporterQueriesConfig, nil)) - + addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, exporterQueriesConfig, nil) assert.Equal(t, len(template.Spec.Containers), 2) container := template.Spec.Containers[1] @@ -255,8 +253,7 @@ name: exporter-config }, } - assert.NilError(t, addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, exporterQueriesConfig, nil)) - + addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, exporterQueriesConfig, nil) assert.Equal(t, len(template.Spec.Containers), 2) container := template.Spec.Containers[1] @@ -301,8 +298,7 @@ name: exporter-config testConfigMap := new(corev1.ConfigMap) testConfigMap.Name = "test-web-conf" - assert.NilError(t, addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, exporterQueriesConfig, testConfigMap)) - + addPGMonitorExporterToInstancePodSpec(ctx, cluster, template, exporterQueriesConfig, testConfigMap) assert.Equal(t, len(template.Spec.Containers), 2) container := template.Spec.Containers[1] diff --git a/internal/pgmonitor/postgres.go b/internal/pgmonitor/postgres.go index ae01614ab9..19268f5c54 100644 --- a/internal/pgmonitor/postgres.go +++ b/internal/pgmonitor/postgres.go @@ -24,7 +24,7 @@ const ( // PostgreSQLHBAs provides the Postgres HBA rules for allowing the monitoring // exporter to be accessible func PostgreSQLHBAs(ctx context.Context, inCluster *v1beta1.PostgresCluster, outHBAs *postgres.HBAs) { - if ExporterEnabled(inCluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + if ExporterEnabled(ctx, inCluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) { // Limit the monitoring user to local connections using SCRAM. 
outHBAs.Mandatory = append(outHBAs.Mandatory, postgres.NewHBA().TCP().User(MonitoringUser).Method("scram-sha-256").Network("127.0.0.0/8"), @@ -36,7 +36,7 @@ func PostgreSQLHBAs(ctx context.Context, inCluster *v1beta1.PostgresCluster, out // PostgreSQLParameters provides additional required configuration parameters // that Postgres needs to support monitoring func PostgreSQLParameters(ctx context.Context, inCluster *v1beta1.PostgresCluster, outParameters *postgres.Parameters) { - if ExporterEnabled(inCluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + if ExporterEnabled(ctx, inCluster) || feature.Enabled(ctx, feature.OpenTelemetryMetrics) { // Exporter expects that shared_preload_libraries are installed // pg_stat_statements: https://access.crunchydata.com/documentation/pgmonitor/latest/exporter/ // pgnodemx: https://github.com/CrunchyData/pgnodemx diff --git a/internal/pgmonitor/util.go b/internal/pgmonitor/util.go index 8c89815829..32cf222448 100644 --- a/internal/pgmonitor/util.go +++ b/internal/pgmonitor/util.go @@ -8,6 +8,7 @@ import ( "context" "os" + "github.com/crunchydata/postgres-operator/internal/feature" "github.com/crunchydata/postgres-operator/internal/logging" "github.com/crunchydata/postgres-operator/pkg/apis/postgres-operator.crunchydata.com/v1beta1" ) @@ -26,7 +27,7 @@ func GetQueriesConfigDir(ctx context.Context) string { } // ExporterEnabled returns true if the monitoring exporter is enabled -func ExporterEnabled(cluster *v1beta1.PostgresCluster) bool { +func ExporterEnabled(ctx context.Context, cluster *v1beta1.PostgresCluster) bool { if cluster.Spec.Monitoring == nil { return false } @@ -36,5 +37,8 @@ func ExporterEnabled(cluster *v1beta1.PostgresCluster) bool { if cluster.Spec.Monitoring.PGMonitor.Exporter == nil { return false } + if feature.Enabled(ctx, feature.OpenTelemetryMetrics) { + return false + } return true } diff --git a/internal/pgmonitor/util_test.go b/internal/pgmonitor/util_test.go index 30d28b45d7..e83bbb3730 100644 --- a/internal/pgmonitor/util_test.go +++ b/internal/pgmonitor/util_test.go @@ -5,24 +5,34 @@ package pgmonitor import ( + "context" "testing" "gotest.tools/v3/assert" + "github.com/crunchydata/postgres-operator/internal/feature" "github.com/crunchydata/postgres-operator/pkg/apis/postgres-operator.crunchydata.com/v1beta1" ) func TestExporterEnabled(t *testing.T) { cluster := &v1beta1.PostgresCluster{} - assert.Assert(t, !ExporterEnabled(cluster)) + ctx := context.Background() + assert.Assert(t, !ExporterEnabled(ctx, cluster)) cluster.Spec.Monitoring = &v1beta1.MonitoringSpec{} - assert.Assert(t, !ExporterEnabled(cluster)) + assert.Assert(t, !ExporterEnabled(ctx, cluster)) cluster.Spec.Monitoring.PGMonitor = &v1beta1.PGMonitorSpec{} - assert.Assert(t, !ExporterEnabled(cluster)) + assert.Assert(t, !ExporterEnabled(ctx, cluster)) cluster.Spec.Monitoring.PGMonitor.Exporter = &v1beta1.ExporterSpec{} - assert.Assert(t, ExporterEnabled(cluster)) + assert.Assert(t, ExporterEnabled(ctx, cluster)) + gate := feature.NewGate() + assert.NilError(t, gate.SetFromMap(map[string]bool{ + feature.OpenTelemetryMetrics: true, + })) + ctx = feature.NewContext(ctx, gate) + cluster.Spec.Monitoring.PGMonitor.Exporter = &v1beta1.ExporterSpec{} + assert.Assert(t, !ExporterEnabled(ctx, cluster)) } From 90ae16392a31970d15b94aabc54c01682ef232b4 Mon Sep 17 00:00:00 2001 From: tony-landreth Date: Fri, 21 Feb 2025 11:50:28 -0500 Subject: [PATCH 18/22] Add Processors for patroni and bouncer metrics --- internal/collector/patroni.go | 4 ++++ 
internal/collector/pgbouncer.go | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/internal/collector/patroni.go b/internal/collector/patroni.go index 0924167987..61b18919cc 100644 --- a/internal/collector/patroni.go +++ b/internal/collector/patroni.go @@ -162,6 +162,10 @@ func EnablePatroniMetrics(ctx context.Context, // Add Metrics Pipeline outConfig.Pipelines[PatroniMetrics] = Pipeline{ Receivers: []ComponentID{Prometheus}, + Processors: []ComponentID{ + SubSecondBatchProcessor, + CompactingProcessor, + }, Exporters: []ComponentID{Prometheus}, } } diff --git a/internal/collector/pgbouncer.go b/internal/collector/pgbouncer.go index 424683e3af..7bb1d5acd3 100644 --- a/internal/collector/pgbouncer.go +++ b/internal/collector/pgbouncer.go @@ -186,6 +186,10 @@ func EnablePgBouncerMetrics(ctx context.Context, config *Config, sqlQueryUsernam // Add Metrics Pipeline config.Pipelines[PGBouncerMetrics] = Pipeline{ Receivers: []ComponentID{SqlQuery}, + Processors: []ComponentID{ + SubSecondBatchProcessor, + CompactingProcessor, + }, Exporters: []ComponentID{Prometheus}, } } From d71afacc98b1b8a14ed9d2750fe73b66e314a4bf Mon Sep 17 00:00:00 2001 From: tony-landreth Date: Fri, 21 Feb 2025 14:21:42 -0500 Subject: [PATCH 19/22] Prometheus exporter port should match postgres-exporter --- internal/collector/patroni.go | 2 +- internal/collector/pgbouncer.go | 2 +- internal/collector/postgres_metrics.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/collector/patroni.go b/internal/collector/patroni.go index 61b18919cc..1f0846eedb 100644 --- a/internal/collector/patroni.go +++ b/internal/collector/patroni.go @@ -133,7 +133,7 @@ func EnablePatroniMetrics(ctx context.Context, if feature.Enabled(ctx, feature.OpenTelemetryMetrics) { // Add Prometheus exporter outConfig.Exporters[Prometheus] = map[string]any{ - "endpoint": "0.0.0.0:8889", + "endpoint": "0.0.0.0:9187", } // Add Prometheus Receiver diff --git a/internal/collector/pgbouncer.go b/internal/collector/pgbouncer.go index 7bb1d5acd3..59ba0b7495 100644 --- a/internal/collector/pgbouncer.go +++ b/internal/collector/pgbouncer.go @@ -172,7 +172,7 @@ func EnablePgBouncerMetrics(ctx context.Context, config *Config, sqlQueryUsernam if feature.Enabled(ctx, feature.OpenTelemetryMetrics) { // Add Prometheus exporter config.Exporters[Prometheus] = map[string]any{ - "endpoint": "0.0.0.0:8889", + "endpoint": "0.0.0.0:9187", } // Add SqlQuery Receiver diff --git a/internal/collector/postgres_metrics.go b/internal/collector/postgres_metrics.go index bf541c338d..8377676813 100644 --- a/internal/collector/postgres_metrics.go +++ b/internal/collector/postgres_metrics.go @@ -51,7 +51,7 @@ func EnablePostgresMetrics(ctx context.Context, inCluster *v1beta1.PostgresClust } // Add Prometheus exporter config.Exporters[Prometheus] = map[string]any{ - "endpoint": "0.0.0.0:8889", + "endpoint": "0.0.0.0:9187", } config.Receivers[FiveSecondSqlQuery] = map[string]any{ From 92d95058bb0053f00f5dbc6fbd367129fed0f1c7 Mon Sep 17 00:00:00 2001 From: tony-landreth Date: Fri, 21 Feb 2025 17:32:10 -0500 Subject: [PATCH 20/22] Updates comment --- internal/controller/postgrescluster/pgmonitor.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/internal/controller/postgrescluster/pgmonitor.go b/internal/controller/postgrescluster/pgmonitor.go index 9dae851db4..58c16eeef9 100644 --- a/internal/controller/postgrescluster/pgmonitor.go +++ b/internal/controller/postgrescluster/pgmonitor.go @@ -157,12 +157,9 @@ func (r *Reconciler) 
reconcileMonitoringSecret( return nil, err } + // Checking if the exporter is enabled to determine when monitoring + // secret should be created. if !pgmonitor.ExporterEnabled(ctx, cluster) && !feature.Enabled(ctx, feature.OpenTelemetryMetrics) { - // TODO: Checking if the exporter is enabled to determine when monitoring - // secret should be created. If more tools are added to the monitoring - // suite, they could need the secret when the exporter is not enabled. - // This check may need to be updated. - // Exporter is disabled; delete monitoring secret if it exists. if err == nil { err = errors.WithStack(r.deleteControlled(ctx, cluster, existing)) } From a01b5e775a0101926666439b3c24c2a9763f9bd1 Mon Sep 17 00:00:00 2001 From: tony-landreth Date: Fri, 21 Feb 2025 17:38:51 -0500 Subject: [PATCH 21/22] Rename fn to DisableMonitoringUserInPostgres --- internal/controller/postgrescluster/pgmonitor.go | 2 +- internal/pgmonitor/postgres.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/controller/postgrescluster/pgmonitor.go b/internal/controller/postgrescluster/pgmonitor.go index 58c16eeef9..9740ccc6f5 100644 --- a/internal/controller/postgrescluster/pgmonitor.go +++ b/internal/controller/postgrescluster/pgmonitor.go @@ -101,7 +101,7 @@ func (r *Reconciler) reconcilePGMonitorExporter(ctx context.Context, if !pgmonitor.ExporterEnabled(ctx, cluster) && !feature.Enabled(ctx, feature.OpenTelemetryMetrics) { action = func(ctx context.Context, exec postgres.Executor) error { - return pgmonitor.DisableExporterInPostgreSQL(ctx, exec) + return pgmonitor.DisableMonitoringUserInPostgres(ctx, exec) } } diff --git a/internal/pgmonitor/postgres.go b/internal/pgmonitor/postgres.go index 19268f5c54..08a428d465 100644 --- a/internal/pgmonitor/postgres.go +++ b/internal/pgmonitor/postgres.go @@ -46,11 +46,11 @@ func PostgreSQLParameters(ctx context.Context, inCluster *v1beta1.PostgresCluste } } -// DisableExporterInPostgreSQL disables the exporter configuration in PostgreSQL. +// DisableMonitoringUserInPostgres disables the exporter configuration in PostgreSQL. // Currently the exporter is disabled by removing login permissions for the // monitoring user. // TODO: evaluate other uninstall/removal options -func DisableExporterInPostgreSQL(ctx context.Context, exec postgres.Executor) error { +func DisableMonitoringUserInPostgres(ctx context.Context, exec postgres.Executor) error { log := logging.FromContext(ctx) stdout, stderr, err := exec.Exec(ctx, strings.NewReader(` From 69686012a7b6d6bd4553ace88017f5b98ecce972 Mon Sep 17 00:00:00 2001 From: Drew Sessler <36803518+dsessler7@users.noreply.github.com> Date: Fri, 21 Feb 2025 14:52:23 -0800 Subject: [PATCH 22/22] Update internal/controller/postgrescluster/pgmonitor.go --- internal/controller/postgrescluster/pgmonitor.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/internal/controller/postgrescluster/pgmonitor.go b/internal/controller/postgrescluster/pgmonitor.go index 9740ccc6f5..84b955559a 100644 --- a/internal/controller/postgrescluster/pgmonitor.go +++ b/internal/controller/postgrescluster/pgmonitor.go @@ -157,8 +157,10 @@ func (r *Reconciler) reconcileMonitoringSecret( return nil, err } - // Checking if the exporter is enabled to determine when monitoring - // secret should be created. 
+ // Checking if the exporter is enabled or OpenTelemetryMetrics feature + // is enabled to determine when monitoring secret should be created, + // since our implementation of the SqlQuery receiver in the OTel Collector + // uses the monitoring user as well. if !pgmonitor.ExporterEnabled(ctx, cluster) && !feature.Enabled(ctx, feature.OpenTelemetryMetrics) { if err == nil { err = errors.WithStack(r.deleteControlled(ctx, cluster, existing))